diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index b70ea7d78b..b9727e6c9c 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -1415,11 +1415,10 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block GGML_ASSERT(interleave_block == 4 || interleave_block == 8); constexpr int nrows_interleaved = 4; - block_q4_0x4 * dst = (block_q4_0x4 *)t->data; - const block_q4_0 * src = (const block_q4_0 *)data; - block_q4_0 dst_tmp[4]; - int nrow = ggml_nrows(t); - int nblocks = t->ne[0] / QK4_0; + block_q4_0x4 * dst_base = (block_q4_0x4 *)t->data; + const block_q4_0 * src_base = (const block_q4_0 *)data; + const int nrow = ggml_nrows(t); + const int nblocks = t->ne[0] / QK4_0; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); @@ -1427,14 +1426,23 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block return -1; } - for (int b = 0; b < nrow; b += nrows_interleaved) { + const int n_row_groups = nrow / nrows_interleaved; + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for +#endif + for (int bg = 0; bg < n_row_groups; bg++) { + const int b = bg * nrows_interleaved; + const block_q4_0 * src = src_base + b * nblocks; + block_q4_0x4 * dst = dst_base + bg * nblocks; + block_q4_0 dst_tmp[4]; + for (int64_t x = 0; x < nblocks; x++) { for (int i = 0; i < nrows_interleaved; i++) { dst_tmp[i] = src[x + i * nblocks]; } - *dst++ = make_block_q4_0x4(dst_tmp, interleave_block); + dst[x] = make_block_q4_0x4(dst_tmp, interleave_block); } - src += nrows_interleaved * nblocks; } return 0; @@ -1446,11 +1454,10 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block GGML_ASSERT(interleave_block == 8 || interleave_block == 4); constexpr int nrows_interleaved = 8; - block_q4_Kx8 * dst = (block_q4_Kx8*)t->data; - const block_q4_K * src = (const block_q4_K*) data; - block_q4_K dst_tmp[8]; - int nrow = ggml_nrows(t); - int nblocks = t->ne[0] / QK_K; + block_q4_Kx8 * dst_base = (block_q4_Kx8*)t->data; + const block_q4_K * src_base = (const block_q4_K*) data; + const int nrow = ggml_nrows(t); + const int nblocks = t->ne[0] / QK_K; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K)); @@ -1458,14 +1465,23 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block return -1; } - for (int b = 0; b < nrow; b += nrows_interleaved) { + const int n_row_groups = nrow / nrows_interleaved; + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for +#endif + for (int bg = 0; bg < n_row_groups; bg++) { + const int b = bg * nrows_interleaved; + const block_q4_K * src = src_base + b * nblocks; + block_q4_Kx8 * dst = dst_base + bg * nblocks; + block_q4_K dst_tmp[8]; + for (int64_t x = 0; x < nblocks; x++) { - for (int i = 0; i < nrows_interleaved; i++ ) { + for (int i = 0; i < nrows_interleaved; i++) { dst_tmp[i] = src[x + i * nblocks]; } - *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block); + dst[x] = make_block_q4_Kx8(dst_tmp, interleave_block); } - src += nrows_interleaved * nblocks; } return 0; @@ -1477,11 +1493,10 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block GGML_ASSERT(interleave_block == 8); constexpr int nrows_interleaved = 8; - block_q2_Kx8 * dst = (block_q2_Kx8*)t->data; - const block_q2_K * src = (const block_q2_K*) data; - block_q2_K dst_tmp[8]; - int nrow = ggml_nrows(t); - int nblocks = t->ne[0] / QK_K; + block_q2_Kx8 * dst_base = (block_q2_Kx8*)t->data; + const block_q2_K * src_base = (const block_q2_K*) data; + const int nrow = ggml_nrows(t); + const int nblocks = t->ne[0] / QK_K; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K)); @@ -1489,14 +1504,23 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block return -1; } - for (int b = 0; b < nrow; b += nrows_interleaved) { + const int n_row_groups = nrow / nrows_interleaved; + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for +#endif + for (int bg = 0; bg < n_row_groups; bg++) { + const int b = bg * nrows_interleaved; + const block_q2_K * src = src_base + b * nblocks; + block_q2_Kx8 * dst = dst_base + bg * nblocks; + block_q2_K dst_tmp[8]; + for (int64_t x = 0; x < nblocks; x++) { - for (int i = 0; i < nrows_interleaved; i++ ) { + for (int i = 0; i < nrows_interleaved; i++) { dst_tmp[i] = src[x + i * nblocks]; } - *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block); + dst[x] = make_block_q2_Kx8(dst_tmp, interleave_block); } - src += nrows_interleaved * nblocks; } return 0; @@ -1508,11 +1532,10 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block GGML_ASSERT(interleave_block == 8); constexpr int nrows_interleaved = 8; - block_q4_0x8 * dst = (block_q4_0x8*)t->data; - const block_q4_0 * src = (const block_q4_0*) data; - block_q4_0 dst_tmp[8]; - int nrow = ggml_nrows(t); - int nblocks = t->ne[0] / QK4_0; + block_q4_0x8 * dst_base = (block_q4_0x8*)t->data; + const block_q4_0 * src_base = (const block_q4_0*) data; + const int nrow = ggml_nrows(t); + const int nblocks = t->ne[0] / QK4_0; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); @@ -1520,14 +1543,23 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block return -1; } - for (int b = 0; b < nrow; b += nrows_interleaved) { + const int n_row_groups = nrow / nrows_interleaved; + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for +#endif + for (int bg = 0; bg < n_row_groups; bg++) { + const int b = bg * nrows_interleaved; + const block_q4_0 * src = src_base + b * nblocks; + block_q4_0x8 * dst = dst_base + bg * nblocks; + block_q4_0 dst_tmp[8]; + for (int64_t x = 0; x < nblocks; x++) { - for (int i = 0; i < nrows_interleaved; i++ ) { + for (int i = 0; i < nrows_interleaved; i++) { dst_tmp[i] = src[x + i * nblocks]; } - *dst++ = make_block_q4_0x8(dst_tmp, interleave_block); + dst[x] = make_block_q4_0x8(dst_tmp, interleave_block); } - src += nrows_interleaved * nblocks; } return 0; @@ -1573,14 +1605,12 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); GGML_ASSERT(interleave_block == 4); - const block_iq4_nl * src = (const block_iq4_nl *)data; - block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data; + const block_iq4_nl * src_base = (const block_iq4_nl *)data; + block_iq4_nlx4 * dst_base = (block_iq4_nlx4 *)t->data; - block_iq4_nl dst_tmp[4]; - - int nrow = ggml_nrows(t); - int nrows_interleaved = 4; - int nblocks = t->ne[0] / QK4_NL; + const int nrow = ggml_nrows(t); + const int nrows_interleaved = 4; + const int nblocks = t->ne[0] / QK4_NL; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); @@ -1588,14 +1618,23 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b return -1; } - for (int b = 0; b < nrow; b += nrows_interleaved) { + const int n_row_groups = nrow / nrows_interleaved; + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for +#endif + for (int bg = 0; bg < n_row_groups; bg++) { + const int b = bg * nrows_interleaved; + const block_iq4_nl * src = src_base + b * nblocks; + block_iq4_nlx4 * dst = dst_base + bg * nblocks; + block_iq4_nl dst_tmp[4]; + for (int64_t x = 0; x < nblocks; x++) { for (int i = 0; i < nrows_interleaved; i++) { dst_tmp[i] = src[x + i * nblocks]; } - *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block); + dst[x] = make_block_iq4_nlx4(dst_tmp, interleave_block); } - src += nrows_interleaved * nblocks; } return 0; @@ -1630,14 +1669,12 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); GGML_ASSERT(interleave_block == 8); - const block_iq4_nl * src = (const block_iq4_nl *)data; - block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data; + const block_iq4_nl * src_base = (const block_iq4_nl *)data; + block_iq4_nlx8 * dst_base = (block_iq4_nlx8 *)t->data; - block_iq4_nl dst_tmp[8]; - - int nrow = ggml_nrows(t); - int nrows_interleaved = 8; - int nblocks = t->ne[0] / QK4_NL; + const int nrow = ggml_nrows(t); + const int nrows_interleaved = 8; + const int nblocks = t->ne[0] / QK4_NL; GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); @@ -1645,14 +1682,23 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b return -1; } - for (int b = 0; b < nrow; b += nrows_interleaved) { + const int n_row_groups = nrow / nrows_interleaved; + +#ifdef GGML_USE_OPENMP + #pragma omp parallel for +#endif + for (int bg = 0; bg < n_row_groups; bg++) { + const int b = bg * nrows_interleaved; + const block_iq4_nl * src = src_base + b * nblocks; + block_iq4_nlx8 * dst = dst_base + bg * nblocks; + block_iq4_nl dst_tmp[8]; + for (int64_t x = 0; x < nblocks; x++) { for (int i = 0; i < nrows_interleaved; i++) { dst_tmp[i] = src[x + i * nblocks]; } - *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block); + dst[x] = make_block_iq4_nlx8(dst_tmp, interleave_block); } - src += nrows_interleaved * nblocks; } return 0;