ggml-cpu: parallelize tensor repacking with OpenMP

Add OpenMP parallelization to tensor repack functions to significantly
speed up model loading on many-core CPUs.

Model load times measured on AMD EPYC 9655 (96 cores):

| Model Size | Before | After | Speedup |
|------------|--------|-------|---------|
| 6.8GB Q4_K | 5.0s   | 3.3s  | 1.5x    |
| 19GB Q4_K  | 11.9s  | 5.3s  | 2.2x    |
| 271GB Q4_K | ~150s  | ~60s  | ~2.5x   |

The repack functions convert quantized tensors from their storage
layout to the SIMD-optimized interleaved layout used by the AVX-512
kernels. Repacking was previously single-threaded; it is now
parallelized across independent row groups.
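
As an illustration of the layout change, here is a minimal,
self-contained sketch of the x4 interleave using plain ints in place of
ggml's quantized block types (the toy sizes and names are illustrative,
not taken from the patch):

```cpp
#include <cstdio>
#include <vector>

// Toy model of the x4 repack: treat each quantized block as one int and
// interleave 4 rows at a time, as the real code does for
// block_q4_0 -> block_q4_0x4.
int main() {
    const int nrow = 8, nblocks = 3, nrows_interleaved = 4;

    std::vector<int> src(nrow * nblocks);                 // row-major "blocks"
    for (int i = 0; i < (int) src.size(); i++) src[i] = i;

    // Destination: for each group of 4 rows and each block column, the 4
    // blocks of that column are stored contiguously (the SIMD-friendly layout).
    std::vector<int> dst(nrow * nblocks);
    for (int bg = 0; bg < nrow / nrows_interleaved; bg++) {   // row groups
        for (int x = 0; x < nblocks; x++) {
            for (int i = 0; i < nrows_interleaved; i++) {
                dst[(bg * nblocks + x) * nrows_interleaved + i] =
                    src[(bg * nrows_interleaved + i) * nblocks + x];
            }
        }
    }
    printf("first x4 group: %d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]); // 0 3 6 9
}
```

Each row group writes a disjoint slice of dst, so the outer loop over bg
has no cross-iteration dependencies; that independence is what makes it
safe to parallelize.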

Key changes (illustrated in the sketch after this list):
- Convert pointer-increment loops to explicit indexing
- Add #pragma omp parallel for to the outer loops (guarded by
  #ifdef GGML_USE_OPENMP)
- Each thread processes independent row groups
- Move the thread-local dst_tmp arrays inside the parallel region
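
A condensed sketch of the resulting loop shape, with a placeholder
element type instead of the concrete ggml block types (repack_rows,
elem_t and the plain-int payload are illustrative; the guard macro and
pragma match the diff):

```cpp
#include <cstdint>
#include <vector>

// Placeholder for a ggml quantized block; the real code uses block_q4_0 etc.
using elem_t = int;

// One iteration per group of nrows_interleaved rows. src, dst and dst_tmp are
// derived from the loop index instead of being advanced with *dst++ / src +=,
// so iterations are independent and can be distributed across threads.
static void repack_rows(const elem_t * src_base, elem_t * dst_base,
                        int nrow, int nblocks, int nrows_interleaved) {
    const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
#pragma omp parallel for
#endif
    for (int bg = 0; bg < n_row_groups; bg++) {
        const elem_t * src = src_base + bg * nrows_interleaved * nblocks;
        elem_t * dst = dst_base + bg * nrows_interleaved * nblocks;
        std::vector<elem_t> dst_tmp(nrows_interleaved);   // thread-local scratch
        for (int64_t x = 0; x < nblocks; x++) {
            for (int i = 0; i < nrows_interleaved; i++) {
                dst_tmp[i] = src[x + i * nblocks];
            }
            // stand-in for make_block_q4_0x4(dst_tmp, interleave_block) etc.
            for (int i = 0; i < nrows_interleaved; i++) {
                dst[x * nrows_interleaved + i] = dst_tmp[i];
            }
        }
    }
}
```

When the build defines GGML_USE_OPENMP and enables OpenMP, each row
group can run on a different thread; otherwise the pragma is compiled
out and the loop behaves exactly like the serial version.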

Functions parallelized:
- repack_q4_0_to_q4_0_4_bl (Q4_0 x4 interleave)
- repack_q4_K_to_q4_K_8_bl (Q4_K_M, Q4_K_S models)
- repack_q2_K_to_q2_K_8_bl (Q2_K models)
- repack_q4_0_to_q4_0_8_bl (Q4_0 x8 interleave)
- repack_iq4_nl_to_iq4_nl_4_bl (IQ4_NL x4)
- repack_iq4_nl_to_iq4_nl_8_bl (IQ4_NL x8)

Tested on: AMD EPYC 9655 "Turin" with 192 threads
pestopoppa 2025-12-21 01:07:40 +01:00
parent 52392291b2
commit b1366757cf
1 changed file with 101 additions and 55 deletions

@@ -1415,11 +1415,10 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
     constexpr int nrows_interleaved = 4;
-    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
-    const block_q4_0 * src = (const block_q4_0 *)data;
-    block_q4_0 dst_tmp[4];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK4_0;
+    block_q4_0x4 * dst_base = (block_q4_0x4 *)t->data;
+    const block_q4_0 * src_base = (const block_q4_0 *)data;
+    const int nrow = ggml_nrows(t);
+    const int nblocks = t->ne[0] / QK4_0;
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
@@ -1427,14 +1426,23 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
         return -1;
     }
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_q4_0 * src = src_base + b * nblocks;
+        block_q4_0x4 * dst = dst_base + bg * nblocks;
+        block_q4_0 dst_tmp[4];
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
+            dst[x] = make_block_q4_0x4(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
     return 0;
@@ -1446,11 +1454,10 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_ASSERT(interleave_block == 8 || interleave_block == 4);
     constexpr int nrows_interleaved = 8;
-    block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
-    const block_q4_K * src = (const block_q4_K*) data;
-    block_q4_K dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK_K;
+    block_q4_Kx8 * dst_base = (block_q4_Kx8*)t->data;
+    const block_q4_K * src_base = (const block_q4_K*) data;
+    const int nrow = ggml_nrows(t);
+    const int nblocks = t->ne[0] / QK_K;
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
@@ -1458,14 +1465,23 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
         return -1;
     }
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_q4_K * src = src_base + b * nblocks;
+        block_q4_Kx8 * dst = dst_base + bg * nblocks;
+        block_q4_K dst_tmp[8];
         for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++ ) {
+            for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
+            dst[x] = make_block_q4_Kx8(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
     return 0;
@@ -1477,11 +1493,10 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_ASSERT(interleave_block == 8);
     constexpr int nrows_interleaved = 8;
-    block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
-    const block_q2_K * src = (const block_q2_K*) data;
-    block_q2_K dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK_K;
+    block_q2_Kx8 * dst_base = (block_q2_Kx8*)t->data;
+    const block_q2_K * src_base = (const block_q2_K*) data;
+    const int nrow = ggml_nrows(t);
+    const int nblocks = t->ne[0] / QK_K;
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
@@ -1489,14 +1504,23 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
         return -1;
     }
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_q2_K * src = src_base + b * nblocks;
+        block_q2_Kx8 * dst = dst_base + bg * nblocks;
+        block_q2_K dst_tmp[8];
         for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++ ) {
+            for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
+            dst[x] = make_block_q2_Kx8(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
     return 0;
@@ -1508,11 +1532,10 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_ASSERT(interleave_block == 8);
     constexpr int nrows_interleaved = 8;
-    block_q4_0x8 * dst = (block_q4_0x8*)t->data;
-    const block_q4_0 * src = (const block_q4_0*) data;
-    block_q4_0 dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK4_0;
+    block_q4_0x8 * dst_base = (block_q4_0x8*)t->data;
+    const block_q4_0 * src_base = (const block_q4_0*) data;
+    const int nrow = ggml_nrows(t);
+    const int nblocks = t->ne[0] / QK4_0;
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
@@ -1520,14 +1543,23 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
         return -1;
     }
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_q4_0 * src = src_base + b * nblocks;
+        block_q4_0x8 * dst = dst_base + bg * nblocks;
+        block_q4_0 dst_tmp[8];
         for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++ ) {
+            for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
+            dst[x] = make_block_q4_0x8(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
     return 0;
@@ -1573,14 +1605,12 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
     GGML_ASSERT(interleave_block == 4);
-    const block_iq4_nl * src = (const block_iq4_nl *)data;
-    block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
+    const block_iq4_nl * src_base = (const block_iq4_nl *)data;
+    block_iq4_nlx4 * dst_base = (block_iq4_nlx4 *)t->data;
-    block_iq4_nl dst_tmp[4];
-    int nrow = ggml_nrows(t);
-    int nrows_interleaved = 4;
-    int nblocks = t->ne[0] / QK4_NL;
+    const int nrow = ggml_nrows(t);
+    const int nrows_interleaved = 4;
+    const int nblocks = t->ne[0] / QK4_NL;
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
@@ -1588,14 +1618,23 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
         return -1;
     }
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_iq4_nl * src = src_base + b * nblocks;
+        block_iq4_nlx4 * dst = dst_base + bg * nblocks;
+        block_iq4_nl dst_tmp[4];
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
+            dst[x] = make_block_iq4_nlx4(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
     return 0;
@@ -1630,14 +1669,12 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
     GGML_ASSERT(interleave_block == 8);
-    const block_iq4_nl * src = (const block_iq4_nl *)data;
-    block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
+    const block_iq4_nl * src_base = (const block_iq4_nl *)data;
+    block_iq4_nlx8 * dst_base = (block_iq4_nlx8 *)t->data;
-    block_iq4_nl dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nrows_interleaved = 8;
-    int nblocks = t->ne[0] / QK4_NL;
+    const int nrow = ggml_nrows(t);
+    const int nrows_interleaved = 8;
+    const int nblocks = t->ne[0] / QK4_NL;
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
@@ -1645,14 +1682,23 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
         return -1;
     }
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_iq4_nl * src = src_base + b * nblocks;
+        block_iq4_nlx8 * dst = dst_base + bg * nblocks;
+        block_iq4_nl dst_tmp[8];
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
            }
-            *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
+            dst[x] = make_block_iq4_nlx8(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
     return 0;