From b1366757cf20d6ae35a6513b4c347c90534790b6 Mon Sep 17 00:00:00 2001
From: pestopoppa
Date: Sun, 21 Dec 2025 01:07:40 +0100
Subject: [PATCH 1/3] ggml-cpu: parallelize tensor repacking with OpenMP

Add OpenMP parallelization to the tensor repack functions to
significantly speed up model loading on many-core CPUs.

Measured on AMD EPYC 9655 (96 cores):

| Model Size | Before | After | Speedup |
|------------|--------|-------|---------|
| 6.8GB Q4_K | 5.0s   | 3.3s  | 1.5x    |
| 19GB Q4_K  | 11.9s  | 5.3s  | 2.2x    |
| 271GB Q4_K | ~150s  | ~60s  | ~2.5x   |

The repack functions convert quantized tensors from the storage layout
to the SIMD-optimized layout used for AVX-512. This step was previously
single-threaded and is now parallelized across row groups.

Key changes:
- Convert pointer-increment loops to explicit indexing
- Add #pragma omp parallel for to the outer loops (guarded by #ifdef GGML_USE_OPENMP)
- Each thread processes independent row groups
- Move thread-local dst_tmp arrays inside the parallel region

Functions parallelized:
- repack_q4_0_to_q4_0_4_bl (Q4_0 x4 interleave)
- repack_q4_K_to_q4_K_8_bl (Q4_K_M, Q4_K_S models)
- repack_q2_K_to_q2_K_8_bl (Q2_K models)
- repack_q4_0_to_q4_0_8_bl (Q4_0 x8 interleave)
- repack_iq4_nl_to_iq4_nl_4_bl (IQ4_NL x4)
- repack_iq4_nl_to_iq4_nl_8_bl (IQ4_NL x8)

Tested on: AMD EPYC 9655 "Turin" with 192 threads
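The restructuring is the same in every function listed above: the old loops advanced `dst` with a post-increment, which forces sequential execution, while the new loops compute each row group's source and destination offsets from the loop index, so iterations are independent and the scratch buffer can live inside the parallel region. A standalone sketch of the pattern follows; the `Block`/`BlockX4` types and `make_block_x4()` are hypothetical stand-ins for the real ggml block structures, not the actual code:

```cpp
// Illustrative sketch only: hypothetical Block/BlockX4 types stand in for
// block_q4_0/block_q4_0x4; the real repack functions follow the same shape.
#include <cstdio>
#include <vector>

struct Block   { int v; };
struct BlockX4 { int v[4]; };

// Stand-in for make_block_q4_0x4(): interleave 4 source blocks into one group.
static BlockX4 make_block_x4(const Block tmp[4]) {
    BlockX4 out;
    for (int i = 0; i < 4; i++) out.v[i] = tmp[i].v;
    return out;
}

static void repack_x4(BlockX4 * dst_base, const Block * src_base, int nrow, int nblocks) {
    const int nrows_interleaved = 4;
    const int n_row_groups = nrow / nrows_interleaved;

#ifdef _OPENMP
    #pragma omp parallel for
#endif
    for (int bg = 0; bg < n_row_groups; bg++) {
        // Each iteration touches a disjoint slice of src/dst, so no locking is needed.
        const Block * src = src_base + bg * nrows_interleaved * nblocks;
        BlockX4     * dst = dst_base + bg * nblocks;
        Block dst_tmp[4];                            // thread-local scratch
        for (int x = 0; x < nblocks; x++) {
            for (int i = 0; i < nrows_interleaved; i++) {
                dst_tmp[i] = src[x + i * nblocks];   // gather one block from each of 4 rows
            }
            dst[x] = make_block_x4(dst_tmp);         // explicit index instead of *dst++
        }
    }
}

int main() {
    const int nrow = 8, nblocks = 3;
    std::vector<Block>   src(nrow * nblocks);
    std::vector<BlockX4> dst((nrow / 4) * nblocks);
    for (int i = 0; i < (int) src.size(); i++) src[i].v = i;
    repack_x4(dst.data(), src.data(), nrow, nblocks);
    printf("dst[0] = {%d,%d,%d,%d}\n", dst[0].v[0], dst[0].v[1], dst[0].v[2], dst[0].v[3]);
    return 0;
}
```

Because every `bg` iteration reads and writes a disjoint slice, a plain `parallel for` is safe and the speedup is limited mainly by memory bandwidth.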
---
 ggml/src/ggml-cpu/repack.cpp | 156 +++++++++++++++++++++++------------
 1 file changed, 101 insertions(+), 55 deletions(-)

diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index b70ea7d78b..b9727e6c9c 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -1415,11 +1415,10 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
     constexpr int nrows_interleaved = 4;
 
-    block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
-    const block_q4_0 * src = (const block_q4_0 *)data;
-    block_q4_0 dst_tmp[4];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK4_0;
+    block_q4_0x4 * dst_base = (block_q4_0x4 *)t->data;
+    const block_q4_0 * src_base = (const block_q4_0 *)data;
+    const int nrow = ggml_nrows(t);
+    const int nblocks = t->ne[0] / QK4_0;
 
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
 
@@ -1427,14 +1426,23 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
         return -1;
     }
 
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_q4_0 * src = src_base + b * nblocks;
+        block_q4_0x4 * dst = dst_base + bg * nblocks;
+        block_q4_0 dst_tmp[4];
+
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
+            dst[x] = make_block_q4_0x4(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
 
     return 0;
@@ -1446,11 +1454,10 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_ASSERT(interleave_block == 8 || interleave_block == 4);
     constexpr int nrows_interleaved = 8;
 
-    block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
-    const block_q4_K * src = (const block_q4_K*) data;
-    block_q4_K dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK_K;
+    block_q4_Kx8 * dst_base = (block_q4_Kx8*)t->data;
+    const block_q4_K * src_base = (const block_q4_K*) data;
+    const int nrow = ggml_nrows(t);
+    const int nblocks = t->ne[0] / QK_K;
 
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
 
@@ -1458,14 +1465,23 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
         return -1;
     }
 
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_q4_K * src = src_base + b * nblocks;
+        block_q4_Kx8 * dst = dst_base + bg * nblocks;
+        block_q4_K dst_tmp[8];
+
         for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++ ) {
+            for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
+            dst[x] = make_block_q4_Kx8(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
 
     return 0;
@@ -1477,11 +1493,10 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_ASSERT(interleave_block == 8);
     constexpr int nrows_interleaved = 8;
 
-    block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
-    const block_q2_K * src = (const block_q2_K*) data;
-    block_q2_K dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK_K;
+    block_q2_Kx8 * dst_base = (block_q2_Kx8*)t->data;
+    const block_q2_K * src_base = (const block_q2_K*) data;
+    const int nrow = ggml_nrows(t);
+    const int nblocks = t->ne[0] / QK_K;
 
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
 
@@ -1489,14 +1504,23 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
         return -1;
     }
 
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_q2_K * src = src_base + b * nblocks;
+        block_q2_Kx8 * dst = dst_base + bg * nblocks;
+        block_q2_K dst_tmp[8];
+
         for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++ ) {
+            for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
+            dst[x] = make_block_q2_Kx8(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
 
     return 0;
@@ -1508,11 +1532,10 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
     GGML_ASSERT(interleave_block == 8);
     constexpr int nrows_interleaved = 8;
 
-    block_q4_0x8 * dst = (block_q4_0x8*)t->data;
-    const block_q4_0 * src = (const block_q4_0*) data;
-    block_q4_0 dst_tmp[8];
-    int nrow = ggml_nrows(t);
-    int nblocks = t->ne[0] / QK4_0;
+    block_q4_0x8 * dst_base = (block_q4_0x8*)t->data;
+    const block_q4_0 * src_base = (const block_q4_0*) data;
+    const int nrow = ggml_nrows(t);
+    const int nblocks = t->ne[0] / QK4_0;
 
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
 
@@ -1520,14 +1543,23 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
         return -1;
     }
 
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_q4_0 * src = src_base + b * nblocks;
+        block_q4_0x8 * dst = dst_base + bg * nblocks;
+        block_q4_0 dst_tmp[8];
+
         for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++ ) {
+            for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
+            dst[x] = make_block_q4_0x8(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
 
     return 0;
@@ -1573,14 +1605,12 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
     GGML_ASSERT(interleave_block == 4);
 
-    const block_iq4_nl * src = (const block_iq4_nl *)data;
-    block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
+    const block_iq4_nl * src_base = (const block_iq4_nl *)data;
+    block_iq4_nlx4 * dst_base = (block_iq4_nlx4 *)t->data;
 
-    block_iq4_nl dst_tmp[4];
-
-    int nrow = ggml_nrows(t);
-    int nrows_interleaved = 4;
-    int nblocks = t->ne[0] / QK4_NL;
+    const int nrow = ggml_nrows(t);
+    const int nrows_interleaved = 4;
+    const int nblocks = t->ne[0] / QK4_NL;
 
     GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
 
@@ -1588,14 +1618,23 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
         return -1;
     }
 
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_iq4_nl * src = src_base + b * nblocks;
+        block_iq4_nlx4 * dst = dst_base + bg * nblocks;
+        block_iq4_nl dst_tmp[4];
+
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
+            dst[x] = make_block_iq4_nlx4(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
 
     return 0;
@@ -1630,14 +1669,12 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
     GGML_ASSERT(interleave_block == 8);
 
-    const block_iq4_nl * src = (const block_iq4_nl *)data;
-    block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
+    const block_iq4_nl * src_base = (const block_iq4_nl *)data;
+    block_iq4_nlx8 * dst_base = (block_iq4_nlx8 *)t->data;
 
-    block_iq4_nl dst_tmp[8];
-
-    int nrow = ggml_nrows(t);
-    int nrows_interleaved = 8;
-    int nblocks = t->ne[0] / QK4_NL;
+    const int nrow = ggml_nrows(t);
+    const int nrows_interleaved = 8;
+    const int nblocks = t->ne[0] / QK4_NL;
 
    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
 
@@ -1645,14 +1682,23 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
         return -1;
     }
 
-    for (int b = 0; b < nrow; b += nrows_interleaved) {
+    const int n_row_groups = nrow / nrows_interleaved;
+
+#ifdef GGML_USE_OPENMP
+    #pragma omp parallel for
+#endif
+    for (int bg = 0; bg < n_row_groups; bg++) {
+        const int b = bg * nrows_interleaved;
+        const block_iq4_nl * src = src_base + b * nblocks;
+        block_iq4_nlx8 * dst = dst_base + bg * nblocks;
+        block_iq4_nl dst_tmp[8];
+
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
+            dst[x] = make_block_iq4_nlx8(dst_tmp, interleave_block);
         }
-        src += nrows_interleaved * nblocks;
     }
 
     return 0;
From 11b753e7863125169d58c822efe1d3fa1e3cb1d9 Mon Sep 17 00:00:00 2001
From: Jianhui Zhou
Date: Thu, 8 Jan 2026 14:12:59 +0000
Subject: [PATCH 2/3] ggml: optimize repack on NUMA by binding threads

When using the repack buffer type, physical memory allocation is
dictated by the first-touch policy. Since the main thread performs the
write operations, memory is often allocated on a single NUMA node,
leading to uneven weight distribution. Multi-threaded repack can
alleviate this problem, but the threads are not bound to NUMA nodes.

This patch applies the same thread affinity strategy (--numa distribute)
to the repacking phase. By binding the repack threads to the same nodes
as the compute threads, we ensure that weights are written (and thus
allocated) on the local NUMA node, minimizing cross-node memory access
during inference.

Performance on Intel Xeon Silver 4514Y (32 cores):
qwen3 8B  Q4_K: 19.39 -> 26.92 t/s (+39%)
qwen3 32B Q4_K:  4.99 ->  7.38 t/s (+48%)

Signed-off-by: Jianhui Zhou
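The patch leans on Linux's first-touch page placement: a page is physically allocated on the NUMA node of the CPU that first writes it, not on the node of the thread that called malloc/mmap. The toy program below (plain C++/OpenMP, not ggml code; the buffer size and names are made up) shows the pattern the repack threads now follow: each pinned thread first-touches its own slice, so the pages backing that slice land on its node:

```cpp
// Toy first-touch demo: each OpenMP thread writes ("first-touches") its own
// slice of the buffer, so the backing pages are allocated on that thread's
// NUMA node -- provided the threads are pinned, which is what the patch does
// for the repack threads via the --numa distribute affinity logic.
#include <cstdio>
#include <cstdlib>
#include <cstring>
#ifdef _OPENMP
#include <omp.h>
#endif

int main() {
    const size_t n_bytes = 1ull << 28;                // 256 MiB of "weights" (size is made up)
    unsigned char * weights = (unsigned char *) malloc(n_bytes);
    if (!weights) return 1;

    // A single-threaded memset here would place every page on the calling
    // thread's node. Instead, touch in parallel, one disjoint slice per thread
    // (pin threads with e.g. OMP_PROC_BIND=true to make the placement stick).
#ifdef _OPENMP
    #pragma omp parallel
    {
        const int    ith   = omp_get_thread_num();
        const int    nth   = omp_get_num_threads();
        const size_t chunk = (n_bytes + nth - 1) / nth;
        const size_t beg   = (size_t) ith * chunk;
        const size_t end   = beg + chunk < n_bytes ? beg + chunk : n_bytes;
        if (beg < end) {
            memset(weights + beg, 0, end - beg);      // first touch of this slice
        }
    }
#else
    memset(weights, 0, n_bytes);
#endif

    printf("touched %zu bytes\n", n_bytes);
    free(weights);
    return 0;
}
```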
---
 ggml/src/ggml-cpu/ggml-cpu-impl.h |  1 +
 ggml/src/ggml-cpu/ggml-cpu.c      |  6 +++---
 ggml/src/ggml-cpu/repack.cpp      | 30 +++++++++++++++++++++++-------
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
index 7597377cc2..548dcf979d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -517,6 +517,7 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);
 
+void ggml_cpu_set_numa_thread_affinity(int thread_n);
 void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
 int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
 
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a59b518938..a1ebf4955d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2087,7 +2087,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
-static void set_numa_thread_affinity(int thread_n) {
+void ggml_cpu_set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
     }
@@ -2155,7 +2155,7 @@ static void clear_numa_thread_affinity(void) {
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
+void ggml_cpu_set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
@@ -2923,7 +2923,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = tp->cgraph;
     const struct ggml_cplan * cplan = tp->cplan;
 
-    set_numa_thread_affinity(state->ith);
+    ggml_cpu_set_numa_thread_affinity(state->ith);
 
     struct ggml_compute_params params = {
         /*.ith =*/ state->ith,
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index b9727e6c9c..c2de7b9adf 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -16,6 +16,10 @@
 #include 
 #include  // for GGML_ASSERT
 
+#if defined(GGML_USE_OPENMP)
+#include <omp.h>
+#endif
+
 #include "repack.h"
 
 #if defined(__GNUC__)
@@ -1429,7 +1433,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1468,7 +1472,7 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1507,7 +1511,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1546,7 +1550,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1621,7 +1625,7 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1685,7 +1689,7 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -2188,9 +2192,21 @@ template name, ggml_type_name(t->type), (int) NB_COLS, (int) INTER_SIZE);
-        return ggml::cpu::repack::repack(t, data, data_size);
+#ifdef GGML_USE_OPENMP
+        #pragma omp parallel
+        {
+            ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
+            int r = ggml::cpu::repack::repack(t, data, data_size);
+            #pragma omp master
+            ret = r;
+        }
+#else
+        ret = ggml::cpu::repack::repack(t, data, data_size);
+#endif
+        return ret;
     }
 };
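One detail of this patch that is easy to miss: the per-type repack functions now contain only `#pragma omp for schedule(static)`, while the single `#pragma omp parallel` (plus the per-thread affinity call) lives in the buffer-type wrapper. This is the orphaned-worksharing pattern: the `for` binds to whatever parallel region is active in its caller, and degrades to a serial loop when there is none. A minimal standalone sketch of the pattern follows; `repack_rows()` and the driver are hypothetical names, not the ggml functions:

```cpp
// Orphaned-worksharing sketch: the omp for inside repack_rows() binds to the
// parallel region opened by its caller; called without one it still works,
// just on a team of a single thread.
#include <cstdio>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif

static void repack_rows(std::vector<int> & dst, const std::vector<int> & src) {
    const int n = (int) src.size();
    // No "parallel" here: this worksharing loop is orphaned and attaches to
    // the innermost enclosing parallel region of the caller, if any.
#ifdef _OPENMP
    #pragma omp for schedule(static)
#endif
    for (int i = 0; i < n; i++) {
        dst[i] = src[i] * 2;   // stand-in for the real per-row-group repack work
    }
}

int main() {
    std::vector<int> src(1000), dst(1000);
    for (int i = 0; i < 1000; i++) src[i] = i;

#ifdef _OPENMP
    // Every thread of the team must call repack_rows() so the omp for inside
    // can split the iterations; this mirrors the wrapper in the patch, which
    // also sets each thread's NUMA affinity before making the call.
    #pragma omp parallel
    {
        repack_rows(dst, src);
    }
#else
    repack_rows(dst, src);
#endif

    printf("dst[999] = %d\n", dst[999]);
    return 0;
}
```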
From 5714d4b86e2385b4e0c1f581d73af9f4b524ce72 Mon Sep 17 00:00:00 2001
From: Jianhui Zhou
Date: Tue, 13 Jan 2026 07:36:31 +0000
Subject: [PATCH 3/3] ggml: Add thread count control during repacking

This change enables the repack stage to use the user-specified thread
count, ensuring that both the logical thread IDs and the total number
of threads remain consistent between the repack and inference stages.

In a NUMA architecture where the `--numa distribute` parameter is used,
logical threads are pinned to specific physical NUMA nodes. By aligning
the thread configuration across these two stages, we can fully leverage
the operating system's "first-touch" memory allocation policy:

1. Repack stage: logical thread i (bound to NUMA node j) is responsible
   for repacking and writing the weight data. Since the "first touch"
   occurs within this thread, the corresponding physical memory is
   allocated on node j.
2. Inference stage: the same logical thread i (still bound to node j)
   reads these weights. Since the data already resides on the local
   node, low-latency local memory access is achieved.

Without this consistency in thread count, data may be allocated on
mismatched nodes, resulting in significant cross-node access overhead
during inference.

Signed-off-by: Jianhui Zhou
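The locality argument above rests on `schedule(static)` being deterministic: with the same iteration count and the same team size, iteration i is handed to the same thread ID in both phases, so the thread that first-touched a row group is the one that later reads it. A small sketch that makes this assumption visible (sizes are made up; on a typical OpenMP runtime it reports zero mismatches):

```cpp
// Why the thread counts must match: with schedule(static) and an identical
// team size, iteration i is assigned to the same thread id in both parallel
// regions, so the "first toucher" of a row group is also its later reader.
// A different team size during repack would shuffle the owners.
#include <cstdio>
#include <vector>
#ifdef _OPENMP
#include <omp.h>
#endif

int main() {
    const int n = 64;            // number of "row groups" (made up)
    const int n_threads = 4;     // same count for both phases
    std::vector<int> owner_repack(n, -1), owner_infer(n, -1);

#ifdef _OPENMP
    #pragma omp parallel num_threads(n_threads)
    {
        #pragma omp for schedule(static)
        for (int i = 0; i < n; i++) owner_repack[i] = omp_get_thread_num();  // "first touch"
    }

    #pragma omp parallel num_threads(n_threads)
    {
        #pragma omp for schedule(static)
        for (int i = 0; i < n; i++) owner_infer[i] = omp_get_thread_num();   // later reads
    }
#else
    for (int i = 0; i < n; i++) owner_repack[i] = owner_infer[i] = 0;
#endif

    int mismatches = 0;
    for (int i = 0; i < n; i++) mismatches += owner_repack[i] != owner_infer[i];
    printf("mismatched owners: %d of %d\n", mismatches, n);
    return 0;
}
```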
---
 common/common.cpp                 |  1 +
 ggml/include/ggml-cpu.h           |  4 +++
 ggml/src/ggml-cpu/repack.cpp      | 41 ++++++++++++++++++++++++-------
 include/llama.h                   |  1 +
 src/llama-model.cpp               |  1 +
 src/llama.cpp                     |  1 +
 tools/llama-bench/llama-bench.cpp |  1 +
 7 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index b76dfa10ea..ff2f7c6e1f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1348,6 +1348,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
     mparams.no_host = params.no_host;
+    mparams.repack_n_threads = params.cpuparams.n_threads;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 4f3b99c8d0..5d9f7b4d82 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -52,6 +52,10 @@ extern "C" {
     GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
     GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
 
+    // parallel repack threads
+    GGML_BACKEND_API void ggml_cpu_set_repack_n_threads(int n_threads);
+    GGML_BACKEND_API int ggml_cpu_get_repack_n_threads(void);
+
     GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
     GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
     GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index c2de7b9adf..9040e85311 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -16,6 +16,8 @@
 #include 
 #include  // for GGML_ASSERT
 
+static int g_repack_n_threads = 1;
+
 #if defined(GGML_USE_OPENMP)
 #include <omp.h>
 #endif
@@ -52,6 +54,19 @@ static inline int nearest_int(float fval) {
 
 extern "C" {
 
+#if defined(GGML_USE_OPENMP)
+void ggml_cpu_set_repack_n_threads(int n_threads) {
+    g_repack_n_threads = n_threads;
+}
+
+int ggml_cpu_get_repack_n_threads(void) {
+    return g_repack_n_threads;
+}
+#else
+void ggml_cpu_set_repack_n_threads(int n_threads) {}
+int ggml_cpu_get_repack_n_threads(void) { return 0; }
+#endif
+
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
@@ -2192,20 +2207,28 @@ template name, ggml_type_name(t->type), (int) NB_COLS, (int) INTER_SIZE);
 #ifdef GGML_USE_OPENMP
-        #pragma omp parallel
-        {
-            ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
-            int r = ggml::cpu::repack::repack(t, data, data_size);
-            #pragma omp master
-            ret = r;
+        int n_threads = ggml_cpu_get_repack_n_threads();
+        GGML_ASSERT(n_threads >= 0);
+        if (n_threads == 0) {
+            n_threads = omp_get_max_threads();
+        }
+        if (n_threads > 1) {
+            #pragma omp parallel num_threads(n_threads)
+            {
+                ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
+                int r = ggml::cpu::repack::repack(t, data, data_size);
+                #pragma omp master
+                ret = r;
+            }
         }
-#else
-        ret = ggml::cpu::repack::repack(t, data, data_size);
 #endif
+        if (ret == -1) {
+            ret = ggml::cpu::repack::repack(t, data, data_size);
+        }
         return ret;
     }
 };
diff --git a/include/llama.h b/include/llama.h
index b52eaacfa7..a88cf0166a 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -313,6 +313,7 @@ extern "C" {
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host;         // bypass host buffer allowing extra buffers to be used
+        int32_t repack_n_threads; // number of threads to use for repacking
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 28f06b4e61..9613f1bc76 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7618,6 +7618,7 @@ llama_model_params llama_model_default_params() {
         /*.check_tensors    =*/ false,
         /*.use_extra_bufts  =*/ true,
         /*.no_host          =*/ false,
+        /*.repack_n_threads =*/ 0,
     };
 
     return result;
diff --git a/src/llama.cpp b/src/llama.cpp
index ab2e9868af..86c1e44ec7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -100,6 +100,7 @@ int64_t llama_time_us(void) {
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+    ggml_cpu_set_repack_n_threads(params.repack_n_threads);
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = 0;
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 0be6ed6948..dfbd3116f7 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -1072,6 +1072,7 @@ struct cmd_params_instance {
         mparams.tensor_split = tensor_split.data();
         mparams.use_mmap = use_mmap;
         mparams.no_host = no_host;
+        mparams.repack_n_threads = n_threads;
 
         if (n_cpu_moe <= 0) {
             if (tensor_buft_overrides.empty()) {
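For downstream code, the new control is just another field on `llama_model_params`, plus a direct setter on the CPU backend. A minimal usage sketch, assuming these patches are applied and the current llama.h loading API; the model path is a placeholder and error handling is elided:

```cpp
// Minimal sketch of using the repack thread-count control added by these
// patches; "model.gguf" is a placeholder path.
#include "llama.h"
#include "ggml-cpu.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.repack_n_threads = 16;   // match the -t / n_threads you will run inference with

    // Equivalent lower-level route (what llama_model_load() does internally
    // after these patches): tell the CPU backend directly.
    // ggml_cpu_set_repack_n_threads(16);

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model) {
        llama_model_free(model);
    }

    llama_backend_free();
    return 0;
}
```

With `repack_n_threads` left at 0, the OpenMP path falls back to `omp_get_max_threads()`, so the old behavior is preserved for callers that do not set the field.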