ggml: optimize repack on NUMA by binding threads

When using the repack buffer type, physical memory placement is dictated
by the first-touch policy. Since the main thread performs all of the
write operations, the memory often ends up allocated on a single NUMA
node, leading to an uneven distribution of the weights across nodes.
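
For illustration, a minimal sketch of the first-touch behaviour (plain C,
not ggml code): a large allocation typically reserves only virtual address
space, and each physical page is placed on the NUMA node of whichever
thread writes it first, so a single-threaded fill puts the whole buffer on
one node.

    #include <stdlib.h>
    #include <string.h>

    // Touching a freshly allocated buffer from a single thread places every
    // page on that thread's NUMA node, which is the situation described above.
    static void first_touch_from_one_thread(size_t size) {
        char * buf = malloc(size);  // large allocations: address space only, no pages yet
        memset(buf, 0, size);       // first write: every page lands on the caller's node
        free(buf);
    }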

Multi-threaded repack can alleviate this problem, but because the threads
are not bound to NUMA nodes, page placement is still left to chance.

This patch applies the same thread affinity strategy (--numa distribute)
to the repacking phase. By binding the repack threads to the same nodes
as the compute threads, we ensure that weights are written (and thus
allocated) on the local NUMA node, minimizing cross-node memory access
during inference.
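
As a rough sketch of what distribute-style binding means for thread n
(using libnuma for brevity; the actual binding is done by
ggml_cpu_set_numa_thread_affinity() and the exact policy may differ):

    #include <numa.h>   // libnuma; link with -lnuma

    // Hypothetical stand-in for the distribute policy: round-robin thread
    // `thread_n` over the configured NUMA nodes so every node gets a share
    // of the writers, and therefore of the first-touched pages.
    static void bind_thread_to_node(int thread_n) {
        if (numa_available() < 0) {
            return;                              // system without NUMA support
        }
        const int n_nodes = numa_num_configured_nodes();
        numa_run_on_node(thread_n % n_nodes);    // restrict this thread to one node's CPUs
    }

A thread bound this way first-touches the pages it writes on its own node,
which is exactly the placement the repack phase needs.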

Performance on Intel Xeon Silver 4514Y (32 cores):
qwen3 8B  Q4_K: 19.39 -> 26.92 t/s (+39%)
qwen3 32B Q4_K:  4.99 ->  7.38 t/s (+48%)

Signed-off-by: Jianhui Zhou <jonaszhou@zhaoxin.com>
Author: Jianhui Zhou
Date:   2026-01-08 14:12:59 +00:00
parent  b1366757cf
commit  11b753e786
3 changed files with 27 additions and 10 deletions

@@ -517,6 +517,7 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
// TODO: move to ggml-threading
void ggml_barrier(struct ggml_threadpool * tp);
+void ggml_cpu_set_numa_thread_affinity(int thread_n);
void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);

@@ -2087,7 +2087,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__gnu_linux__)
-static void set_numa_thread_affinity(int thread_n) {
+void ggml_cpu_set_numa_thread_affinity(int thread_n) {
if (!ggml_is_numa()) {
return;
}
@@ -2155,7 +2155,7 @@ static void clear_numa_thread_affinity(void) {
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
+void ggml_cpu_set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
static void clear_numa_thread_affinity(void) {}
#endif
@@ -2923,7 +2923,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
const struct ggml_cgraph * cgraph = tp->cgraph;
const struct ggml_cplan * cplan = tp->cplan;
-set_numa_thread_affinity(state->ith);
+ggml_cpu_set_numa_thread_affinity(state->ith);
struct ggml_compute_params params = {
/*.ith =*/ state->ith,

@@ -16,6 +16,10 @@
#include <cassert>
#include <cstdio> // for GGML_ASSERT
+#if defined(GGML_USE_OPENMP)
+#include <omp.h>
+#endif
#include "repack.h"
#if defined(__GNUC__)
@@ -1429,7 +1433,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -1468,7 +1472,7 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -1507,7 +1511,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -1546,7 +1550,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -1621,7 +1625,7 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -1685,7 +1689,7 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -2188,9 +2192,21 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
}
int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
+int ret = 0;
GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
(int) NB_COLS, (int) INTER_SIZE);
-return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel
+{
+ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
+int r = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
+#pragma omp master
+ret = r;
+}
+#else
+ret = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
+#endif
+return ret;
}
};
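
For context, the resulting structure is roughly the following (a simplified
C sketch assuming GGML_USE_OPENMP; repack_rows() stands in for the
per-type repack_*_bl() helpers). The parallel region is opened once in
repack(), every thread binds itself before touching any weight data, and
the inner loops become orphaned worksharing constructs shared by the
already-bound team, which is why the pragmas above change from
"omp parallel for" to "omp for schedule(static)".

    #include <omp.h>

    void ggml_cpu_set_numa_thread_affinity(int thread_n);  // exported by this patch

    // Stand-in for repack_q4_K_to_q4_K_8_bl() and friends: an orphaned
    // `omp for` that splits the row groups across the enclosing team.
    static void repack_rows(int n_row_groups) {
        #pragma omp for schedule(static)
        for (int bg = 0; bg < n_row_groups; bg++) {
            // interleave and write row group bg; the pages written here are
            // first-touched by the bound thread, i.e. on its local NUMA node
        }
    }

    static int repack_entry(int n_row_groups) {
        int ret = 0;
        #pragma omp parallel
        {
            // bind before any weight data is written (same policy as the
            // compute threads, i.e. --numa distribute)
            ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
            repack_rows(n_row_groups);
            // the real override runs the templated repack here and keeps the
            // master thread's return value via `#pragma omp master`
        }
        return ret;
    }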