From 11b753e7863125169d58c822efe1d3fa1e3cb1d9 Mon Sep 17 00:00:00 2001
From: Jianhui Zhou
Date: Thu, 8 Jan 2026 14:12:59 +0000
Subject: [PATCH] ggml: optimize repack on NUMA by binding threads

When using the repack buffer type, the physical memory allocation is
dictated by the first-touch policy. Since the main thread performs the
write operations, memory is often allocated on a single NUMA node,
leading to an uneven weight distribution. Multi-threaded repack can
alleviate this problem, but the threads are not bound to NUMA nodes.

This patch applies the same thread affinity strategy (--numa distribute)
to the repacking phase. By binding the repack threads to the same nodes
as the compute threads, we ensure that the weights are written (and thus
allocated) on the local NUMA node, minimizing cross-node memory access
during inference.

Performance on Intel Xeon Silver 4514Y (32 cores):

  qwen3 8B Q4_K:  19.39 -> 26.92 t/s (+39%)
  qwen3 32B Q4_K:  4.99 ->  7.38 t/s (+48%)

Signed-off-by: Jianhui Zhou
---
 ggml/src/ggml-cpu/ggml-cpu-impl.h |  1 +
 ggml/src/ggml-cpu/ggml-cpu.c      |  6 +++---
 ggml/src/ggml-cpu/repack.cpp      | 30 +++++++++++++++++++++++-------
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
index 7597377cc2..548dcf979d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -517,6 +517,7 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);
+void ggml_cpu_set_numa_thread_affinity(int thread_n);
 
 void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
 int  ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a59b518938..a1ebf4955d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2087,7 +2087,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
-static void set_numa_thread_affinity(int thread_n) {
+void ggml_cpu_set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
     }
@@ -2155,7 +2155,7 @@ static void clear_numa_thread_affinity(void) {
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
+void ggml_cpu_set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
@@ -2923,7 +2923,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = tp->cgraph;
     const struct ggml_cplan * cplan = tp->cplan;
 
-    set_numa_thread_affinity(state->ith);
+    ggml_cpu_set_numa_thread_affinity(state->ith);
 
     struct ggml_compute_params params = {
         /*.ith =*/ state->ith,
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index b9727e6c9c..c2de7b9adf 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -16,6 +16,10 @@
 #include <cassert>
 #include <cstdio>  // for GGML_ASSERT
 
+#if defined(GGML_USE_OPENMP)
+#include <omp.h>
+#endif
+
 #include "repack.h"
 
 #if defined(__GNUC__)
@@ -1429,7 +1433,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1468,7 +1472,7 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1507,7 +1511,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1546,7 +1550,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1621,7 +1625,7 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1685,7 +1689,7 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -2188,9 +2192,21 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
     int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
         GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                        (int) NB_COLS, (int) INTER_SIZE);
-        return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+        int ret;
+#ifdef GGML_USE_OPENMP
+        #pragma omp parallel
+        {
+            ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
+            int r = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+            #pragma omp master
+            ret = r;
+        }
+#else
+        ret = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+#endif
+        return ret;
     }
 };
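
Note for reviewers (illustration only, not part of the patch): the commit message leans on
Linux's first-touch policy, i.e. a page is physically allocated on the NUMA node of the CPU
that first writes to it. The standalone sketch below shows that pattern with plain OpenMP:
each thread pins itself before writing its own slice of a buffer, so every slice ends up
backed by node-local memory. The helper pin_to_cpu() and the assumption that CPU ids
0..nthreads-1 are spread round-robin across nodes are hypothetical simplifications; in ggml
the actual binding is done by ggml_cpu_set_numa_thread_affinity() following the
--numa distribute strategy.

/* first_touch_demo.c -- illustrative sketch only, not ggml code.
 * Build (Linux/glibc): gcc -O2 -fopenmp first_touch_demo.c -o first_touch_demo
 */
#define _GNU_SOURCE
#include <sched.h>   /* cpu_set_t, CPU_ZERO, CPU_SET, sched_setaffinity */
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pin the calling thread to one logical CPU (hypothetical helper, not a ggml API). */
static void pin_to_cpu(int cpu) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);
    sched_setaffinity(0, sizeof(mask), &mask); /* pid 0 = calling thread */
}

int main(void) {
    const size_t nbytes = (size_t) 512 << 20;  /* 512 MiB stand-in for a weight buffer */
    char * buf = aligned_alloc(4096, nbytes);  /* pages are not physically backed yet  */
    if (buf == NULL) {
        return 1;
    }

    /* Each thread pins itself (naive round-robin over CPU ids, in the spirit of
     * --numa distribute) and then writes only its own chunk. That first write is the
     * "first touch": the kernel backs those pages with memory local to the writing
     * thread's node, instead of putting the whole buffer on the main thread's node. */
    #pragma omp parallel
    {
        const int ith = omp_get_thread_num();
        const int nth = omp_get_num_threads();
        pin_to_cpu(ith); /* assumes CPU ids 0..nth-1 exist */

        const size_t chunk = (nbytes + nth - 1) / nth;
        const size_t beg   = (size_t) ith * chunk;
        const size_t end   = beg + chunk < nbytes ? beg + chunk : nbytes;
        if (beg < end) {
            memset(buf + beg, 0, end - beg); /* first touch happens here */
        }
    }

    printf("touched %zu MiB with up to %d pinned threads\n", nbytes >> 20, omp_get_max_threads());
    free(buf);
    return 0;
}

On a two-node machine, inspecting such a process with numastat -p should show the buffer
split across both nodes, whereas the same memset performed only by the main thread lands
almost entirely on one node; that is the imbalance this patch removes for repacked weights.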