From 11b753e7863125169d58c822efe1d3fa1e3cb1d9 Mon Sep 17 00:00:00 2001
From: Jianhui Zhou
Date: Thu, 8 Jan 2026 14:12:59 +0000
Subject: [PATCH] ggml: optimize repack on NUMA by binding threads

When using the repack buffer type, the physical memory allocation is
dictated by the first-touch policy. Since the main thread performs the
write operations, memory is often allocated on a single NUMA node,
leading to an uneven weight distribution. Multi-threaded repack can
alleviate this problem, but the threads are not bound to NUMA nodes.

This patch applies the same thread affinity strategy (--numa distribute)
to the repacking phase. By binding the repack threads to the same nodes
as the compute threads, we ensure that the weights are written (and thus
allocated) on the local NUMA node, minimizing cross-node memory access
during inference.

Performance on Intel Xeon Silver 4514Y (32 cores):

  qwen3 8B Q4_K:  19.39 -> 26.92 t/s (+39%)
  qwen3 32B Q4_K:  4.99 ->  7.38 t/s (+48%)

Signed-off-by: Jianhui Zhou
---
 ggml/src/ggml-cpu/ggml-cpu-impl.h |  1 +
 ggml/src/ggml-cpu/ggml-cpu.c      |  6 +++---
 ggml/src/ggml-cpu/repack.cpp      | 30 +++++++++++++++++++++++-------
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
index 7597377cc2..548dcf979d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -517,6 +517,7 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
 
 // TODO: move to ggml-threading
 void ggml_barrier(struct ggml_threadpool * tp);
+void ggml_cpu_set_numa_thread_affinity(int thread_n);
 
 void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
 int  ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index a59b518938..a1ebf4955d 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2087,7 +2087,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
 
 // Android's libc implementation "bionic" does not support setting affinity
 #if defined(__gnu_linux__)
-static void set_numa_thread_affinity(int thread_n) {
+void ggml_cpu_set_numa_thread_affinity(int thread_n) {
     if (!ggml_is_numa()) {
         return;
     }
@@ -2155,7 +2155,7 @@ static void clear_numa_thread_affinity(void) {
 #else
 // TODO: Windows etc.
 // (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
+void ggml_cpu_set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
 static void clear_numa_thread_affinity(void) {}
 #endif
 
@@ -2923,7 +2923,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = tp->cgraph;
     const struct ggml_cplan * cplan = tp->cplan;
 
-    set_numa_thread_affinity(state->ith);
+    ggml_cpu_set_numa_thread_affinity(state->ith);
 
     struct ggml_compute_params params = {
         /*.ith =*/ state->ith,
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index b9727e6c9c..c2de7b9adf 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -16,6 +16,10 @@
 #include <cassert>
 #include <cstdio>  // for GGML_ASSERT
 
+#if defined(GGML_USE_OPENMP)
+#include <omp.h>
+#endif
+
 #include "repack.h"
 
 #if defined(__GNUC__)
@@ -1429,7 +1433,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1468,7 +1472,7 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1507,7 +1511,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1546,7 +1550,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1621,7 +1625,7 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -1685,7 +1689,7 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
     const int n_row_groups = nrow / nrows_interleaved;
 
 #ifdef GGML_USE_OPENMP
-    #pragma omp parallel for
+#pragma omp for schedule(static)
 #endif
     for (int bg = 0; bg < n_row_groups; bg++) {
         const int b = bg * nrows_interleaved;
@@ -2188,9 +2192,21 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
     int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
         GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                        (int) NB_COLS, (int) INTER_SIZE);
-        return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+        int ret;
+#ifdef GGML_USE_OPENMP
+        #pragma omp parallel
+        {
+            ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
+            int r = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+            #pragma omp master
+            ret = r;
+        }
+#else
+        ret = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+#endif
+        return ret;
     }
 };
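
Note for reviewers (illustration only, not part of the patch): the commit message leans on
Linux's first-touch policy, i.e. a page is physically allocated on the NUMA node of the CPU
that first writes to it. The standalone sketch below shows that pattern with plain OpenMP:
each thread pins itself before writing its own slice of a buffer, so every slice ends up
backed by node-local memory. The helper pin_to_cpu() and the assumption that CPU ids
0..nthreads-1 are spread round-robin across nodes are hypothetical simplifications; in ggml
the actual binding is done by ggml_cpu_set_numa_thread_affinity() following the
--numa distribute strategy.

/* first_touch_demo.c -- illustrative sketch only, not ggml code.
 * Build (Linux/glibc): gcc -O2 -fopenmp first_touch_demo.c -o first_touch_demo
 */
#define _GNU_SOURCE
#include <sched.h>   /* cpu_set_t, CPU_ZERO, CPU_SET, sched_setaffinity */
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pin the calling thread to one logical CPU (hypothetical helper, not a ggml API). */
static void pin_to_cpu(int cpu) {
    cpu_set_t mask;
    CPU_ZERO(&mask);
    CPU_SET(cpu, &mask);
    sched_setaffinity(0, sizeof(mask), &mask); /* pid 0 = calling thread */
}

int main(void) {
    const size_t nbytes = (size_t) 512 << 20;  /* 512 MiB stand-in for a weight buffer */
    char * buf = aligned_alloc(4096, nbytes);  /* pages are not physically backed yet  */
    if (buf == NULL) {
        return 1;
    }

    /* Each thread pins itself (naive round-robin over CPU ids, in the spirit of
     * --numa distribute) and then writes only its own chunk. That first write is the
     * "first touch": the kernel backs those pages with memory local to the writing
     * thread's node, instead of putting the whole buffer on the main thread's node. */
    #pragma omp parallel
    {
        const int ith = omp_get_thread_num();
        const int nth = omp_get_num_threads();
        pin_to_cpu(ith); /* assumes CPU ids 0..nth-1 exist */

        const size_t chunk = (nbytes + nth - 1) / nth;
        const size_t beg   = (size_t) ith * chunk;
        const size_t end   = beg + chunk < nbytes ? beg + chunk : nbytes;
        if (beg < end) {
            memset(buf + beg, 0, end - beg); /* first touch happens here */
        }
    }

    printf("touched %zu MiB with up to %d pinned threads\n", nbytes >> 20, omp_get_max_threads());
    free(buf);
    return 0;
}

On a two-node machine, inspecting such a process with numastat -p should show the buffer
split across both nodes, whereas the same memset performed only by the main thread lands
almost entirely on one node; that is the imbalance this patch removes for repacked weights.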