ggml: optimize repack on NUMA by binding threads

When using the repack buffer type, physical memory placement is dictated
by the first-touch policy. Since the main thread performs all of the
write operations, the memory often ends up allocated on a single NUMA
node, leading to an uneven distribution of the weights across nodes.
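
For illustration, a minimal sketch of the first-touch behaviour (plain C,
not ggml code): a large allocation typically reserves only virtual address
space, and each physical page is placed on the NUMA node of whichever
thread writes it first, so a single-threaded fill puts the whole buffer on
one node.

    #include <stdlib.h>
    #include <string.h>

    // Touching a freshly allocated buffer from a single thread places every
    // page on that thread's NUMA node, which is the situation described above.
    static void first_touch_from_one_thread(size_t size) {
        char * buf = malloc(size);  // large allocations: address space only, no pages yet
        memset(buf, 0, size);       // first write: every page lands on the caller's node
        free(buf);
    }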

Multi-threaded repack can alleviate this problem, but because the threads
are not bound to NUMA nodes, page placement is still left to chance.

This patch applies the same thread affinity strategy (--numa distribute)
to the repacking phase. By binding the repack threads to the same nodes
as the compute threads, we ensure that weights are written (and thus
allocated) on the local NUMA node, minimizing cross-node memory access
during inference.
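
As a rough sketch of what distribute-style binding means for thread n
(using libnuma for brevity; the actual binding is done by
ggml_cpu_set_numa_thread_affinity() and the exact policy may differ):

    #include <numa.h>   // libnuma; link with -lnuma

    // Hypothetical stand-in for the distribute policy: round-robin thread
    // `thread_n` over the configured NUMA nodes so every node gets a share
    // of the writers, and therefore of the first-touched pages.
    static void bind_thread_to_node(int thread_n) {
        if (numa_available() < 0) {
            return;                              // system without NUMA support
        }
        const int n_nodes = numa_num_configured_nodes();
        numa_run_on_node(thread_n % n_nodes);    // restrict this thread to one node's CPUs
    }

A thread bound this way first-touches the pages it writes on its own node,
which is exactly the placement the repack phase needs.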

Performance on Intel Xeon Silver 4514Y (32 cores):
qwen3 8B  Q4_K: 19.39 -> 26.92 t/s (+39%)
qwen3 32B Q4_K:  4.99 ->  7.38 t/s (+48%)

Signed-off-by: Jianhui Zhou <jonaszhou@zhaoxin.com>
Author: Jianhui Zhou
Date:   2026-01-08 14:12:59 +00:00
parent  b1366757cf
commit  11b753e786
3 changed files with 27 additions and 10 deletions

@@ -517,6 +517,7 @@ static __m256 __lasx_xvreplfr2vr_s(const float val) {
// TODO: move to ggml-threading
void ggml_barrier(struct ggml_threadpool * tp);
+void ggml_cpu_set_numa_thread_affinity(int thread_n);
void ggml_threadpool_chunk_set(struct ggml_threadpool * tp, int value);
int ggml_threadpool_chunk_add(struct ggml_threadpool * tp, int value);

@@ -2087,7 +2087,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
// Android's libc implementation "bionic" does not support setting affinity
#if defined(__gnu_linux__)
-static void set_numa_thread_affinity(int thread_n) {
+void ggml_cpu_set_numa_thread_affinity(int thread_n) {
if (!ggml_is_numa()) {
return;
}
@@ -2155,7 +2155,7 @@ static void clear_numa_thread_affinity(void) {
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
+void ggml_cpu_set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); }
static void clear_numa_thread_affinity(void) {}
#endif
@@ -2923,7 +2923,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
const struct ggml_cgraph * cgraph = tp->cgraph;
const struct ggml_cplan * cplan = tp->cplan;
-set_numa_thread_affinity(state->ith);
+ggml_cpu_set_numa_thread_affinity(state->ith);
struct ggml_compute_params params = {
/*.ith =*/ state->ith,

@@ -16,6 +16,10 @@
#include <cassert>
#include <cstdio> // for GGML_ASSERT
+#if defined(GGML_USE_OPENMP)
+#include <omp.h>
+#endif
#include "repack.h"
#if defined(__GNUC__)
@@ -1429,7 +1433,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -1468,7 +1472,7 @@ static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -1507,7 +1511,7 @@ static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -1546,7 +1550,7 @@ static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -1621,7 +1625,7 @@ static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_b
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -1685,7 +1689,7 @@ static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_b
const int n_row_groups = nrow / nrows_interleaved;
#ifdef GGML_USE_OPENMP
-#pragma omp parallel for
+#pragma omp for schedule(static)
#endif
for (int bg = 0; bg < n_row_groups; bg++) {
const int b = bg * nrows_interleaved;
@@ -2188,9 +2192,21 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
}
int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
+int ret = 0;
GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
(int) NB_COLS, (int) INTER_SIZE);
-return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel
+{
+ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
+int r = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
+#pragma omp master
+ret = r;
+}
+#else
+ret = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
+#endif
+return ret;
}
};
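
For context, the resulting structure is roughly the following (a simplified
C sketch assuming GGML_USE_OPENMP; repack_rows() stands in for the
per-type repack_*_bl() helpers). The parallel region is opened once in
repack(), every thread binds itself before touching any weight data, and
the inner loops become orphaned worksharing constructs shared by the
already-bound team, which is why the pragmas above change from
"omp parallel for" to "omp for schedule(static)".

    #include <omp.h>

    void ggml_cpu_set_numa_thread_affinity(int thread_n);  // exported by this patch

    // Stand-in for repack_q4_K_to_q4_K_8_bl() and friends: an orphaned
    // `omp for` that splits the row groups across the enclosing team.
    static void repack_rows(int n_row_groups) {
        #pragma omp for schedule(static)
        for (int bg = 0; bg < n_row_groups; bg++) {
            // interleave and write row group bg; the pages written here are
            // first-touched by the bound thread, i.e. on its local NUMA node
        }
    }

    static int repack_entry(int n_row_groups) {
        int ret = 0;
        #pragma omp parallel
        {
            // bind before any weight data is written (same policy as the
            // compute threads, i.e. --numa distribute)
            ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
            repack_rows(n_row_groups);
            // the real override runs the templated repack here and keeps the
            // master thread's return value via `#pragma omp master`
        }
        return ret;
    }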