ggml: Add thread count control during repacking

This change makes the repack stage use the user-specified thread
count, so that both the logical thread IDs and the total number of
threads stay consistent between the repack and inference stages.

In a NUMA architecture where the `--numa distribute` parameter is used,
logical threads are pinned to specific physical NUMA nodes. By aligning
the thread configuration across these two stages, we can fully leverage
the operating system's "first-touch" memory allocation policy:

1. Repack Stage: Logical thread i (bound to NUMA node j) is responsible
   for repacking and writing the weight data. Since the "first touch"
   occurs within this thread, the corresponding physical memory is
   allocated on node j.

2. Inference Stage: The same logical thread i (still bound to node j)
   reads these weights. Since the data already resides on the local
   node, low-latency local memory access is achieved.

Without a consistent thread count across the two stages, repacked
weights may be allocated on nodes that do not match the threads that
later read them, causing significant cross-node access overhead during
inference.
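
As a standalone illustration of the policy, the sketch below shows how first-touch plays out when each pinned thread initializes its own slice of a buffer (a hypothetical example, not part of this change; `first_touch_fill` and its arguments are made up for illustration):

    #include <omp.h>
    #include <stddef.h>

    // Each pinned thread writes its own contiguous slice, so the pages of that
    // slice are physically allocated on that thread's local NUMA node. When the
    // same thread reads the slice later, the access stays node-local.
    static void first_touch_fill(float * buf, size_t n, int n_threads) {
        #pragma omp parallel num_threads(n_threads)
        {
            const int    tid   = omp_get_thread_num();
            const size_t chunk = (n + (size_t) n_threads - 1) / (size_t) n_threads;
            const size_t begin = (size_t) tid * chunk;
            const size_t end   = begin + chunk < n ? begin + chunk : n;
            for (size_t i = begin; i < end; ++i) {
                buf[i] = 0.0f; // the first write decides which node backs this page
            }
        }
    }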

Signed-off-by: Jianhui Zhou <jonaszhou@zhaoxin.com>
Jianhui Zhou 2026-01-13 07:36:31 +00:00
parent 11b753e786
commit 5714d4b86e
7 changed files with 41 additions and 9 deletions

@@ -1348,6 +1348,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
mparams.no_host = params.no_host;
mparams.repack_n_threads = params.cpuparams.n_threads;
if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;

@@ -52,6 +52,10 @@ extern "C" {
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
// parallel repack threads
GGML_BACKEND_API void ggml_cpu_set_repack_n_threads(int n_threads);
GGML_BACKEND_API int ggml_cpu_get_repack_n_threads(void);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
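
A minimal usage sketch for the new pair of entry points (hedged: in a build without OpenMP the setter is a no-op and the getter returns 0, as the implementation further down shows; the value 32 is only an example):

    // request that weight repacking run with the same thread count as inference
    ggml_cpu_set_repack_n_threads(32);

    // 0 means "unset"; the repack code then falls back to omp_get_max_threads()
    int repack_threads = ggml_cpu_get_repack_n_threads();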

@@ -16,6 +16,8 @@
#include <cassert>
#include <cstdio> // for GGML_ASSERT
static int g_repack_n_threads = 1;
#if defined(GGML_USE_OPENMP)
#include <omp.h>
#endif
@@ -52,6 +54,19 @@ static inline int nearest_int(float fval) {
extern "C" {
#if defined(GGML_USE_OPENMP)
void ggml_cpu_set_repack_n_threads(int n_threads) {
g_repack_n_threads = n_threads;
}
int ggml_cpu_get_repack_n_threads(void) {
return g_repack_n_threads;
}
#else
void ggml_cpu_set_repack_n_threads(int n_threads) {}
int ggml_cpu_get_repack_n_threads(void) { return 0; }
#endif
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
@@ -2192,20 +2207,28 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
}
int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
int ret = 0;
int ret = -1;
GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
(int) NB_COLS, (int) INTER_SIZE);
#ifdef GGML_USE_OPENMP
#pragma omp parallel
{
ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
int r = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
#pragma omp master
ret = r;
int n_threads = ggml_cpu_get_repack_n_threads();
GGML_ASSERT(n_threads >= 0);
if (n_threads == 0) {
n_threads = omp_get_max_threads();
}
if (n_threads > 1) {
#pragma omp parallel num_threads(n_threads)
{
ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
int r = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
#pragma omp master
ret = r;
}
}
#else
ret = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
#endif
if (ret == -1) {
ret = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
}
return ret;
}
};
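
Pieced together, the new OpenMP path of `repack()` reads roughly as follows (a sketch; `do_repack` is a hypothetical stand-in for the templated `ggml::cpu::repack::repack` call, everything else mirrors the change above):

    int ret = -1;
    int n_threads = ggml_cpu_get_repack_n_threads();
    GGML_ASSERT(n_threads >= 0);
    if (n_threads == 0) {
        n_threads = omp_get_max_threads();      // "unset": use the OpenMP maximum
    }
    if (n_threads > 1) {
        #pragma omp parallel num_threads(n_threads)
        {
            // pin this logical thread before it writes any repacked data, so
            // first-touch places the pages on the thread's NUMA node
            ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
            int r = do_repack(t, data, data_size);
            #pragma omp master
            ret = r;
        }
    }
    if (ret == -1) {
        ret = do_repack(t, data, data_size);    // serial path when n_threads <= 1
    }

Since `ret` starts at -1, the `if (ret == -1)` call after the `#endif` doubles as the serial fallback whenever the parallel region is skipped.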

@@ -313,6 +313,7 @@ extern "C" {
bool check_tensors; // validate model tensor data
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
bool no_host; // bypass host buffer allowing extra buffers to be used
int32_t repack_n_threads; // number of threads to use for repacking
};
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations

@@ -7618,6 +7618,7 @@ llama_model_params llama_model_default_params() {
/*.check_tensors =*/ false,
/*.use_extra_bufts =*/ true,
/*.no_host =*/ false,
/*.repack_n_threads =*/ 0,
};
return result;
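
For a caller that uses libllama directly rather than going through common_params, the new field is set on llama_model_params before loading (a hedged sketch assuming the usual llama_model_load_from_file entry point; the thread count 16 is only an example, and 0, the default shown above, lets the repack code pick the OpenMP maximum):

    #include "llama.h"

    llama_model_params mparams = llama_model_default_params();
    mparams.repack_n_threads = 16; // match the thread count that will be used for inference
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);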

@@ -100,6 +100,7 @@ int64_t llama_time_us(void) {
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
ggml_cpu_set_repack_n_threads(params.repack_n_threads);
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = 0;

@@ -1072,6 +1072,7 @@ struct cmd_params_instance {
mparams.tensor_split = tensor_split.data();
mparams.use_mmap = use_mmap;
mparams.no_host = no_host;
mparams.repack_n_threads = n_threads;
if (n_cpu_moe <= 0) {
if (tensor_buft_overrides.empty()) {