diff --git a/common/common.cpp b/common/common.cpp
index b76dfa10ea..ff2f7c6e1f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1348,6 +1348,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
     mparams.no_host         = params.no_host;
+    mparams.repack_n_threads = params.cpuparams.n_threads;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h
index 4f3b99c8d0..5d9f7b4d82 100644
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@@ -52,6 +52,10 @@ extern "C" {
     GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
     GGML_BACKEND_API void  ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
 
+    // parallel repack threads
+    GGML_BACKEND_API void ggml_cpu_set_repack_n_threads(int n_threads);
+    GGML_BACKEND_API int  ggml_cpu_get_repack_n_threads(void);
+
     GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new           (struct ggml_threadpool_params * params);
     GGML_BACKEND_API void                     ggml_threadpool_free          (struct ggml_threadpool * threadpool);
     GGML_BACKEND_API int                      ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp
index c2de7b9adf..9040e85311 100644
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -16,6 +16,8 @@
 #include <cassert>
 #include <cstdio>  // for GGML_ASSERT
 
+static int g_repack_n_threads = 1;
+
 #if defined(GGML_USE_OPENMP)
 #include <omp.h>
 #endif
@@ -52,6 +54,19 @@ static inline int nearest_int(float fval) {
 
 extern "C" {
 
+#if defined(GGML_USE_OPENMP)
+void ggml_cpu_set_repack_n_threads(int n_threads) {
+    g_repack_n_threads = n_threads;
+}
+
+int ggml_cpu_get_repack_n_threads(void) {
+    return g_repack_n_threads;
+}
+#else
+void ggml_cpu_set_repack_n_threads(int n_threads) {}
+int ggml_cpu_get_repack_n_threads(void) { return 0; }
+#endif
+
 void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
@@ -2192,20 +2207,28 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
         GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type), (int) NB_COLS, (int) INTER_SIZE);
         int ret = -1;
 #ifdef GGML_USE_OPENMP
-        #pragma omp parallel
-        {
-            ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
-            int r = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
-            #pragma omp master
-            ret = r;
+        int n_threads = ggml_cpu_get_repack_n_threads();
+        GGML_ASSERT(n_threads >= 0);
+        if (n_threads == 0) {
+            n_threads = omp_get_max_threads();
+        }
+        if (n_threads > 1) {
+            #pragma omp parallel num_threads(n_threads)
+            {
+                ggml_cpu_set_numa_thread_affinity(omp_get_thread_num());
+                int r = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+                #pragma omp master
+                ret = r;
+            }
         }
-#else
-        ret = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
 #endif
+        if (ret == -1) {
+            ret = ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(t, data, data_size);
+        }
         return ret;
     }
 };
diff --git a/include/llama.h b/include/llama.h
index b52eaacfa7..a88cf0166a 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -313,6 +313,7 @@ extern "C" {
         bool check_tensors;   // validate model tensor data
         bool use_extra_bufts; // use extra buffer types (used for weight repacking)
         bool no_host;         // bypass host buffer allowing extra buffers to be used
+        int32_t repack_n_threads; // number of threads to use for repacking
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 28f06b4e61..9613f1bc76 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7618,6 +7618,7 @@ llama_model_params llama_model_default_params() {
         /*.check_tensors   =*/ false,
         /*.use_extra_bufts =*/ true,
         /*.no_host         =*/ false,
+        /*.repack_n_threads =*/ 0,
     };
 
     return result;
diff --git a/src/llama.cpp b/src/llama.cpp
index ab2e9868af..86c1e44ec7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -100,6 +100,7 @@ int64_t llama_time_us(void) {
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
+    ggml_cpu_set_repack_n_threads(params.repack_n_threads);
     // loading time will be recalculated after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = 0;
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 0be6ed6948..dfbd3116f7 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -1072,6 +1072,7 @@ struct cmd_params_instance {
         mparams.tensor_split = tensor_split.data();
         mparams.use_mmap     = use_mmap;
         mparams.no_host      = no_host;
+        mparams.repack_n_threads = n_threads;
 
         if (n_cpu_moe <= 0) {
            if (tensor_buft_overrides.empty()) {
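
For direct users of the C API, a minimal usage sketch of the new field (assuming the patch above is applied): common_model_params_to_llama() and llama-bench already wire repack_n_threads to their respective thread counts, so the CLI tools need no changes. The value 8 below is an arbitrary example; per the patch, 0 means "use omp_get_max_threads()", while 1 (or a build without OpenMP) keeps repacking single-threaded via the ret == -1 fallback path. All other calls are existing llama.h API.

// sketch: choose an explicit thread count for weight repacking at model load time
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.repack_n_threads = 8; // e.g. match the number of physical cores (arbitrary example)

    // repacking of CPU-side weights happens during this call, now using up to 8 threads
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    // ... create a context and run inference as usual ...

    llama_model_free(model);
    llama_backend_free();
    return 0;
}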