fix: use physical cores for --threads auto-detect (#19110)

Replace std::thread::hardware_concurrency() with cpu_get_num_math()
when --threads is set to -1 or 0 (auto-detect mode).

hardware_concurrency() returns logical cores (includes hyperthreads),
causing thread oversubscription and performance degradation:
- 100% CPU usage instead of optimal ~50%
- 3.6x slower (2.5 tok/s vs 9 tok/s reported)

cpu_get_num_math() returns physical cores and also handles Intel
hybrid CPUs by skipping efficiency cores for math workloads.

Fixes #19110
This commit is contained in:
ingyukoh 2026-02-02 12:46:23 +09:00
parent 2dc3ce2166
commit 08a1f98eef
1 changed file with 4 additions and 4 deletions

View File

@ -1111,7 +1111,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.cpuparams.n_threads = value;
if (params.cpuparams.n_threads <= 0) {
params.cpuparams.n_threads = std::thread::hardware_concurrency();
params.cpuparams.n_threads = cpu_get_num_math();
}
}
).set_env("LLAMA_ARG_THREADS"));
@ -1121,7 +1121,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.cpuparams_batch.n_threads = value;
if (params.cpuparams_batch.n_threads <= 0) {
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
params.cpuparams_batch.n_threads = cpu_get_num_math();
}
}
));
@ -3216,7 +3216,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.cpuparams.n_threads = value;
if (params.speculative.cpuparams.n_threads <= 0) {
params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
params.speculative.cpuparams.n_threads = cpu_get_num_math();
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
@ -3226,7 +3226,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.cpuparams_batch.n_threads = value;
if (params.speculative.cpuparams_batch.n_threads <= 0) {
params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
params.speculative.cpuparams_batch.n_threads = cpu_get_num_math();
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));