fix: use physical cores for --threads auto-detect (#19110)
Replace std::thread::hardware_concurrency() with cpu_get_num_math() when --threads is set to -1 or 0 (auto-detect mode).

hardware_concurrency() returns the number of logical cores (which includes hyperthreads), causing thread oversubscription and performance degradation:
- 100% CPU usage instead of the optimal ~50%
- 3.6x slower (2.5 tok/s vs 9 tok/s reported)

cpu_get_num_math() returns the number of physical cores and also handles Intel hybrid CPUs by skipping efficiency cores for math workloads.

Fixes #19110
This commit is contained in:
parent
2dc3ce2166
commit
08a1f98eef
|
|
@ -1111,7 +1111,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
[](common_params & params, int value) {
|
||||
params.cpuparams.n_threads = value;
|
||||
if (params.cpuparams.n_threads <= 0) {
|
||||
params.cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||
params.cpuparams.n_threads = cpu_get_num_math();
|
||||
}
|
||||
}
|
||||
).set_env("LLAMA_ARG_THREADS"));
|
||||
|
|
@ -1121,7 +1121,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
[](common_params & params, int value) {
|
||||
params.cpuparams_batch.n_threads = value;
|
||||
if (params.cpuparams_batch.n_threads <= 0) {
|
||||
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||
params.cpuparams_batch.n_threads = cpu_get_num_math();
|
||||
}
|
||||
}
|
||||
));
|
||||
|
|
@ -3216,7 +3216,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
[](common_params & params, int value) {
|
||||
params.speculative.cpuparams.n_threads = value;
|
||||
if (params.speculative.cpuparams.n_threads <= 0) {
|
||||
params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
|
||||
params.speculative.cpuparams.n_threads = cpu_get_num_math();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
|
|
@ -3226,7 +3226,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
[](common_params & params, int value) {
|
||||
params.speculative.cpuparams_batch.n_threads = value;
|
||||
if (params.speculative.cpuparams_batch.n_threads <= 0) {
|
||||
params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
|
||||
params.speculative.cpuparams_batch.n_threads = cpu_get_num_math();
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
|
||||
|
|
|
|||
Loading…
Reference in New Issue