parallel load: default to all GPU contexts

This commit is contained in:
Marko Tombak 2026-03-23 18:13:36 +02:00
parent 74faaaf7d2
commit 6403785b94
3 changed files with 3 additions and 3 deletions

View File

@ -2366,7 +2366,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_env("LLAMA_ARG_SPLIT_MODE"));
add_opt(common_arg(
{"-pl", "--parallel-load"}, "N",
string_format("max parallel jobs for model loading (default: %d, -1 = no cap (up to #contexts), 1 = sequential)", params.n_parallel_load),
"max parallel jobs for model loading (default: all GPUs, 1 = sequential)",
[](common_params & params, int value) {
params.n_parallel_load = value;
if (params.n_parallel_load <= 0) {

View File

@ -444,7 +444,7 @@ struct common_params {
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
int32_t n_parallel_load = 4; // number of threads for parallel model loading (-1 = unlimited, 1 = sequential)
int32_t n_parallel_load = -1; // max parallel jobs for model loading (-1 = all GPUs, 1 = sequential)
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;

View File

@ -7658,7 +7658,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
// load tensor data
const char * limit_env = getenv("LLAMA_ARG_PARALLEL_LOAD");
const size_t default_limit = 4;
const size_t default_limit = 0;
const int limit_val = limit_env ? atoi(limit_env) : (int) default_limit;
const size_t n_contexts = ctx_buf_maps.size();
const size_t parallel_limit = limit_val <= 0 ? n_contexts : (size_t) limit_val;