From 6403785b94ac2a202c6d75d1fa18c74b01a155e9 Mon Sep 17 00:00:00 2001 From: Marko Tombak Date: Mon, 23 Mar 2026 18:13:36 +0200 Subject: [PATCH] parallel load: default to all GPU contexts --- common/arg.cpp | 2 +- common/common.h | 2 +- src/llama-model.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 4a7d36e537..cbae37b746 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2366,7 +2366,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_SPLIT_MODE")); add_opt(common_arg( {"-pl", "--parallel-load"}, "N", - string_format("max parallel jobs for model loading (default: %d, -1 = no cap (up to #contexts), 1 = sequential)", params.n_parallel_load), + "max parallel jobs for model loading (default: all GPUs, 1 = sequential)", [](common_params & params, int value) { params.n_parallel_load = value; if (params.n_parallel_load <= 0) { diff --git a/common/common.h b/common/common.h index 471f60789e..91128003cc 100644 --- a/common/common.h +++ b/common/common.h @@ -444,7 +444,7 @@ struct common_params { enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs - int32_t n_parallel_load = 4; // number of threads for parallel model loading (-1 = unlimited, 1 = sequential) + int32_t n_parallel_load = -1; // max parallel jobs for model loading (-1 = all GPUs, 1 = sequential) struct cpu_params cpuparams; struct cpu_params cpuparams_batch; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2297fd75f2..c86d4ea932 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -7658,7 +7658,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // load tensor data const char * limit_env = getenv("LLAMA_ARG_PARALLEL_LOAD"); - const size_t default_limit = 4; + const size_t default_limit = 0; const int limit_val = limit_env ? atoi(limit_env) : (int) default_limit; const size_t n_contexts = ctx_buf_maps.size(); const size_t parallel_limit = limit_val <= 0 ? n_contexts : (size_t) limit_val;