parallel load: default to all GPU contexts

2026-03-23 18:13:36 +02:00 · 2026-03-23 18:13:36 +02:00 · 6403785b94
parent 74faaaf7d2
commit 6403785b94
3 changed files with 3 additions and 3 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2366,7 +2366,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_env("LLAMA_ARG_SPLIT_MODE"));
    add_opt(common_arg(
        {"-pl", "--parallel-load"}, "N",
-        string_format("max parallel jobs for model loading (default: %d, -1 = no cap (up to #contexts), 1 = sequential)", params.n_parallel_load),
+        "max parallel jobs for model loading (default: all GPUs, 1 = sequential)",
        [](common_params & params, int value) {
            params.n_parallel_load = value;
            if (params.n_parallel_load <= 0) {
--- a/common/common.h
+++ b/common/common.h
@@ -444,7 +444,7 @@ struct common_params {

    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

-    int32_t n_parallel_load = 4; // number of threads for parallel model loading (-1 = unlimited, 1 = sequential)
+    int32_t n_parallel_load = -1; // max parallel jobs for model loading (-1 = all GPUs, 1 = sequential)

    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7658,7 +7658,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

    // load tensor data
    const char * limit_env = getenv("LLAMA_ARG_PARALLEL_LOAD");
-    const size_t default_limit = 4;
+    const size_t default_limit = 0;
    const int limit_val = limit_env ? atoi(limit_env) : (int) default_limit;
    const size_t n_contexts = ctx_buf_maps.size();
    const size_t parallel_limit = limit_val <= 0 ? n_contexts : (size_t) limit_val;