From 6403785b94ac2a202c6d75d1fa18c74b01a155e9 Mon Sep 17 00:00:00 2001
From: Marko Tombak <admin@hper.tech>
Date: Mon, 23 Mar 2026 18:13:36 +0200
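Subject: [PATCH] parallel load: default to all GPU contexts

Raise the default parallel model-loading limit from 4 jobs to "no cap".
common_params::n_parallel_load now defaults to -1, and the fallback used
by llama_model::load_tensors when LLAMA_ARG_PARALLEL_LOAD is unset now
defaults to 0; load_tensors treats any value <= 0 as a limit equal to the
number of contexts. The --parallel-load help text is updated to match.
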
---
 common/arg.cpp      | 2 +-
 common/common.h     | 2 +-
 src/llama-model.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 4a7d36e537..cbae37b746 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2366,7 +2366,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_SPLIT_MODE"));
     add_opt(common_arg(
         {"-pl", "--parallel-load"}, "N",
-        string_format("max parallel jobs for model loading (default: %d, -1 = no cap (up to #contexts), 1 = sequential)", params.n_parallel_load),
+        "max parallel jobs for model loading (default: all GPUs, 1 = sequential)",
         [](common_params & params, int value) {
             params.n_parallel_load = value;
             if (params.n_parallel_load <= 0) {
diff --git a/common/common.h b/common/common.h
index 471f60789e..91128003cc 100644
--- a/common/common.h
+++ b/common/common.h
@@ -444,7 +444,7 @@ struct common_params {
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
-    int32_t n_parallel_load = 4; // number of threads for parallel model loading (-1 = unlimited, 1 = sequential)
+    int32_t n_parallel_load = -1; // max parallel jobs for model loading (-1 = no cap, up to #contexts; 1 = sequential)
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 2297fd75f2..c86d4ea932 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -7658,7 +7658,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
     // load tensor data
     const char * limit_env = getenv("LLAMA_ARG_PARALLEL_LOAD");
-    const size_t default_limit = 4;
+    const size_t default_limit = 0; // 0 = no cap: resolves to n_contexts below
     const int limit_val = limit_env ? atoi(limit_env) : (int) default_limit;
     const size_t n_contexts = ctx_buf_maps.size();
     const size_t parallel_limit = limit_val <= 0 ? n_contexts : (size_t) limit_val;