parallel load: default to all GPU contexts
This commit is contained in:
parent
74faaaf7d2
commit
6403785b94
|
|
@ -2366,7 +2366,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
).set_env("LLAMA_ARG_SPLIT_MODE"));
|
||||
add_opt(common_arg(
|
||||
{"-pl", "--parallel-load"}, "N",
|
||||
string_format("max parallel jobs for model loading (default: %d, -1 = no cap (up to #contexts), 1 = sequential)", params.n_parallel_load),
|
||||
"max parallel jobs for model loading (default: all GPUs, 1 = sequential)",
|
||||
[](common_params & params, int value) {
|
||||
params.n_parallel_load = value;
|
||||
if (params.n_parallel_load <= 0) {
|
||||
|
|
|
|||
|
|
@ -444,7 +444,7 @@ struct common_params {
|
|||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
|
||||
int32_t n_parallel_load = 4; // number of threads for parallel model loading (-1 = unlimited, 1 = sequential)
|
||||
int32_t n_parallel_load = -1; // max parallel jobs for model loading (-1 = all GPUs, 1 = sequential)
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
|
|
|
|||
|
|
@ -7658,7 +7658,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|||
|
||||
// load tensor data
|
||||
const char * limit_env = getenv("LLAMA_ARG_PARALLEL_LOAD");
|
||||
const size_t default_limit = 4;
|
||||
const size_t default_limit = 0;
|
||||
const int limit_val = limit_env ? atoi(limit_env) : (int) default_limit;
|
||||
const size_t n_contexts = ctx_buf_maps.size();
|
||||
const size_t parallel_limit = limit_val <= 0 ? n_contexts : (size_t) limit_val;
|
||||
|
|
|
|||
Loading…
Reference in New Issue