llama : add group split-mode to minimize GPU usage

Add support for the new `group` split mode to llama-bench's -sm/--split-mode parameter (usage text, mode-name lookup, and argument parsing).
This commit is contained in:
Daniel Andersen 2026-02-13 16:09:59 +00:00
parent b4078f77ab
commit 89c6cd3248
2 changed files with 7 additions and 3 deletions

View File

@@ -51,7 +51,7 @@ test parameters:
--poll <0...100> (default: 50)
-ngl, --n-gpu-layers <n> (default: 99)
-ncmoe, --n-cpu-moe <n> (default: 0)
-sm, --split-mode <none|layer|row> (default: layer)
-sm, --split-mode <none|layer|row|group> (default: layer)
-mg, --main-gpu <i> (default: 0)
-nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0)

View File

@@ -259,6 +259,8 @@ static const char * split_mode_str(llama_split_mode mode) {
return "layer";
case LLAMA_SPLIT_MODE_ROW:
return "row";
case LLAMA_SPLIT_MODE_GROUP:
return "group";
default:
GGML_ABORT("invalid split mode");
}
@@ -440,8 +442,8 @@ static void print_usage(int /* argc */, char ** argv) {
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
join(cmd_params_defaults.n_cpu_moe, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -sm, --split-mode <none|layer|row|group> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n",
join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n",
@@ -723,6 +725,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
mode = LLAMA_SPLIT_MODE_LAYER;
} else if (m == "row") {
mode = LLAMA_SPLIT_MODE_ROW;
} else if (m == "group") {
mode = LLAMA_SPLIT_MODE_GROUP;
} else {
invalid_param = true;
break;