llama-bench: add `-fitc` and `-fitt` to arguments (#21304)

* llama-bench: add `-fitc` and `-fitt` to arguments

* update README.md

* address review comments

* update compare-llama-bench.py
This commit is contained in:
Aman Gupta 2026-04-06 22:26:02 +08:00 committed by GitHub
parent 4aa962e2b0
commit 94ca829b60
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 107 additions and 6 deletions

View File

@@ -29,7 +29,8 @@ LLAMA_BENCH_DB_FIELDS = [
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth",
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", "n_cpu_moe" "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", "n_cpu_moe",
"fit_target", "fit_min_ctx"
] ]
LLAMA_BENCH_DB_TYPES = [ LLAMA_BENCH_DB_TYPES = [
@@ -39,6 +40,7 @@ LLAMA_BENCH_DB_TYPES = [
"TEXT", "INTEGER", "INTEGER", "INTEGER", "TEXT", "TEXT", "TEXT", "INTEGER", "INTEGER", "INTEGER", "TEXT", "TEXT",
"INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
"TEXT", "INTEGER", "INTEGER", "REAL", "REAL", "INTEGER", "TEXT", "INTEGER", "INTEGER", "REAL", "REAL", "INTEGER",
"INTEGER", "INTEGER"
] ]
# All test-backend-ops SQL fields # All test-backend-ops SQL fields
@@ -61,7 +63,8 @@ assert len(TEST_BACKEND_OPS_DB_FIELDS) == len(TEST_BACKEND_OPS_DB_TYPES)
LLAMA_BENCH_KEY_PROPERTIES = [ LLAMA_BENCH_KEY_PROPERTIES = [
"cpu_info", "gpu_info", "backends", "n_gpu_layers", "n_cpu_moe", "tensor_buft_overrides", "model_filename", "model_type", "cpu_info", "gpu_info", "backends", "n_gpu_layers", "n_cpu_moe", "tensor_buft_overrides", "model_filename", "model_type",
"n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", "n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v",
"use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth" "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth",
"fit_target", "fit_min_ctx"
] ]
# Properties by which to differentiate results per commit for test-backend-ops: # Properties by which to differentiate results per commit for test-backend-ops:

View File

@@ -62,6 +62,8 @@ test parameters:
-ot --override-tensors <tensor name pattern>=<buffer type>;... -ot --override-tensors <tensor name pattern>=<buffer type>;...
(default: disabled) (default: disabled)
-nopo, --no-op-offload <0|1> (default: 0) -nopo, --no-op-offload <0|1> (default: 0)
-fitt, --fit-target <MiB> fit model to device memory with this margin per device in MiB (default: off)
-fitc, --fit-ctx <n> minimum ctx size for --fit-target (default: 4096)
Multiple values can be given for each parameter by separating them with ',' Multiple values can be given for each parameter by separating them with ','
or by specifying the parameter multiple times. Ranges can be given as or by specifying the parameter multiple times. Ranges can be given as

View File

@@ -342,6 +342,8 @@ struct cmd_params {
std::vector<bool> embeddings; std::vector<bool> embeddings;
std::vector<bool> no_op_offload; std::vector<bool> no_op_offload;
std::vector<bool> no_host; std::vector<bool> no_host;
std::vector<size_t> fit_params_target;
std::vector<uint32_t> fit_params_min_ctx;
ggml_numa_strategy numa; ggml_numa_strategy numa;
int reps; int reps;
ggml_sched_priority prio; ggml_sched_priority prio;
@@ -384,6 +386,8 @@ static const cmd_params cmd_params_defaults = {
/* embeddings */ { false }, /* embeddings */ { false },
/* no_op_offload */ { false }, /* no_op_offload */ { false },
/* no_host */ { false }, /* no_host */ { false },
/* fit_params_target */ { 0 },
/* fit_params_min_ctx */ { 0 },
/* numa */ GGML_NUMA_STRATEGY_DISABLED, /* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5, /* reps */ 5,
/* prio */ GGML_SCHED_PRIO_NORMAL, /* prio */ GGML_SCHED_PRIO_NORMAL,
@@ -410,6 +414,8 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -v, --verbose verbose output\n"); printf(" -v, --verbose verbose output\n");
printf(" --progress print test progress indicators\n"); printf(" --progress print test progress indicators\n");
printf(" --no-warmup skip warmup runs before benchmarking\n"); printf(" --no-warmup skip warmup runs before benchmarking\n");
printf(" -fitt, --fit-target <MiB> fit model to device memory with this margin per device in MiB (default: off)\n");
printf(" -fitc, --fit-ctx <n> minimum ctx size for --fit-target (default: 4096)\n");
if (llama_supports_rpc()) { if (llama_supports_rpc()) {
printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n"); printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n");
} }
@@ -958,6 +964,24 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.progress = true; params.progress = true;
} else if (arg == "--no-warmup") { } else if (arg == "--no-warmup") {
params.no_warmup = true; params.no_warmup = true;
} else if (arg == "-fitt" || arg == "--fit-target") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
for (const auto & v : p) {
params.fit_params_target.push_back(std::stoull(v));
}
} else if (arg == "-fitc" || arg == "--fit-ctx") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
for (const auto & v : p) {
params.fit_params_min_ctx.push_back(std::stoul(v));
}
} else { } else {
invalid_param = true; invalid_param = true;
break; break;
@@ -1078,6 +1102,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.poll.empty()) { if (params.poll.empty()) {
params.poll = cmd_params_defaults.poll; params.poll = cmd_params_defaults.poll;
} }
if (params.fit_params_target.empty()) {
params.fit_params_target = cmd_params_defaults.fit_params_target;
}
if (params.fit_params_min_ctx.empty()) {
params.fit_params_min_ctx = cmd_params_defaults.fit_params_min_ctx;
}
return params; return params;
} }
@@ -1109,6 +1139,8 @@ struct cmd_params_instance {
bool embeddings; bool embeddings;
bool no_op_offload; bool no_op_offload;
bool no_host; bool no_host;
size_t fit_target;
uint32_t fit_min_ctx;
llama_model_params to_llama_mparams() const { llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params(); llama_model_params mparams = llama_model_default_params();
@@ -1197,6 +1229,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
// this ordering minimizes the number of times that each model needs to be reloaded // this ordering minimizes the number of times that each model needs to be reloaded
// clang-format off // clang-format off
for (const auto & m : params.model) for (const auto & m : params.model)
for (const auto & fpt : params.fit_params_target)
for (const auto & fpc : params.fit_params_min_ctx)
for (const auto & nl : params.n_gpu_layers) for (const auto & nl : params.n_gpu_layers)
for (const auto & ncmoe : params.n_cpu_moe) for (const auto & ncmoe : params.n_cpu_moe)
for (const auto & sm : params.split_mode) for (const auto & sm : params.split_mode)
@@ -1251,6 +1285,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .embeddings = */ embd, /* .embeddings = */ embd,
/* .no_op_offload= */ nopo, /* .no_op_offload= */ nopo,
/* .no_host = */ noh, /* .no_host = */ noh,
/* .fit_target = */ fpt,
/* .fit_min_ctx = */ fpc,
}; };
instances.push_back(instance); instances.push_back(instance);
} }
@@ -1286,6 +1322,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .embeddings = */ embd, /* .embeddings = */ embd,
/* .no_op_offload= */ nopo, /* .no_op_offload= */ nopo,
/* .no_host = */ noh, /* .no_host = */ noh,
/* .fit_target = */ fpt,
/* .fit_min_ctx = */ fpc,
}; };
instances.push_back(instance); instances.push_back(instance);
} }
@@ -1321,6 +1359,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .embeddings = */ embd, /* .embeddings = */ embd,
/* .no_op_offload= */ nopo, /* .no_op_offload= */ nopo,
/* .no_host = */ noh, /* .no_host = */ noh,
/* .fit_target = */ fpt,
/* .fit_min_ctx = */ fpc,
}; };
instances.push_back(instance); instances.push_back(instance);
} }
@@ -1361,6 +1401,8 @@ struct test {
bool embeddings; bool embeddings;
bool no_op_offload; bool no_op_offload;
bool no_host; bool no_host;
size_t fit_target;
uint32_t fit_min_ctx;
int n_prompt; int n_prompt;
int n_gen; int n_gen;
int n_depth; int n_depth;
@@ -1399,6 +1441,8 @@ struct test {
embeddings = inst.embeddings; embeddings = inst.embeddings;
no_op_offload = inst.no_op_offload; no_op_offload = inst.no_op_offload;
no_host = inst.no_host; no_host = inst.no_host;
fit_target = inst.fit_target;
fit_min_ctx = inst.fit_min_ctx;
n_prompt = inst.n_prompt; n_prompt = inst.n_prompt;
n_gen = inst.n_gen; n_gen = inst.n_gen;
n_depth = inst.n_depth; n_depth = inst.n_depth;
@@ -1456,7 +1500,8 @@ struct test {
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split", "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
"tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings", "tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings",
"no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth", "no_op_offload", "no_host", "fit_target", "fit_min_ctx",
"n_prompt", "n_gen", "n_depth",
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
}; };
return fields; return fields;
@@ -1468,7 +1513,8 @@ struct test {
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" ||
field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") { field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe" ||
field == "fit_target" || field == "fit_min_ctx") {
return INT; return INT;
} }
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1549,6 +1595,8 @@ struct test {
std::to_string(embeddings), std::to_string(embeddings),
std::to_string(no_op_offload), std::to_string(no_op_offload),
std::to_string(no_host), std::to_string(no_host),
std::to_string(fit_target),
std::to_string(fit_min_ctx),
std::to_string(n_prompt), std::to_string(n_prompt),
std::to_string(n_gen), std::to_string(n_gen),
std::to_string(n_depth), std::to_string(n_depth),
@@ -1792,6 +1840,12 @@ struct markdown_printer : public printer {
if (field == "tensor_buft_overrides") { if (field == "tensor_buft_overrides") {
return "ot"; return "ot";
} }
if (field == "fit_target") {
return "fitt";
}
if (field == "fit_min_ctx") {
return "fitc";
}
return field; return field;
} }
@@ -1870,6 +1924,12 @@ struct markdown_printer : public printer {
if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) { if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) {
fields.emplace_back("no_host"); fields.emplace_back("no_host");
} }
if (params.fit_params_target.size() > 1 || params.fit_params_target != cmd_params_defaults.fit_params_target) {
fields.emplace_back("fit_target");
}
if (params.fit_params_min_ctx.size() > 1 || params.fit_params_min_ctx != cmd_params_defaults.fit_params_min_ctx) {
fields.emplace_back("fit_min_ctx");
}
fields.emplace_back("test"); fields.emplace_back("test");
fields.emplace_back("t/s"); fields.emplace_back("t/s");
@@ -2141,13 +2201,49 @@ int main(int argc, char ** argv) {
if (params.progress) { if (params.progress) {
fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count); fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
} }
auto mparams = inst.to_llama_mparams();
auto cparams = inst.to_llama_cparams();
bool do_fit = inst.fit_target != cmd_params_defaults.fit_params_target[0] ||
inst.fit_min_ctx != cmd_params_defaults.fit_params_min_ctx[0];
std::vector<float> fit_tensor_split(llama_max_devices(), 0.0f);
std::vector<llama_model_tensor_buft_override> fit_overrides(llama_max_tensor_buft_overrides(), {nullptr, nullptr});
if (do_fit) {
// free the previous model so fit sees full free VRAM
if (lmodel) {
llama_model_free(lmodel);
lmodel = nullptr;
prev_inst = nullptr;
}
// use default n_gpu_layers and n_ctx so llama_params_fit can adjust them
mparams.n_gpu_layers = llama_model_default_params().n_gpu_layers;
mparams.tensor_split = fit_tensor_split.data();
mparams.tensor_buft_overrides = fit_overrides.data();
cparams.n_ctx = 0;
std::vector<size_t> margins(llama_max_devices(), inst.fit_target * 1024 * 1024);
uint32_t n_ctx_needed = inst.n_prompt + inst.n_gen + inst.n_depth;
cparams.n_ctx = std::max(cparams.n_ctx, n_ctx_needed);
llama_params_fit(inst.model.c_str(), &mparams, &cparams,
fit_tensor_split.data(),
fit_overrides.data(),
margins.data(),
inst.fit_min_ctx,
params.verbose ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
}
// keep the same model between tests when possible // keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
if (lmodel) { if (lmodel) {
llama_model_free(lmodel); llama_model_free(lmodel);
} }
lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams()); lmodel = llama_model_load_from_file(inst.model.c_str(), mparams);
if (lmodel == NULL) { if (lmodel == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
return 1; return 1;
@@ -2155,7 +2251,7 @@ int main(int argc, char ** argv) {
prev_inst = &inst; prev_inst = &inst;
} }
llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams()); llama_context * ctx = llama_init_from_model(lmodel, cparams);
if (ctx == NULL) { if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
llama_model_free(lmodel); llama_model_free(lmodel);