From 94ca829b6001019622c0f67fcd48e9ec6bd7dce8 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 6 Apr 2026 22:26:02 +0800 Subject: [PATCH] llama-bench: add `-fitc` and `-fitt` to arguments (#21304) * llama-bench: add `-fitc` and `-fitt` to arguments * update README.md * address review comments * update compare-llama-bench.py --- scripts/compare-llama-bench.py | 7 +- tools/llama-bench/README.md | 2 + tools/llama-bench/llama-bench.cpp | 104 ++++++++++++++++++++++++++++-- 3 files changed, 107 insertions(+), 6 deletions(-) diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index f43d24ebf1..5a6cc7dbb1 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -29,7 +29,8 @@ LLAMA_BENCH_DB_FIELDS = [ "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", - "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", "n_cpu_moe" + "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", "n_cpu_moe", + "fit_target", "fit_min_ctx" ] LLAMA_BENCH_DB_TYPES = [ @@ -39,6 +40,7 @@ LLAMA_BENCH_DB_TYPES = [ "TEXT", "INTEGER", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "TEXT", "INTEGER", "INTEGER", "REAL", "REAL", "INTEGER", + "INTEGER", "INTEGER" ] # All test-backend-ops SQL fields @@ -61,7 +63,8 @@ assert len(TEST_BACKEND_OPS_DB_FIELDS) == len(TEST_BACKEND_OPS_DB_TYPES) LLAMA_BENCH_KEY_PROPERTIES = [ "cpu_info", "gpu_info", "backends", "n_gpu_layers", "n_cpu_moe", "tensor_buft_overrides", "model_filename", "model_type", "n_batch", "n_ubatch", "embeddings", "cpu_mask", "cpu_strict", "poll", "n_threads", "type_k", "type_v", - "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth" + "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen", "n_depth", + "fit_target", "fit_min_ctx" ] # Properties by which to differentiate results per commit for test-backend-ops: diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index c837bb6d26..70355920b8 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -62,6 +62,8 @@ test parameters: -ot --override-tensors =;... (default: disabled) -nopo, --no-op-offload <0|1> (default: 0) + -fitt, --fit-target fit model to device memory with this margin per device in MiB (default: off) + -fitc, --fit-ctx minimum ctx size for --fit-target (default: 4096) Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Ranges can be given as diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 0a23f69853..0b395b460e 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -342,6 +342,8 @@ struct cmd_params { std::vector embeddings; std::vector no_op_offload; std::vector no_host; + std::vector fit_params_target; + std::vector fit_params_min_ctx; ggml_numa_strategy numa; int reps; ggml_sched_priority prio; @@ -384,6 +386,8 @@ static const cmd_params cmd_params_defaults = { /* embeddings */ { false }, /* no_op_offload */ { false }, /* no_host */ { false }, + /* fit_params_target */ { 0 }, + /* fit_params_min_ctx */ { 0 }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* prio */ GGML_SCHED_PRIO_NORMAL, @@ -410,6 +414,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -v, --verbose verbose output\n"); printf(" --progress print test progress indicators\n"); printf(" --no-warmup skip warmup runs before benchmarking\n"); + printf(" -fitt, --fit-target fit model to device memory with this margin per device in MiB (default: off)\n"); + printf(" -fitc, --fit-ctx minimum ctx size for --fit-target (default: 4096)\n"); if (llama_supports_rpc()) { printf(" -rpc, --rpc register RPC devices (comma separated)\n"); } @@ -958,6 +964,24 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.progress = true; } else if (arg == "--no-warmup") { params.no_warmup = true; + } else if (arg == "-fitt" || arg == "--fit-target") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + for (const auto & v : p) { + params.fit_params_target.push_back(std::stoull(v)); + } + } else if (arg == "-fitc" || arg == "--fit-ctx") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + for (const auto & v : p) { + params.fit_params_min_ctx.push_back(std::stoul(v)); + } } else { invalid_param = true; break; @@ -1078,6 +1102,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; } + if (params.fit_params_target.empty()) { + params.fit_params_target = cmd_params_defaults.fit_params_target; + } + if (params.fit_params_min_ctx.empty()) { + params.fit_params_min_ctx = cmd_params_defaults.fit_params_min_ctx; + } return params; } @@ -1109,6 +1139,8 @@ struct cmd_params_instance { bool embeddings; bool no_op_offload; bool no_host; + size_t fit_target; + uint32_t fit_min_ctx; llama_model_params to_llama_mparams() const { llama_model_params mparams = llama_model_default_params(); @@ -1197,6 +1229,8 @@ static std::vector get_cmd_params_instances(const cmd_param // this ordering minimizes the number of times that each model needs to be reloaded // clang-format off for (const auto & m : params.model) + for (const auto & fpt : params.fit_params_target) + for (const auto & fpc : params.fit_params_min_ctx) for (const auto & nl : params.n_gpu_layers) for (const auto & ncmoe : params.n_cpu_moe) for (const auto & sm : params.split_mode) @@ -1251,6 +1285,8 @@ static std::vector get_cmd_params_instances(const cmd_param /* .embeddings = */ embd, /* .no_op_offload= */ nopo, /* .no_host = */ noh, + /* .fit_target = */ fpt, + /* .fit_min_ctx = */ fpc, }; instances.push_back(instance); } @@ -1286,6 +1322,8 @@ static std::vector get_cmd_params_instances(const cmd_param /* .embeddings = */ embd, /* .no_op_offload= */ nopo, /* .no_host = */ noh, + /* .fit_target = */ fpt, + /* .fit_min_ctx = */ fpc, }; instances.push_back(instance); } @@ -1321,6 +1359,8 @@ static std::vector get_cmd_params_instances(const cmd_param /* .embeddings = */ embd, /* .no_op_offload= */ nopo, /* .no_host = */ noh, + /* .fit_target = */ fpt, + /* .fit_min_ctx = */ fpc, }; instances.push_back(instance); } @@ -1361,6 +1401,8 @@ struct test { bool embeddings; bool no_op_offload; bool no_host; + size_t fit_target; + uint32_t fit_min_ctx; int n_prompt; int n_gen; int n_depth; @@ -1399,6 +1441,8 @@ struct test { embeddings = inst.embeddings; no_op_offload = inst.no_op_offload; no_host = inst.no_host; + fit_target = inst.fit_target; + fit_min_ctx = inst.fit_min_ctx; n_prompt = inst.n_prompt; n_gen = inst.n_gen; n_depth = inst.n_depth; @@ -1456,7 +1500,8 @@ struct test { "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split", "tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings", - "no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth", + "no_op_offload", "no_host", "fit_target", "fit_min_ctx", + "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" }; return fields; @@ -1468,7 +1513,8 @@ struct test { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || - field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") { + field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe" || + field == "fit_target" || field == "fit_min_ctx") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || @@ -1549,6 +1595,8 @@ struct test { std::to_string(embeddings), std::to_string(no_op_offload), std::to_string(no_host), + std::to_string(fit_target), + std::to_string(fit_min_ctx), std::to_string(n_prompt), std::to_string(n_gen), std::to_string(n_depth), @@ -1792,6 +1840,12 @@ struct markdown_printer : public printer { if (field == "tensor_buft_overrides") { return "ot"; } + if (field == "fit_target") { + return "fitt"; + } + if (field == "fit_min_ctx") { + return "fitc"; + } return field; } @@ -1870,6 +1924,12 @@ struct markdown_printer : public printer { if (params.no_host.size() > 1 || params.no_host != cmd_params_defaults.no_host) { fields.emplace_back("no_host"); } + if (params.fit_params_target.size() > 1 || params.fit_params_target != cmd_params_defaults.fit_params_target) { + fields.emplace_back("fit_target"); + } + if (params.fit_params_min_ctx.size() > 1 || params.fit_params_min_ctx != cmd_params_defaults.fit_params_min_ctx) { + fields.emplace_back("fit_min_ctx"); + } fields.emplace_back("test"); fields.emplace_back("t/s"); @@ -2141,13 +2201,49 @@ int main(int argc, char ** argv) { if (params.progress) { fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count); } + auto mparams = inst.to_llama_mparams(); + auto cparams = inst.to_llama_cparams(); + + bool do_fit = inst.fit_target != cmd_params_defaults.fit_params_target[0] || + inst.fit_min_ctx != cmd_params_defaults.fit_params_min_ctx[0]; + + std::vector fit_tensor_split(llama_max_devices(), 0.0f); + std::vector fit_overrides(llama_max_tensor_buft_overrides(), {nullptr, nullptr}); + + if (do_fit) { + // free the previous model so fit sees full free VRAM + if (lmodel) { + llama_model_free(lmodel); + lmodel = nullptr; + prev_inst = nullptr; + } + + // use default n_gpu_layers and n_ctx so llama_params_fit can adjust them + mparams.n_gpu_layers = llama_model_default_params().n_gpu_layers; + mparams.tensor_split = fit_tensor_split.data(); + mparams.tensor_buft_overrides = fit_overrides.data(); + cparams.n_ctx = 0; + + std::vector margins(llama_max_devices(), inst.fit_target * 1024 * 1024); + + uint32_t n_ctx_needed = inst.n_prompt + inst.n_gen + inst.n_depth; + cparams.n_ctx = std::max(cparams.n_ctx, n_ctx_needed); + + llama_params_fit(inst.model.c_str(), &mparams, &cparams, + fit_tensor_split.data(), + fit_overrides.data(), + margins.data(), + inst.fit_min_ctx, + params.verbose ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + } + // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (lmodel) { llama_model_free(lmodel); } - lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams()); + lmodel = llama_model_load_from_file(inst.model.c_str(), mparams); if (lmodel == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str()); return 1; @@ -2155,7 +2251,7 @@ int main(int argc, char ** argv) { prev_inst = &inst; } - llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams()); + llama_context * ctx = llama_init_from_model(lmodel, cparams); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str()); llama_model_free(lmodel);