From ae87863dc10f659a0094450655cccae96609dd33 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Mon, 9 Mar 2026 09:05:44 +0800 Subject: [PATCH] llama-bench: introduce `-hf` and `-hff` flags & use `--mmap 1` by default (#20211) --- tools/llama-bench/llama-bench.cpp | 180 +++++++++++++++++++----------- 1 file changed, 116 insertions(+), 64 deletions(-) diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 7a75026550..b0f1d6b936 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -20,6 +20,7 @@ #include #include "common.h" +#include "download.h" #include "ggml.h" #include "llama.h" @@ -312,6 +313,9 @@ static std::vector parse_int_range(const std::string & s) { struct cmd_params { std::vector model; + std::vector hf_repo; + std::vector hf_file; + std::string hf_token; std::vector n_prompt; std::vector n_gen; std::vector> n_pg; @@ -351,6 +355,9 @@ struct cmd_params { static const cmd_params cmd_params_defaults = { /* model */ { "models/7B/ggml-model-q4_0.gguf" }, + /* hf_repo */ {}, + /* hf_file */ {}, + /* hf_token */ "", /* n_prompt */ { 512 }, /* n_gen */ { 128 }, /* n_pg */ {}, @@ -372,7 +379,7 @@ static const cmd_params cmd_params_defaults = { /* devices */ { {} }, /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, /* tensor_buft_overrides*/ { std::vector{ { nullptr, nullptr } } }, - /* use_mmap */ { false }, + /* use_mmap */ { true }, /* use_direct_io */ { false }, /* embeddings */ { false }, /* no_op_offload */ { false }, @@ -393,74 +400,57 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf("options:\n"); printf(" -h, --help\n"); - printf(" --numa numa mode (default: disabled)\n"); - printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", - cmd_params_defaults.reps); - printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n", - cmd_params_defaults.prio); - printf(" --delay <0...N> (seconds) delay between each test 
(default: %d)\n", - cmd_params_defaults.delay); - printf(" -o, --output output format printed to stdout (default: %s)\n", - output_format_str(cmd_params_defaults.output_format)); - printf(" -oe, --output-err output format printed to stderr (default: %s)\n", - output_format_str(cmd_params_defaults.output_format_stderr)); - printf(" --list-devices list available devices and exit\n"); - printf(" -v, --verbose verbose output\n"); - printf(" --progress print test progress indicators\n"); - printf(" --no-warmup skip warmup runs before benchmarking\n"); + printf(" --numa numa mode (default: disabled)\n"); + printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", cmd_params_defaults.reps); + printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n", cmd_params_defaults.prio); + printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n", cmd_params_defaults.delay); + printf(" -o, --output output format printed to stdout (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err output format printed to stderr (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); + printf(" --list-devices list available devices and exit\n"); + printf(" -v, --verbose verbose output\n"); + printf(" --progress print test progress indicators\n"); + printf(" --no-warmup skip warmup runs before benchmarking\n"); if (llama_supports_rpc()) { - printf(" -rpc, --rpc register RPC devices (comma separated)\n"); + printf(" -rpc, --rpc register RPC devices (comma separated)\n"); } printf("\n"); printf("test parameters:\n"); - printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", - join(cmd_params_defaults.n_prompt, ",").c_str()); - printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -pg (default: %s)\n", - join(transform_to_str(cmd_params_defaults.n_pg, pair_str), 
",").c_str()); - printf(" -d, --n-depth (default: %s)\n", - join(cmd_params_defaults.n_depth, ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", - join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" -ub, --ubatch-size (default: %s)\n", - join(cmd_params_defaults.n_ubatch, ",").c_str()); - printf(" -ctk, --cache-type-k (default: %s)\n", - join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); - printf(" -ctv, --cache-type-v (default: %s)\n", - join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -t, --threads (default: %s)\n", - join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -C, --cpu-mask (default: %s)\n", - join(cmd_params_defaults.cpu_mask, ",").c_str()); - printf(" --cpu-strict <0|1> (default: %s)\n", - join(cmd_params_defaults.cpu_strict, ",").c_str()); - printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", - join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - printf(" -ncmoe, --n-cpu-moe (default: %s)\n", - join(cmd_params_defaults.n_cpu_moe, ",").c_str()); - printf(" -sm, --split-mode (default: %s)\n", - join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", - join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", - join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -fa, --flash-attn <0|1> (default: %s)\n", - join(cmd_params_defaults.flash_attn, ",").c_str()); - printf(" -dev, --device (default: auto)\n"); - printf(" -mmp, --mmap <0|1> (default: %s)\n", - join(cmd_params_defaults.use_mmap, ",").c_str()); - printf(" -dio, --direct-io <0|1> (default: %s)\n", - join(cmd_params_defaults.use_direct_io, ",").c_str()); - printf(" -embd, --embeddings <0|1> (default: %s)\n", - join(cmd_params_defaults.embeddings, ",").c_str()); - printf(" 
-ts, --tensor-split (default: 0)\n"); + printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); + printf(" -hf, -hfr, --hf-repo /[:quant] Hugging Face model repository; quant is optional, case-insensitive\n"); + printf(" default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"); + printf(" example: unsloth/phi-4-GGUF:Q4_K_M\n"); + printf(" (default: unused)\n"); + printf(" -hff, --hf-file Hugging Face model file. If specified, it will override the quant in --hf-repo\n"); + printf(" (default: unused)\n"); + printf(" -hft, --hf-token Hugging Face access token\n"); + printf(" (default: value from HF_TOKEN environment variable)\n"); + printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); + printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -d, --n-depth (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ub, --ubatch-size (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str()); + printf(" -ctk, --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -C, --cpu-mask (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str()); + printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str()); + printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", 
join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ncmoe, --n-cpu-moe (default: %s)\n", join(cmd_params_defaults.n_cpu_moe, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); + printf(" -dev, --device (default: auto)\n"); + printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str()); + printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); + printf(" -ts, --tensor-split (default: 0)\n"); printf(" -ot --override-tensor =;...\n"); - printf(" (default: disabled)\n"); - printf(" -nopo, --no-op-offload <0|1> (default: 0)\n"); - printf(" --no-host <0|1> (default: %s)\n", - join(cmd_params_defaults.no_host, ",").c_str()); + printf(" (default: disabled)\n"); + printf(" -nopo, --no-op-offload <0|1> (default: 0)\n"); + printf(" --no-host <0|1> (default: %s)\n", join(cmd_params_defaults.no_host, ",").c_str()); printf("\n"); printf( "Multiple values can be given for each parameter by separating them with ','\n" @@ -514,6 +504,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.progress = cmd_params_defaults.progress; params.no_warmup = cmd_params_defaults.no_warmup; + if (const char * env = getenv("HF_TOKEN")) { + params.hf_token = env; + } + for (int i = 1; i < argc; i++) { arg = argv[i]; if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { @@ -531,6 +525,26 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = 
string_split(argv[i], split_delim); params.model.insert(params.model.end(), p.begin(), p.end()); + } else if (arg == "-hf" || arg == "-hfr" || arg == "--hf-repo") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.hf_repo.insert(params.hf_repo.end(), p.begin(), p.end()); + } else if (arg == "-hff" || arg == "--hf-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.hf_file.insert(params.hf_file.end(), p.begin(), p.end()); + } else if (arg == "-hft" || arg == "--hf-token") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.hf_token = argv[i]; } else if (arg == "-p" || arg == "--n-prompt") { if (++i >= argc) { invalid_param = true; @@ -961,6 +975,46 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { exit(1); } + if (!params.hf_repo.empty()) { + for (size_t i = 0; i < params.hf_repo.size(); i++) { + common_params_model model; + + // step 1: no `-hff` provided for this repo (missing or empty entry), we auto-detect based on the `-hf` flag + if (i >= params.hf_file.size() || params.hf_file[i].empty()) { + auto auto_detected = common_get_hf_file(params.hf_repo[i], params.hf_token, false); + if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) { + exit(1); + } + + model.name = params.hf_repo[i]; + model.hf_repo = auto_detected.repo; + model.hf_file = auto_detected.ggufFile; + } else { + // explicit `-hff` overrides the quant: keep only the repo part of `-hf` so the cache name and URL below contain the repository + model.hf_repo = params.hf_repo[i].substr(0, params.hf_repo[i].find(':')); + model.hf_file = params.hf_file[i]; + } + + // step 2: construct the model cache path + std::string clean_fname = model.hf_repo + "_" + model.hf_file; + string_replace_all(clean_fname, "\\", "_"); + string_replace_all(clean_fname, "/", "_"); + model.path = fs_get_cache_file(clean_fname); + + // step 3: download the model if not exists + std::string model_endpoint = get_model_endpoint(); + model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file; + + bool ok = common_download_model(model, params.hf_token, false); + if (!ok) { +
fprintf(stderr, "error: failed to download model from %s\n", model.url.c_str()); + exit(1); + } + + params.model.push_back(model.path); + } + } + // set defaults if (params.model.empty()) { params.model = cmd_params_defaults.model;