diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index c837bb6d26..bd6d11a19e 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -20,48 +20,59 @@ Performance testing tool for llama.cpp. ## Syntax ``` -usage: llama-bench [options] +usage: build/bin/llama-bench [options] options: -h, --help - --numa numa mode (default: disabled) - -r, --repetitions number of times to repeat each test (default: 5) - --prio <0|1|2|3> process/thread priority (default: 0) - --delay <0...N> (seconds) delay between each test (default: 0) - -o, --output output format printed to stdout (default: md) - -oe, --output-err output format printed to stderr (default: none) - --list-devices list available devices and exit - -v, --verbose verbose output - --progress print test progress indicators - -rpc, --rpc register RPC devices (comma separated) + --numa numa mode (default: disabled) + -r, --repetitions number of times to repeat each test (default: 5) + --prio <-1|0|1|2|3> process/thread priority (default: 0) + --delay <0...N> (seconds) delay between each test (default: 0) + -o, --output output format printed to stdout (default: md) + -oe, --output-err output format printed to stderr (default: none) + --list-devices list available devices and exit + -v, --verbose verbose output + --progress print test progress indicators + --no-warmup skip warmup runs before benchmarking test parameters: - -m, --model (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt (default: 512) - -n, --n-gen (default: 128) - -pg (default: ) - -d, --n-depth (default: 0) - -b, --batch-size (default: 2048) - -ub, --ubatch-size (default: 512) - -ctk, --cache-type-k (default: f16) - -ctv, --cache-type-v (default: f16) - -t, --threads (default: system dependent) - -C, --cpu-mask (default: 0x0) - --cpu-strict <0|1> (default: 0) - --poll <0...100> (default: 50) - -ngl, --n-gpu-layers (default: 99) - -ncmoe, --n-cpu-moe (default: 0) - -sm, --split-mode (default: layer) - -mg, --main-gpu (default: 0) - -nkvo, --no-kv-offload <0|1> (default: 0) - -fa, --flash-attn <0|1> (default: 0) - -dev, --device (default: auto) - -mmp, --mmap <0|1> (default: 1) - -embd, --embeddings <0|1> (default: 0) - -ts, --tensor-split (default: 0) - -ot --override-tensors =;... - (default: disabled) - -nopo, --no-op-offload <0|1> (default: 0) + -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -hf, -hfr, --hf-repo /[:quant] Hugging Face model repository; quant is optional, case-insensitive + default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist. + example: unsloth/phi-4-GGUF:Q4_K_M + (default: unused) + -hff, --hf-file Hugging Face model file. If specified, it will override the quant in --hf-repo + (default: unused) + -hft, --hf-token Hugging Face access token + (default: value from HF_TOKEN environment variable) + -p, --n-prompt (default: 512) + -n, --n-gen (default: 128) + -pg (default: ) + -d, --n-depth (default: 0) + -b, --batch-size (default: 2048) + -ub, --ubatch-size (default: 512) + -ctk, --cache-type-k (default: f16) + -ctv, --cache-type-v (default: f16) + -t, --threads (default: 8) + -C, --cpu-mask (default: 0x0) + --cpu-strict <0|1> (default: 0) + --poll <0...100> (default: 50) + -ngl, --n-gpu-layers (default: 99) + -ncmoe, --n-cpu-moe (default: 0) + -sm, --split-mode (default: layer) + -mg, --main-gpu (default: 0) + -nkvo, --no-kv-offload <0|1> (default: 0) + -fa, --flash-attn <0|1> (default: 0) + -dev, --device (default: auto) + -mmp, --mmap <0|1> (DEPRECATED) + -dio, --direct-io <0|1> (DEPRECATED) + -lm, --load-mode (default: mmap) + -embd, --embeddings <0|1> (default: 0) + -ts, --tensor-split (default: 0) + -ot --override-tensor =;... + (default: disabled) + -nopo, --no-op-offload <0|1> (default: 0) + --no-host <0|1> (default: 0) Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Ranges can be given as