docs: update llama-bench docs
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
This commit is contained in:
parent
0935e842b0
commit
d7fcab8cde
|
|
@@ -20,48 +20,59 @@ Performance testing tool for llama.cpp.
|
||||||
## Syntax
|
## Syntax
|
||||||
|
|
||||||
```
|
```
|
||||||
usage: llama-bench [options]
|
usage: build/bin/llama-bench [options]
|
||||||
|
|
||||||
options:
|
options:
|
||||||
-h, --help
|
-h, --help
|
||||||
--numa <distribute|isolate|numactl> numa mode (default: disabled)
|
--numa <distribute|isolate|numactl> numa mode (default: disabled)
|
||||||
-r, --repetitions <n> number of times to repeat each test (default: 5)
|
-r, --repetitions <n> number of times to repeat each test (default: 5)
|
||||||
--prio <0|1|2|3> process/thread priority (default: 0)
|
--prio <-1|0|1|2|3> process/thread priority (default: 0)
|
||||||
--delay <0...N> (seconds) delay between each test (default: 0)
|
--delay <0...N> (seconds) delay between each test (default: 0)
|
||||||
-o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: md)
|
-o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: md)
|
||||||
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
|
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
|
||||||
--list-devices list available devices and exit
|
--list-devices list available devices and exit
|
||||||
-v, --verbose verbose output
|
-v, --verbose verbose output
|
||||||
--progress print test progress indicators
|
--progress print test progress indicators
|
||||||
-rpc, --rpc <rpc_servers> register RPC devices (comma separated)
|
--no-warmup skip warmup runs before benchmarking
|
||||||
|
|
||||||
test parameters:
|
test parameters:
|
||||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||||
-p, --n-prompt <n> (default: 512)
|
-hf, -hfr, --hf-repo <user>/<model>[:quant] Hugging Face model repository; quant is optional, case-insensitive
|
||||||
-n, --n-gen <n> (default: 128)
|
defaults to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
|
||||||
-pg <pp,tg> (default: )
|
example: unsloth/phi-4-GGUF:Q4_K_M
|
||||||
-d, --n-depth <n> (default: 0)
|
(default: unused)
|
||||||
-b, --batch-size <n> (default: 2048)
|
-hff, --hf-file <file> Hugging Face model file. If specified, it will override the quant in --hf-repo
|
||||||
-ub, --ubatch-size <n> (default: 512)
|
(default: unused)
|
||||||
-ctk, --cache-type-k <t> (default: f16)
|
-hft, --hf-token <token> Hugging Face access token
|
||||||
-ctv, --cache-type-v <t> (default: f16)
|
(default: value from HF_TOKEN environment variable)
|
||||||
-t, --threads <n> (default: system dependent)
|
-p, --n-prompt <n> (default: 512)
|
||||||
-C, --cpu-mask <hex,hex> (default: 0x0)
|
-n, --n-gen <n> (default: 128)
|
||||||
--cpu-strict <0|1> (default: 0)
|
-pg <pp,tg> (default: )
|
||||||
--poll <0...100> (default: 50)
|
-d, --n-depth <n> (default: 0)
|
||||||
-ngl, --n-gpu-layers <n> (default: 99)
|
-b, --batch-size <n> (default: 2048)
|
||||||
-ncmoe, --n-cpu-moe <n> (default: 0)
|
-ub, --ubatch-size <n> (default: 512)
|
||||||
-sm, --split-mode <none|layer|row> (default: layer)
|
-ctk, --cache-type-k <t> (default: f16)
|
||||||
-mg, --main-gpu <i> (default: 0)
|
-ctv, --cache-type-v <t> (default: f16)
|
||||||
-nkvo, --no-kv-offload <0|1> (default: 0)
|
-t, --threads <n> (default: 8)
|
||||||
-fa, --flash-attn <0|1> (default: 0)
|
-C, --cpu-mask <hex,hex> (default: 0x0)
|
||||||
-dev, --device <dev0/dev1/...> (default: auto)
|
--cpu-strict <0|1> (default: 0)
|
||||||
-mmp, --mmap <0|1> (default: 1)
|
--poll <0...100> (default: 50)
|
||||||
-embd, --embeddings <0|1> (default: 0)
|
-ngl, --n-gpu-layers <n> (default: 99)
|
||||||
-ts, --tensor-split <ts0/ts1/..> (default: 0)
|
-ncmoe, --n-cpu-moe <n> (default: 0)
|
||||||
-ot --override-tensors <tensor name pattern>=<buffer type>;...
|
-sm, --split-mode <none|layer|row> (default: layer)
|
||||||
(default: disabled)
|
-mg, --main-gpu <i> (default: 0)
|
||||||
-nopo, --no-op-offload <0|1> (default: 0)
|
-nkvo, --no-kv-offload <0|1> (default: 0)
|
||||||
|
-fa, --flash-attn <0|1> (default: 0)
|
||||||
|
-dev, --device <dev0/dev1/...> (default: auto)
|
||||||
|
-mmp, --mmap <0|1> (DEPRECATED)
|
||||||
|
-dio, --direct-io <0|1> (DEPRECATED)
|
||||||
|
-lm, --load-mode <none|mlock|mmap|dio> (default: mmap)
|
||||||
|
-embd, --embeddings <0|1> (default: 0)
|
||||||
|
-ts, --tensor-split <ts0/ts1/..> (default: 0)
|
||||||
|
-ot, --override-tensor <tensor name pattern>=<buffer type>;...
|
||||||
|
(default: disabled)
|
||||||
|
-nopo, --no-op-offload <0|1> (default: 0)
|
||||||
|
--no-host <0|1> (default: 0)
|
||||||
|
|
||||||
Multiple values can be given for each parameter by separating them with ','
|
Multiple values can be given for each parameter by separating them with ','
|
||||||
or by specifying the parameter multiple times. Ranges can be given as
|
or by specifying the parameter multiple times. Ranges can be given as
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue