From 16639ba2178fc1ccbae912d678d8b515dc4dc322 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Sun, 25 Jan 2026 07:31:42 +0100 Subject: [PATCH] common : use two decimal places for float arg help messages (#19048) * common : use two decimal places for float arg help messages This commit updates the help messages for various command-line arguments in arg.cpp to display floating-point default values with two decimal places instead of one. The motivation for this changes is that currently only having one decimal place means that values generated using --help or llama-gen-docs will not display the correct values. For example, currently the value of top-p in tools/server/README.md is `0.9`, but the default value is actually '0.95'. And running llama-gen-docs does not update this value as it uses the output from the help message, which shows only one decimal place, so the values look like they are unchanged. * docs : run llama-gen-docs to update docs --- common/arg.cpp | 42 +++++++++++++++---------------- tools/cli/README.md | 48 +++++++++++++++++------------------ tools/completion/README.md | 44 ++++++++++++++++---------------- tools/server/README.md | 51 +++++++++++++++++++------------------- 4 files changed, 94 insertions(+), 91 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 98477e8117..04fd375d56 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1577,7 +1577,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", - string_format("temperature (default: %.1f)", (double)params.sampling.temp), + string_format("temperature (default: %.2f)", (double)params.sampling.temp), [](common_params & params, const std::string & value) { params.sampling.temp = std::stof(value); params.sampling.temp = std::max(params.sampling.temp, 0.0f); @@ -1594,7 +1594,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam().set_env("LLAMA_ARG_TOP_K")); add_opt(common_arg( {"--top-p"}, "N", - string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), + string_format("top-p sampling (default: %.2f, 1.0 = disabled)", (double)params.sampling.top_p), [](common_params & params, const std::string & value) { params.sampling.top_p = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_P; @@ -1602,7 +1602,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--min-p"}, "N", - string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p), + string_format("min-p sampling (default: %.2f, 0.0 = disabled)", (double)params.sampling.min_p), [](common_params & params, const std::string & value) { params.sampling.min_p = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIN_P; @@ -1610,14 +1610,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--top-nsigma"}, "N", - string_format("top-n-sigma sampling (default: %.1f, -1.0 = disabled)", params.sampling.top_n_sigma), + string_format("top-n-sigma sampling (default: %.2f, -1.0 = disabled)", params.sampling.top_n_sigma), [](common_params & params, const std::string & value) { params.sampling.top_n_sigma = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-probability"}, "N", - string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), + string_format("xtc probability (default: %.2f, 0.0 = disabled)", (double)params.sampling.xtc_probability), [](common_params & params, const std::string & value) { params.sampling.xtc_probability = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_PROBABILITY; @@ -1625,7 +1625,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--xtc-threshold"}, "N", - string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), + string_format("xtc threshold (default: %.2f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), [](common_params & params, const std::string & value) { params.sampling.xtc_threshold = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_XTC_THRESHOLD; @@ -1633,7 +1633,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", - string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p), + string_format("locally typical sampling, parameter p (default: %.2f, 1.0 = disabled)", (double)params.sampling.typ_p), [](common_params & params, const std::string & value) { params.sampling.typ_p = std::stof(value); } @@ -1652,7 +1652,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--repeat-penalty"}, "N", - string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), + string_format("penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), [](common_params & params, const std::string & value) { params.sampling.penalty_repeat = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_PENALTY_REPEAT; @@ -1660,21 +1660,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--presence-penalty"}, "N", - string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present), + string_format("repeat alpha presence penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_present), [](common_params & params, const std::string & value) { params.sampling.penalty_present = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--frequency-penalty"}, "N", - string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq), + string_format("repeat alpha frequency penalty (default: %.2f, 0.0 = disabled)", (double)params.sampling.penalty_freq), [](common_params & params, const std::string & value) { params.sampling.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-multiplier"}, "N", - string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), + string_format("set DRY sampling multiplier (default: %.2f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), [](common_params & params, const std::string & value) { params.sampling.dry_multiplier = std::stof(value); } @@ -1755,14 +1755,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", - string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), + string_format("dynamic temperature range (default: %.2f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), [](common_params & params, const std::string & value) { params.sampling.dynatemp_range = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-exp"}, "N", - string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent), + string_format("dynamic temperature exponent (default: %.2f)", (double)params.sampling.dynatemp_exponent), [](common_params & params, const std::string & value) { params.sampling.dynatemp_exponent = std::stof(value); } @@ -1778,7 +1778,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--mirostat-lr"}, "N", - string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta), + string_format("Mirostat learning rate, parameter eta (default: %.2f)", (double)params.sampling.mirostat_eta), [](common_params & params, const std::string & value) { params.sampling.mirostat_eta = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA; @@ -1786,7 +1786,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_sparam()); add_opt(common_arg( {"--mirostat-ent"}, "N", - string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau), + string_format("Mirostat target entropy, parameter tau (default: %.2f)", (double)params.sampling.mirostat_tau), [](common_params & params, const std::string & value) { params.sampling.mirostat_tau = std::stof(value); params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_TAU; @@ -1920,28 +1920,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); add_opt(common_arg( {"--yarn-ext-factor"}, "N", - string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), + string_format("YaRN: extrapolation mix factor (default: %.2f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), [](common_params & params, const std::string & value) { params.yarn_ext_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); add_opt(common_arg( {"--yarn-attn-factor"}, "N", - string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), + string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.2f)", (double)params.yarn_attn_factor), [](common_params & params, const std::string & value) { params.yarn_attn_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); add_opt(common_arg( {"--yarn-beta-slow"}, "N", - string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), + string_format("YaRN: high correction dim or alpha (default: %.2f)", (double)params.yarn_beta_slow), [](common_params & params, const std::string & value) { params.yarn_beta_slow = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); add_opt(common_arg( {"--yarn-beta-fast"}, "N", - string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + string_format("YaRN: low correction dim or beta (default: %.2f)", (double)params.yarn_beta_fast), [](common_params & params, const std::string & value) { params.yarn_beta_fast = std::stof(value); } @@ -3335,14 +3335,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN")); add_opt(common_arg( {"--draft-p-split"}, "P", - string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), + string_format("speculative decoding split probability (default: %.2f)", (double)params.speculative.p_split), [](common_params & params, const std::string & value) { params.speculative.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); add_opt(common_arg( {"--draft-p-min"}, "P", - string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), + string_format("minimum speculative decoding probability (greedy) (default: %.2f)", (double)params.speculative.p_min), [](common_params & params, const std::string & value) { params.speculative.p_min = std::stof(value); } diff --git a/tools/cli/README.md b/tools/cli/README.md index 3b6f0708ed..4a15cbad9d 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -45,10 +45,10 @@ | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | | `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | | `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | -| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | -| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | -| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | -| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | | `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | @@ -109,30 +109,30 @@ | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.8) | +| `--temp N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | -| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | -| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | -| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) | -| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) | -| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) | -| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) | +| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | +| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | +| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | -| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | -| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | -| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | -| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) | +| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) | | `--dry-base N` | set DRY sampling base value (default: 1.75) | | `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | | `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | | `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers | -| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | -| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) | +| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.
(valid range 0.0 to 0.99) (default: 0.90) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) | | `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | -| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) | -| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | | `--grammar-file FNAME` | file to read grammar from | @@ -173,12 +173,12 @@ | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_DRAFT_MIN) | -| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)
(env: LLAMA_ARG_DRAFT_P_MIN) | +| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_DRAFT_P_MIN) | | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE_DRAFT) | | `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | diff --git a/tools/completion/README.md b/tools/completion/README.md index a16be3f684..3ca3e68454 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -128,10 +128,10 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | | `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | | `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | -| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | -| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | -| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | -| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | | `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | @@ -192,28 +192,30 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.8) | +| `--temp N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | -| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | -| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) | -| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) | -| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) | +| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | +| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | +| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | -| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | -| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | -| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | -| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) | +| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) | | `--dry-base N` | set DRY sampling base value (default: 1.75) | | `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | | `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | | `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers | -| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | -| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) | +| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.
(valid range 0.0 to 0.99) (default: 0.90) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) | | `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | -| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) | -| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | | `--grammar-file FNAME` | file to read grammar from | @@ -251,8 +253,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | diff --git a/tools/server/README.md b/tools/server/README.md index f113f9cb75..d132830171 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -63,10 +63,10 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | | `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | | `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | -| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | -| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | -| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | -| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.00, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.00)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.00)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | | `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | @@ -126,30 +126,30 @@ For the ful list of features, please refer to [server's changelog](https://githu | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.8) | +| `--temp N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | -| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | -| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | -| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) | -| `--adaptive-decay N` | adaptive-p: EMA decay for adaptation; effective history length ≈ 1/(1-decay) tokens (valid range 0.0 - 0.99) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) | -| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) | -| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) | +| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | +| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | +| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | +| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | +| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | -| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) | -| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) | -| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) | -| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) | +| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | +| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | +| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.00, 0.0 = disabled) | +| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.00, 0.0 = disabled) | | `--dry-base N` | set DRY sampling base value (default: 1.75) | | `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) | | `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) | | `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers | -| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) | -| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) | +| `--adaptive-target N` | adaptive-p: select tokens near this probability (valid range 0.0 to 1.0; negative = disabled) (default: -1.00)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/17927) | +| `--adaptive-decay N` | adaptive-p: decay rate for target adaptation over time. lower values are more reactive, higher values are more stable.
(valid range 0.0 to 0.99) (default: 0.90) | +| `--dynatemp-range N` | dynamic temperature range (default: 0.00, 0.0 = disabled) | +| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.00) | | `--mirostat N` | use Mirostat sampling.
Top K, Nucleus and Locally Typical samplers are ignored if used.
(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) | -| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.1) | -| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.0) | +| `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | +| `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | | `--grammar-file FNAME` | file to read grammar from | @@ -199,7 +199,8 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | | `-to, --timeout N` | server read/write timeout in seconds (default: 600)
(env: LLAMA_ARG_TIMEOUT) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | -| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
[(card)](https://ggml.ai/f0.png)
(env: LLAMA_ARG_CACHE_REUSE) | +| `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)
(env: LLAMA_ARG_CACHE_PROMPT) | +| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)
[(card)](https://ggml.ai/f0.png)
(env: LLAMA_ARG_CACHE_REUSE) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | | `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | @@ -212,8 +213,8 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | @@ -222,7 +223,7 @@ For the ful list of features, please refer to [server's changelog](https://githu | `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_DRAFT_MIN) | -| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)
(env: LLAMA_ARG_DRAFT_P_MIN) | +| `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_DRAFT_P_MIN) | | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE_DRAFT) | | `-devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |