common: support negated args (#17919)

* args: support negated args

* update docs

* fix typo

* add more neg options

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* rm duplicated arg

* fix LLAMA_ARG_NO_HOST

* add test

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Xuan-Son Nguyen 2025-12-12 23:58:53 +01:00 committed by GitHub
parent e39a2ce66d
commit 380b4c984e
6 changed files with 235 additions and 161 deletions
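
The gist of the change: a boolean option can now declare both its positive and negated spellings in a single `common_arg` entry, and both route to one `handler_bool` that receives `true` or `false` depending on which spelling was used. The standalone sketch below is illustrative only (not the actual llama.cpp sources; `--mmap`/`--no-mmap` is just an example pair) and shows the dispatch idea:

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct params_t { bool use_mmap = true; };

struct opt_t {
    std::vector<const char *> args;      // positive spellings, e.g. {"--mmap"}
    std::vector<const char *> args_neg;  // negated spellings, e.g. {"--no-mmap"}
    void (*handler_bool)(params_t &, bool) = nullptr;
};

int main(int argc, char ** argv) {
    params_t params;

    opt_t mmap_opt = {
        {"--mmap"},
        {"--no-mmap"},
        [](params_t & p, bool value) { p.use_mmap = value; },
    };

    // every spelling maps to the option plus a flag saying whether it is the positive form
    std::unordered_map<std::string, std::pair<opt_t *, bool>> arg_to_options;
    for (const auto & a : mmap_opt.args)     { arg_to_options[a] = {&mmap_opt, true};  }
    for (const auto & a : mmap_opt.args_neg) { arg_to_options[a] = {&mmap_opt, false}; }

    for (int i = 1; i < argc; i++) {
        auto it = arg_to_options.find(argv[i]);
        if (it == arg_to_options.end()) {
            fprintf(stderr, "invalid argument: %s\n", argv[i]);
            return 1;
        }
        // the same handler receives true for "--mmap" and false for "--no-mmap"
        it->second.first->handler_bool(params, it->second.second);
    }

    printf("use_mmap = %s\n", params.use_mmap ? "true" : "false");
    return 0;
}
```

The real parser additionally resolves `LLAMA_ARG_*` environment variables, including a legacy `LLAMA_ARG_NO_*` fallback, as the diffs below show.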


@ -105,6 +105,16 @@ bool common_arg::is_exclude(enum llama_example ex) {
bool common_arg::get_value_from_env(std::string & output) const { bool common_arg::get_value_from_env(std::string & output) const {
if (env == nullptr) return false; if (env == nullptr) return false;
if (!args_neg.empty()) {
// for compatibility, we need to check LLAMA_ARG_NO_ env as well
std::string neg_env = env;
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
char * neg_value = std::getenv(neg_env.c_str());
if (neg_value) {
output = "0"; // falsey
return true;
}
}
char * value = std::getenv(env); char * value = std::getenv(env);
if (value) { if (value) {
output = value; output = value;
@ -114,6 +124,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
} }
bool common_arg::has_value_from_env() const { bool common_arg::has_value_from_env() const {
if (env != nullptr && !args_neg.empty()) {
// for compatibility, we need to check LLAMA_ARG_NO_ env as well
std::string neg_env = env;
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
if (std::getenv(neg_env.c_str())) {
return true;
}
}
return env != nullptr && std::getenv(env); return env != nullptr && std::getenv(env);
} }
@ -151,9 +169,10 @@ std::string common_arg::to_string() const {
std::string leading_spaces(n_leading_spaces, ' '); std::string leading_spaces(n_leading_spaces, ' ');
std::ostringstream ss; std::ostringstream ss;
for (const auto arg : args) { auto all_args = get_args(); // also contains args_neg
if (arg == args.front()) { for (const auto & arg : all_args) {
if (args.size() == 1) { if (arg == all_args.front()) {
if (all_args.size() == 1) {
ss << arg; ss << arg;
} else { } else {
// first arg is usually abbreviation, we need padding to make it more beautiful // first arg is usually abbreviation, we need padding to make it more beautiful
@ -162,7 +181,7 @@ std::string common_arg::to_string() const {
ss << tmp << spaces; ss << tmp << spaces;
} }
} else { } else {
ss << arg << (arg != args.back() ? ", " : ""); ss << arg << (arg != all_args.back() ? ", " : "");
} }
} }
if (value_hint) ss << " " << value_hint; if (value_hint) ss << " " << value_hint;
@ -181,6 +200,31 @@ std::string common_arg::to_string() const {
return ss.str(); return ss.str();
} }
std::vector<std::string> common_arg::get_args() const {
std::vector<std::string> result;
for (const auto & arg : args) {
result.push_back(std::string(arg));
}
for (const auto & arg : args_neg) {
result.push_back(std::string(arg));
}
return result;
}
std::vector<std::string> common_arg::get_env() const {
std::vector<std::string> result;
if (env) {
result.push_back(std::string(env));
}
if (!args_neg.empty() && env) {
// for compatibility, we need to add LLAMA_ARG_NO_ variant
std::string neg_env = env;
string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
result.push_back(neg_env);
}
return result;
}
// //
// utils // utils
// //
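
`get_args()` and `get_env()` let downstream consumers (the help/docs generators, the preset loader, and the duplicate checks in the test) see the negated spellings and the compatibility `LLAMA_ARG_NO_*` variable alongside the positive ones. A rough standalone sketch of the name derivation (the helper name is mine; the real code uses `string_replace_all`, which replaces every occurrence):

```cpp
#include <cassert>
#include <string>

// Derive the legacy negated variable name from the positive one.
// For this prefix, a single replacement is equivalent to replacing all occurrences.
static std::string neg_env_name(std::string env) {
    const std::string from = "LLAMA_ARG_";
    const std::string to   = "LLAMA_ARG_NO_";
    const size_t pos = env.find(from);
    if (pos != std::string::npos) {
        env.replace(pos, from.size(), to);
    }
    return env;
}

int main() {
    assert(neg_env_name("LLAMA_ARG_MMAP")       == "LLAMA_ARG_NO_MMAP");
    assert(neg_env_name("LLAMA_ARG_KV_OFFLOAD") == "LLAMA_ARG_NO_KV_OFFLOAD");
    return 0;
}
```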
@ -316,6 +360,16 @@ static std::string get_all_kv_cache_types() {
return msg.str(); return msg.str();
} }
static bool parse_bool_value(const std::string & value) {
if (is_truthy(value)) {
return true;
} else if (is_falsey(value)) {
return false;
} else {
throw std::invalid_argument("invalid boolean value");
}
}
// //
// CLI argument parsing functions // CLI argument parsing functions
// //
@ -323,10 +377,13 @@ static std::string get_all_kv_cache_types() {
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
common_params & params = ctx_arg.params; common_params & params = ctx_arg.params;
std::unordered_map<std::string, common_arg *> arg_to_options; std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
for (auto & opt : ctx_arg.options) { for (auto & opt : ctx_arg.options) {
for (const auto & arg : opt.args) { for (const auto & arg : opt.args) {
arg_to_options[arg] = &opt; arg_to_options[arg] = {&opt, /* is_positive */ true};
}
for (const auto & arg : opt.args_neg) {
arg_to_options[arg] = {&opt, /* is_positive */ false};
} }
} }
@ -335,12 +392,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
std::string value; std::string value;
if (opt.get_value_from_env(value)) { if (opt.get_value_from_env(value)) {
try { try {
if (opt.handler_void && (value == "1" || value == "true")) { if (opt.handler_void && is_truthy(value)) {
opt.handler_void(params); opt.handler_void(params);
} }
if (opt.handler_int) { if (opt.handler_int) {
opt.handler_int(params, std::stoi(value)); opt.handler_int(params, std::stoi(value));
} }
if (opt.handler_bool) {
opt.handler_bool(params, parse_bool_value(value));
}
if (opt.handler_string) { if (opt.handler_string) {
opt.handler_string(params, value); opt.handler_string(params, value);
continue; continue;
@ -369,7 +429,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
if (arg_to_options.find(arg) == arg_to_options.end()) { if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
} }
auto opt = *arg_to_options[arg]; auto & tmp = arg_to_options[arg];
auto opt = *tmp.first;
bool is_positive = tmp.second;
if (opt.has_value_from_env()) { if (opt.has_value_from_env()) {
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
} }
@ -378,6 +440,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
opt.handler_void(params); opt.handler_void(params);
continue; continue;
} }
if (opt.handler_bool) {
opt.handler_bool(params, is_positive);
continue;
}
// arg with single value // arg with single value
check_arg(i); check_arg(i);
@ -402,7 +468,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument(string_format( throw std::invalid_argument(string_format(
"error while handling argument \"%s\": %s\n\n" "error while handling argument \"%s\": %s\n\n"
"usage:\n%s\n\nto show complete usage, run with -h", "usage:\n%s\n\nto show complete usage, run with -h",
arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); arg.c_str(), e.what(), opt.to_string().c_str()));
} }
} }
@ -750,11 +816,11 @@ static std::string list_builtin_chat_templates() {
} }
bool common_arg_utils::is_truthy(const std::string & value) { bool common_arg_utils::is_truthy(const std::string & value) {
return value == "on" || value == "enabled" || value == "1"; return value == "on" || value == "enabled" || value == "true" || value == "1";
} }
bool common_arg_utils::is_falsey(const std::string & value) { bool common_arg_utils::is_falsey(const std::string & value) {
return value == "off" || value == "disabled" || value == "0"; return value == "off" || value == "disabled" || value == "false" || value == "0";
} }
bool common_arg_utils::is_autoy(const std::string & value) { bool common_arg_utils::is_autoy(const std::string & value) {
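
Combined with the `parse_bool_value` helper added earlier in this file, a boolean value coming from a flag's environment variable is interpreted as follows (a standalone restatement of the helpers above, runnable on its own):

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

static bool is_truthy(const std::string & v) {
    return v == "on" || v == "enabled" || v == "true" || v == "1";
}
static bool is_falsey(const std::string & v) {
    return v == "off" || v == "disabled" || v == "false" || v == "0";
}
static bool parse_bool_value(const std::string & value) {
    if (is_truthy(value)) return true;
    if (is_falsey(value)) return false;
    throw std::invalid_argument("invalid boolean value");
}

int main() {
    printf("%d %d\n", (int) parse_bool_value("on"), (int) parse_bool_value("false")); // prints: 1 0
    try {
        parse_bool_value("maybe");
    } catch (const std::exception & e) {
        printf("rejected: %s\n", e.what()); // rejected: invalid boolean value
    }
    return 0;
}
```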
@ -839,10 +905,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--display-prompt"},
{"--no-display-prompt"}, {"--no-display-prompt"},
string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
[](common_params & params) { [](common_params & params, bool value) {
params.display_prompt = false; params.display_prompt = value;
} }
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg( add_opt(common_arg(
@ -1055,18 +1122,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.kv_unified = true; params.kv_unified = true;
} }
).set_env("LLAMA_ARG_KV_UNIFIED")); ).set_env("LLAMA_ARG_KV_UNIFIED"));
add_opt(common_arg(
{"--no-context-shift"},
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
[](common_params & params) {
params.ctx_shift = false;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg( add_opt(common_arg(
{"--context-shift"}, {"--context-shift"},
string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"), {"--no-context-shift"},
[](common_params & params) { string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
params.ctx_shift = true; [](common_params & params, bool value) {
params.ctx_shift = value;
} }
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT")); ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
add_opt(common_arg( add_opt(common_arg(
@ -1106,20 +1167,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION})); ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg( add_opt(common_arg(
{"--perf"},
{"--no-perf"}, {"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
[](common_params & params) { [](common_params & params, bool value) {
params.no_perf = true; params.no_perf = !value;
params.sampling.no_perf = true; params.sampling.no_perf = !value;
} }
).set_env("LLAMA_ARG_NO_PERF")); ).set_env("LLAMA_ARG_PERF"));
add_opt(common_arg( add_opt(common_arg(
{"--show-timings"},
{"--no-show-timings"}, {"--no-show-timings"},
string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"), string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
[](common_params & params) { [](common_params & params, bool value) {
params.show_timings = false; params.show_timings = value;
} }
).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS")); ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
add_opt(common_arg( add_opt(common_arg(
{"-f", "--file"}, "FNAME", {"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)", "a file containing the prompt (default: none)",
@ -1171,16 +1234,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_excludes({LLAMA_EXAMPLE_SERVER})); ).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"-e", "--escape"}, {"-e", "--escape"},
string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
[](common_params & params) {
params.escape = true;
}
));
add_opt(common_arg(
{"--no-escape"}, {"--no-escape"},
"do not process escape sequences", string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
[](common_params & params) { [](common_params & params, bool value) {
params.escape = false; params.escape = value;
} }
)); ));
add_opt(common_arg( add_opt(common_arg(
@ -1227,19 +1284,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER})); ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg( add_opt(common_arg(
{"-cnv", "--conversation"}, {"-cnv", "--conversation"},
"run in conversation mode:\n" {"-no-cnv", "--no-conversation"},
"whether to run in conversation mode:\n"
"- does not print special tokens and suffix/prefix\n" "- does not print special tokens and suffix/prefix\n"
"- interactive mode is also enabled\n" "- interactive mode is also enabled\n"
"(default: auto enabled if chat template is available)", "(default: auto enabled if chat template is available)",
[](common_params & params) { [](common_params & params, bool value) {
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-no-cnv", "--no-conversation"},
"force disable conversation mode (default: false)",
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
} }
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg( add_opt(common_arg(
@ -1297,10 +1348,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_COMPLETION})); ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg( add_opt(common_arg(
{"--warmup"},
{"--no-warmup"}, {"--no-warmup"},
"skip warming up the model with an empty run", string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.warmup = false; params.warmup = value;
} }
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY})); ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg( add_opt(common_arg(
@ -1702,19 +1754,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION})); ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg( add_opt(common_arg(
{"-kvo", "--kv-offload"},
{"-nkvo", "--no-kv-offload"}, {"-nkvo", "--no-kv-offload"},
"disable KV offload", string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.no_kv_offload = true; params.no_kv_offload = !value;
} }
).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); ).set_env("LLAMA_ARG_KV_OFFLOAD"));
add_opt(common_arg( add_opt(common_arg(
{"--repack"},
{"-nr", "--no-repack"}, {"-nr", "--no-repack"},
"disable weight repacking", string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.no_extra_bufts = true; params.no_extra_bufts = !value;
} }
).set_env("LLAMA_ARG_NO_REPACK")); ).set_env("LLAMA_ARG_REPACK"));
add_opt(common_arg( add_opt(common_arg(
{"--no-host"}, {"--no-host"},
"bypass host buffer allowing extra buffers to be used", "bypass host buffer allowing extra buffers to be used",
@ -1843,18 +1897,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_PARALLEL})); ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg( add_opt(common_arg(
{"-cb", "--cont-batching"}, {"-cb", "--cont-batching"},
string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), {"-nocb", "--no-cont-batching"},
[](common_params & params) { string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
params.cont_batching = true; [](common_params & params, bool value) {
params.cont_batching = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
add_opt(common_arg(
{"-nocb", "--no-cont-batching"},
"disable continuous batching",
[](common_params & params) {
params.cont_batching = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
add_opt(common_arg( add_opt(common_arg(
{"-mm", "--mmproj"}, "FILE", {"-mm", "--mmproj"}, "FILE",
"path to a multimodal projector file. see tools/mtmd/README.md\n" "path to a multimodal projector file. see tools/mtmd/README.md\n"
@ -1871,19 +1919,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL")); ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
add_opt(common_arg( add_opt(common_arg(
{"--no-mmproj"}, {"--mmproj-auto"},
"explicitly disable multimodal projector, useful when using -hf", {"--no-mmproj", "--no-mmproj-auto"},
[](common_params & params) { string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
params.no_mmproj = true; [](common_params & params, bool value) {
params.no_mmproj = !value;
} }
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ")); ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
add_opt(common_arg( add_opt(common_arg(
{"--mmproj-offload"},
{"--no-mmproj-offload"}, {"--no-mmproj-offload"},
"do not offload multimodal projector to GPU", string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.mmproj_use_gpu = false; params.mmproj_use_gpu = value;
} }
).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD")); ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
add_opt(common_arg( add_opt(common_arg(
{"--image", "--audio"}, "FILE", {"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n", "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
@ -1923,12 +1973,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_env("LLAMA_ARG_MLOCK")); ).set_env("LLAMA_ARG_MLOCK"));
add_opt(common_arg( add_opt(common_arg(
{"--mmap"},
{"--no-mmap"}, {"--no-mmap"},
"do not memory-map model (slower load but may reduce pageouts if not using mlock)", string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.use_mmap = false; params.use_mmap = value;
} }
).set_env("LLAMA_ARG_NO_MMAP")); ).set_env("LLAMA_ARG_MMAP"));
add_opt(common_arg( add_opt(common_arg(
{"--numa"}, "TYPE", {"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n" "attempt optimizations that help on some NUMA systems\n"
@ -2116,10 +2167,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
)); ));
add_opt(common_arg( add_opt(common_arg(
{"--op-offload"},
{"--no-op-offload"}, {"--no-op-offload"},
string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"), string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
[](common_params & params) { [](common_params & params, bool value) {
params.no_op_offload = true; params.no_op_offload = !value;
} }
)); ));
add_opt(common_arg( add_opt(common_arg(
@ -2315,10 +2367,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX})); ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg( add_opt(common_arg(
{"--ppl"},
{"--no-ppl"}, {"--no-ppl"},
string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
[](common_params & params) { [](common_params & params, bool value) {
params.compute_ppl = false; params.compute_ppl = value;
} }
).set_examples({LLAMA_EXAMPLE_IMATRIX})); ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg( add_opt(common_arg(
@ -2437,12 +2490,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg( add_opt(common_arg(
{"--webui"},
{"--no-webui"}, {"--no-webui"},
string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.webui = false; params.webui = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
add_opt(common_arg( add_opt(common_arg(
{"--embedding", "--embeddings"}, {"--embedding", "--embeddings"},
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@ -2547,18 +2601,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
add_opt(common_arg( add_opt(common_arg(
{"--slots"}, {"--slots"},
string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), {"--no-slots"},
[](common_params & params) { string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
params.endpoint_slots = true; [](common_params & params, bool value) {
params.endpoint_slots = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
add_opt(common_arg(
{"--no-slots"},
"disables slots monitoring endpoint",
[](common_params & params) {
params.endpoint_slots = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
add_opt(common_arg( add_opt(common_arg(
{"--slot-save-path"}, "PATH", {"--slot-save-path"}, "PATH",
"path to save slot kv cache (default: disabled)", "path to save slot kv cache (default: disabled)",
@ -2609,26 +2657,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg( add_opt(common_arg(
{"--models-autoload"},
{"--no-models-autoload"}, {"--no-models-autoload"},
"disables automatic loading of models (default: enabled)", string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
[](common_params & params) { [](common_params & params, bool value) {
params.models_autoload = false; params.models_autoload = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
add_opt(common_arg( add_opt(common_arg(
{"--jinja"}, {"--jinja"},
string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"), {"--no-jinja"},
[](common_params & params) { string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
params.use_jinja = true; [](common_params & params, bool value) {
params.use_jinja = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--no-jinja"},
string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
[](common_params & params) {
params.use_jinja = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
add_opt(common_arg( add_opt(common_arg(
{"--reasoning-format"}, "FORMAT", {"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@ -2673,15 +2716,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg( add_opt(common_arg(
{"--prefill-assistant"},
{"--no-prefill-assistant"}, {"--no-prefill-assistant"},
string_format( string_format(
"whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n" "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n" "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
), ),
[](common_params & params) { [](common_params & params, bool value) {
params.prefill_assistant = false; params.prefill_assistant = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
add_opt(common_arg( add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY", {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),


@ -16,6 +16,7 @@ struct common_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON}; std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::set<enum llama_example> excludes = {}; std::set<enum llama_example> excludes = {};
std::vector<const char *> args; std::vector<const char *> args;
std::vector<const char *> args_neg; // for negated args like --no-xxx
const char * value_hint = nullptr; // help text or example for arg value const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr; const char * env = nullptr;
@ -25,6 +26,7 @@ struct common_arg {
void (*handler_string) (common_params & params, const std::string &) = nullptr; void (*handler_string) (common_params & params, const std::string &) = nullptr;
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (common_params & params, int) = nullptr; void (*handler_int) (common_params & params, int) = nullptr;
void (*handler_bool) (common_params & params, bool) = nullptr;
common_arg() = default; common_arg() = default;
@ -48,6 +50,13 @@ struct common_arg {
void (*handler)(common_params & params) void (*handler)(common_params & params)
) : args(args), help(help), handler_void(handler) {} ) : args(args), help(help), handler_void(handler) {}
common_arg(
const std::initializer_list<const char *> & args,
const std::initializer_list<const char *> & args_neg,
const std::string & help,
void (*handler)(common_params & params, bool)
) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
// support 2 values for arg // support 2 values for arg
common_arg( common_arg(
const std::initializer_list<const char *> & args, const std::initializer_list<const char *> & args,
@ -80,6 +89,10 @@ struct common_arg {
} }
return strcmp(args[0], other.args[0]) == 0; return strcmp(args[0], other.args[0]) == 0;
} }
// get all args and env vars (including negated args/env)
std::vector<std::string> get_args() const;
std::vector<std::string> get_env() const;
}; };
namespace common_arg_utils { namespace common_arg_utils {


@ -23,8 +23,14 @@ std::vector<std::string> common_preset::to_args() const {
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) { if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
// flag option, no value // flag option, no value
if (common_arg_utils::is_falsey(value)) { if (common_arg_utils::is_falsey(value)) {
// skip the flag // use negative arg if available
args.pop_back(); if (!opt.args_neg.empty()) {
args.back() = opt.args_neg.back();
} else {
// otherwise, skip the flag
// TODO: maybe throw an error instead?
args.pop_back();
}
} }
} }
if (opt.value_hint != nullptr) { if (opt.value_hint != nullptr) {
@ -141,10 +147,10 @@ static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_
static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) { static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
std::map<std::string, common_arg> mapping; std::map<std::string, common_arg> mapping;
for (const auto & opt : ctx_params.options) { for (const auto & opt : ctx_params.options) {
if (opt.env != nullptr) { for (const auto & env : opt.get_env()) {
mapping[opt.env] = opt; mapping[env] = opt;
} }
for (const auto & arg : opt.args) { for (const auto & arg : opt.get_args()) {
mapping[rm_leading_dashes(arg)] = opt; mapping[rm_leading_dashes(arg)] = opt;
} }
} }


@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) {
static void write_table_entry(std::ofstream & file, const common_arg & opt) { static void write_table_entry(std::ofstream & file, const common_arg & opt) {
file << "| `"; file << "| `";
// args // args
for (const auto & arg : opt.args) { auto all_args = opt.get_args();
if (arg == opt.args.front()) { for (const auto & arg : all_args) {
if (arg == all_args.front()) {
file << arg; file << arg;
if (opt.args.size() > 1) file << ", "; if (all_args.size() > 1) file << ", ";
} else { } else {
file << arg << (arg != opt.args.back() ? ", " : ""); file << arg << (arg != all_args.back() ? ", " : "");
} }
} }
// value hint // value hint


@ -20,20 +20,20 @@ int main(void) {
std::unordered_set<std::string> seen_env_vars; std::unordered_set<std::string> seen_env_vars;
for (const auto & opt : ctx_arg.options) { for (const auto & opt : ctx_arg.options) {
// check for args duplications // check for args duplications
for (const auto & arg : opt.args) { for (const auto & arg : opt.get_args()) {
if (seen_args.find(arg) == seen_args.end()) { if (seen_args.find(arg) == seen_args.end()) {
seen_args.insert(arg); seen_args.insert(arg);
} else { } else {
fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg); fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg.c_str());
exit(1); exit(1);
} }
} }
// check for env var duplications // check for env var duplications
if (opt.env) { for (const auto & env : opt.get_env()) {
if (seen_env_vars.find(opt.env) == seen_env_vars.end()) { if (seen_env_vars.find(env) == seen_env_vars.end()) {
seen_env_vars.insert(opt.env); seen_env_vars.insert(env);
} else { } else {
fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env); fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", env.c_str());
exit(1); exit(1);
} }
} }
@ -115,6 +115,14 @@ int main(void) {
assert(params.model.path == "blah.gguf"); assert(params.model.path == "blah.gguf");
assert(params.cpuparams.n_threads == 1010); assert(params.cpuparams.n_threads == 1010);
printf("test-arg-parser: test negated environment variables\n\n");
setenv("LLAMA_ARG_MMAP", "0", true);
setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format
argv = {"binary_name"};
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.use_mmap == false);
assert(params.no_perf == true);
printf("test-arg-parser: test environment variables being overwritten\n\n"); printf("test-arg-parser: test environment variables being overwritten\n\n");


@ -54,9 +54,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) | | `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) | | `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) | | `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) | | `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--no-escape` | do not process escape sequences |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) | | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) | | `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) | | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
@ -66,15 +65,15 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) | | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) | | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) | | `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
| `-nr, --no-repack` | disable weight repacking<br/>(env: LLAMA_ARG_NO_REPACK) | | `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) | | `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) | | `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) | | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit | | `--list-devices` | print list of available devices and exit |
@ -87,7 +86,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `--check-tensors` | check model tensor data for invalid values (default: false) | | `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | | `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
| `--no-op-offload` | disable offloading host tensor operations to device (default: false) | | `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | | `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | | `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors | | `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
@ -157,19 +156,18 @@ For the ful list of features, please refer to [server's changelog](https://githu
| -------- | ----------- | | -------- | ----------- |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) | | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) | | `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `--context-shift` | enables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
| `-sp, --special` | special tokens output enabled (default: false) | | `-sp, --special` | special tokens output enabled (default: false) |
| `--no-warmup` | skip warming up the model with an empty run | | `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) | | `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) | | `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) | | `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) | | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) | | `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) | | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model | | `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
@ -180,7 +178,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) | | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) | | `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) | | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) | | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) | | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@ -193,20 +191,19 @@ For the ful list of features, please refer to [server's changelog](https://githu
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) | | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) | | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
| `--slots` | enable slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) | | `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_PRESET) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) | | `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)<br/>(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) | | `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)<br/>(env: LLAMA_ARG_MODELS_AUTOLOAD) |
| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
| `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) | | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
@ -236,6 +233,11 @@ For the ful list of features, please refer to [server's changelog](https://githu
Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
For boolean options like `--mmap` or `--kv-offload`, the environment variable is handled as shown in this example:
- `LLAMA_ARG_MMAP=true` means enabled, other accepted values are: `1`, `on`, `enabled`
- `LLAMA_ARG_MMAP=false` means disabled, other accepted values are: `0`, `off`, `disabled`
- If `LLAMA_ARG_NO_MMAP` is present (no matter the value), it means disabling mmap
Example usage of docker compose with environment variables: Example usage of docker compose with environment variables:
```yml ```yml