common: support negated args (#17919)
* args: support negated args
* update docs
* fix typo
* add more neg options
* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* rm duplicated arg
* fix LLAMA_ARG_NO_HOST
* add test

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
parent
e39a2ce66d
commit
380b4c984e
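The change replaces each pair of separate enable/disable flags with a single registration that carries both the positive and the negated spellings plus one bool handler. A minimal sketch of the pattern, taken from the --context-shift registration in the diff below (add_opt, common_arg, string_format and set_env are the existing helpers in common/arg.cpp; this is shown only as an illustration):

    add_opt(common_arg(
        {"--context-shift"},       // positive form(s)
        {"--no-context-shift"},    // negated form(s), stored in args_neg
        string_format("whether to use context shift on infinite text generation (default: %s)",
                      params.ctx_shift ? "enabled" : "disabled"),
        [](common_params & params, bool value) {
            // value is true for --context-shift, false for --no-context-shift
            params.ctx_shift = value;
        }
    ).set_env("LLAMA_ARG_CONTEXT_SHIFT"));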
common/arg.cpp (290 changes)

@@ -105,6 +105,16 @@ bool common_arg::is_exclude(enum llama_example ex) {
 bool common_arg::get_value_from_env(std::string & output) const {
     if (env == nullptr) return false;
+    if (!args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        char * neg_value = std::getenv(neg_env.c_str());
+        if (neg_value) {
+            output = "0"; // falsey
+            return true;
+        }
+    }
     char * value = std::getenv(env);
     if (value) {
         output = value;

@@ -114,6 +124,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
 }

 bool common_arg::has_value_from_env() const {
+    if (env != nullptr && !args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        if (std::getenv(neg_env.c_str())) {
+            return true;
+        }
+    }
     return env != nullptr && std::getenv(env);
 }
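For options that have a negated form, the two functions above keep the legacy LLAMA_ARG_NO_* variables working. A self-contained sketch of the resulting lookup order, assuming an env name with the LLAMA_ARG_ prefix (simplified; the real code uses string_replace_all inside common_arg::get_value_from_env):

    #include <cstdlib>
    #include <string>

    // Illustration of the precedence implemented above: the legacy negated
    // variable (e.g. LLAMA_ARG_NO_MMAP) wins and is reported as "0", otherwise
    // the positive variable (e.g. LLAMA_ARG_MMAP) is used verbatim.
    static bool lookup_env_value(const std::string & env, std::string & output) {
        std::string neg_env = env;
        neg_env.insert(std::string("LLAMA_ARG_").size(), "NO_"); // LLAMA_ARG_X -> LLAMA_ARG_NO_X
        if (std::getenv(neg_env.c_str())) {
            output = "0"; // legacy negated variable -> falsey
            return true;
        }
        if (const char * value = std::getenv(env.c_str())) {
            output = value; // positive variable is used as-is
            return true;
        }
        return false;
    }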
@@ -151,9 +169,10 @@ std::string common_arg::to_string() const {
     std::string leading_spaces(n_leading_spaces, ' ');

     std::ostringstream ss;
-    for (const auto arg : args) {
-        if (arg == args.front()) {
-            if (args.size() == 1) {
+    auto all_args = get_args(); // also contains args_neg
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
+            if (all_args.size() == 1) {
                 ss << arg;
             } else {
                 // first arg is usually abbreviation, we need padding to make it more beautiful

@@ -162,7 +181,7 @@ std::string common_arg::to_string() const {
                 ss << tmp << spaces;
             }
         } else {
-            ss << arg << (arg != args.back() ? ", " : "");
+            ss << arg << (arg != all_args.back() ? ", " : "");
         }
     }
     if (value_hint) ss << " " << value_hint;

@@ -181,6 +200,31 @@ std::string common_arg::to_string() const {
     return ss.str();
 }

+std::vector<std::string> common_arg::get_args() const {
+    std::vector<std::string> result;
+    for (const auto & arg : args) {
+        result.push_back(std::string(arg));
+    }
+    for (const auto & arg : args_neg) {
+        result.push_back(std::string(arg));
+    }
+    return result;
+}
+
+std::vector<std::string> common_arg::get_env() const {
+    std::vector<std::string> result;
+    if (env) {
+        result.push_back(std::string(env));
+    }
+    if (!args_neg.empty() && env) {
+        // for compatibility, we need to add LLAMA_ARG_NO_ variant
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        result.push_back(neg_env);
+    }
+    return result;
+}
+
 //
 // utils
 //
@@ -316,6 +360,16 @@ static std::string get_all_kv_cache_types() {
     return msg.str();
 }

+static bool parse_bool_value(const std::string & value) {
+    if (is_truthy(value)) {
+        return true;
+    } else if (is_falsey(value)) {
+        return false;
+    } else {
+        throw std::invalid_argument("invalid boolean value");
+    }
+}
+
 //
 // CLI argument parsing functions
 //

@@ -323,10 +377,13 @@ static std::string get_all_kv_cache_types() {
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
     common_params & params = ctx_arg.params;

-    std::unordered_map<std::string, common_arg *> arg_to_options;
+    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
     for (auto & opt : ctx_arg.options) {
         for (const auto & arg : opt.args) {
-            arg_to_options[arg] = &opt;
+            arg_to_options[arg] = {&opt, /* is_positive */ true};
+        }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = {&opt, /* is_positive */ false};
         }
     }

@@ -335,12 +392,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         std::string value;
         if (opt.get_value_from_env(value)) {
             try {
-                if (opt.handler_void && (value == "1" || value == "true")) {
+                if (opt.handler_void && is_truthy(value)) {
                     opt.handler_void(params);
                 }
                 if (opt.handler_int) {
                     opt.handler_int(params, std::stoi(value));
                 }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, parse_bool_value(value));
+                }
                 if (opt.handler_string) {
                     opt.handler_string(params, value);
                     continue;

@@ -369,7 +429,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
-        auto opt = *arg_to_options[arg];
+        auto & tmp = arg_to_options[arg];
+        auto opt = *tmp.first;
+        bool is_positive = tmp.second;
         if (opt.has_value_from_env()) {
             fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
         }

@@ -378,6 +440,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             opt.handler_void(params);
             continue;
         }
+        if (opt.handler_bool) {
+            opt.handler_bool(params, is_positive);
+            continue;
+        }

         // arg with single value
         check_arg(i);

@@ -402,7 +468,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             throw std::invalid_argument(string_format(
                 "error while handling argument \"%s\": %s\n\n"
                 "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+                arg.c_str(), e.what(), opt.to_string().c_str()));
         }
     }
@@ -750,11 +816,11 @@ static std::string list_builtin_chat_templates() {
 }

 bool common_arg_utils::is_truthy(const std::string & value) {
-    return value == "on" || value == "enabled" || value == "1";
+    return value == "on" || value == "enabled" || value == "true" || value == "1";
 }

 bool common_arg_utils::is_falsey(const std::string & value) {
-    return value == "off" || value == "disabled" || value == "0";
+    return value == "off" || value == "disabled" || value == "false" || value == "0";
 }

 bool common_arg_utils::is_autoy(const std::string & value) {

@@ -839,10 +905,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
+        {"--display-prompt"},
         {"--no-display-prompt"},
-        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
-        [](common_params & params) {
-            params.display_prompt = false;
+        string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.display_prompt = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(

@@ -1055,18 +1122,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.kv_unified = true;
         }
     ).set_env("LLAMA_ARG_KV_UNIFIED"));
-    add_opt(common_arg(
-        {"--no-context-shift"},
-        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.ctx_shift = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--context-shift"},
-        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.ctx_shift = true;
+        {"--no-context-shift"},
+        string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.ctx_shift = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
     add_opt(common_arg(

@@ -1106,20 +1167,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
+        {"--perf"},
         {"--no-perf"},
-        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
-        [](common_params & params) {
-            params.no_perf = true;
-            params.sampling.no_perf = true;
+        string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.no_perf = !value;
+            params.sampling.no_perf = !value;
         }
-    ).set_env("LLAMA_ARG_NO_PERF"));
+    ).set_env("LLAMA_ARG_PERF"));
     add_opt(common_arg(
+        {"--show-timings"},
         {"--no-show-timings"},
-        string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
-        [](common_params & params) {
-            params.show_timings = false;
+        string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.show_timings = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
     add_opt(common_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",

@@ -1171,16 +1234,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-e", "--escape"},
-        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
-        [](common_params & params) {
-            params.escape = true;
-        }
-    ));
-    add_opt(common_arg(
         {"--no-escape"},
-        "do not process escape sequences",
-        [](common_params & params) {
-            params.escape = false;
+        string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.escape = value;
         }
     ));
     add_opt(common_arg(

@@ -1227,19 +1284,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cnv", "--conversation"},
-        "run in conversation mode:\n"
+        {"-no-cnv", "--no-conversation"},
+        "whether to run in conversation mode:\n"
         "- does not print special tokens and suffix/prefix\n"
         "- interactive mode is also enabled\n"
         "(default: auto enabled if chat template is available)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
-    add_opt(common_arg(
-        {"-no-cnv", "--no-conversation"},
-        "force disable conversation mode (default: false)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        [](common_params & params, bool value) {
+            params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(

@@ -1297,10 +1348,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
+        {"--warmup"},
         {"--no-warmup"},
-        "skip warming up the model with an empty run",
-        [](common_params & params) {
-            params.warmup = false;
+        string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.warmup = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
@@ -1702,19 +1754,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
+        {"-kvo", "--kv-offload"},
         {"-nkvo", "--no-kv-offload"},
-        "disable KV offload",
-        [](common_params & params) {
-            params.no_kv_offload = true;
+        string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_kv_offload = !value;
         }
-    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    ).set_env("LLAMA_ARG_KV_OFFLOAD"));
     add_opt(common_arg(
+        {"--repack"},
         {"-nr", "--no-repack"},
-        "disable weight repacking",
-        [](common_params & params) {
-            params.no_extra_bufts = true;
+        string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_extra_bufts = !value;
         }
-    ).set_env("LLAMA_ARG_NO_REPACK"));
+    ).set_env("LLAMA_ARG_REPACK"));
     add_opt(common_arg(
         {"--no-host"},
         "bypass host buffer allowing extra buffers to be used",

@@ -1843,18 +1897,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
     add_opt(common_arg(
         {"-cb", "--cont-batching"},
-        string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.cont_batching = true;
+        {"-nocb", "--no-cont-batching"},
+        string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.cont_batching = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
-    add_opt(common_arg(
-        {"-nocb", "--no-cont-batching"},
-        "disable continuous batching",
-        [](common_params & params) {
-            params.cont_batching = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
     add_opt(common_arg(
         {"-mm", "--mmproj"}, "FILE",
         "path to a multimodal projector file. see tools/mtmd/README.md\n"

@@ -1871,19 +1919,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
-        {"--no-mmproj"},
-        "explicitly disable multimodal projector, useful when using -hf",
-        [](common_params & params) {
-            params.no_mmproj = true;
+        {"--mmproj-auto"},
+        {"--no-mmproj", "--no-mmproj-auto"},
+        string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_mmproj = !value;
         }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
     add_opt(common_arg(
+        {"--mmproj-offload"},
         {"--no-mmproj-offload"},
-        "do not offload multimodal projector to GPU",
-        [](common_params & params) {
-            params.mmproj_use_gpu = false;
+        string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.mmproj_use_gpu = value;
         }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",

@@ -1923,12 +1973,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_MLOCK"));
     add_opt(common_arg(
+        {"--mmap"},
         {"--no-mmap"},
-        "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
-        [](common_params & params) {
-            params.use_mmap = false;
+        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_mmap = value;
         }
-    ).set_env("LLAMA_ARG_NO_MMAP"));
+    ).set_env("LLAMA_ARG_MMAP"));
     add_opt(common_arg(
         {"--numa"}, "TYPE",
         "attempt optimizations that help on some NUMA systems\n"

@@ -2116,10 +2167,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
+        {"--op-offload"},
         {"--no-op-offload"},
-        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
-        [](common_params & params) {
-            params.no_op_offload = true;
+        string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+        [](common_params & params, bool value) {
+            params.no_op_offload = !value;
         }
     ));
     add_opt(common_arg(
@@ -2315,10 +2367,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
+        {"--ppl"},
         {"--no-ppl"},
-        string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
-        [](common_params & params) {
-            params.compute_ppl = false;
+        string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.compute_ppl = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(

@@ -2437,12 +2490,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
+        {"--webui"},
         {"--no-webui"},
-        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.webui = false;
+        string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.webui = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),

@@ -2547,18 +2601,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
     add_opt(common_arg(
         {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
+        {"--no-slots"},
+        string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.endpoint_slots = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
-    add_opt(common_arg(
-        {"--no-slots"},
-        "disables slots monitoring endpoint",
-        [](common_params & params) {
-            params.endpoint_slots = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--slot-save-path"}, "PATH",
         "path to save slot kv cache (default: disabled)",

@@ -2609,26 +2657,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
     add_opt(common_arg(
+        {"--models-autoload"},
         {"--no-models-autoload"},
-        "disables automatic loading of models (default: enabled)",
-        [](common_params & params) {
-            params.models_autoload = false;
+        string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.models_autoload = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
     add_opt(common_arg(
         {"--jinja"},
-        string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.use_jinja = true;
+        {"--no-jinja"},
+        string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_jinja = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
-    add_opt(common_arg(
-        {"--no-jinja"},
-        string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.use_jinja = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"

@@ -2673,15 +2716,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
     add_opt(common_arg(
+        {"--prefill-assistant"},
         {"--no-prefill-assistant"},
         string_format(
             "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
             "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
         ),
-        [](common_params & params) {
-            params.prefill_assistant = false;
+        [](common_params & params, bool value) {
+            params.prefill_assistant = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
common/arg.h (13 changes)

@@ -16,6 +16,7 @@ struct common_arg {
     std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
     std::set<enum llama_example> excludes = {};
     std::vector<const char *> args;
+    std::vector<const char *> args_neg; // for negated args like --no-xxx
     const char * value_hint   = nullptr; // help text or example for arg value
     const char * value_hint_2 = nullptr; // for second arg value
     const char * env          = nullptr;

@@ -25,6 +26,7 @@ struct common_arg {
     void (*handler_string) (common_params & params, const std::string &) = nullptr;
     void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
     void (*handler_int)    (common_params & params, int) = nullptr;
+    void (*handler_bool)   (common_params & params, bool) = nullptr;

     common_arg() = default;

@@ -48,6 +50,13 @@ struct common_arg {
         void (*handler)(common_params & params)
     ) : args(args), help(help), handler_void(handler) {}

+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const std::initializer_list<const char *> & args_neg,
+        const std::string & help,
+        void (*handler)(common_params & params, bool)
+    ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
+
     // support 2 values for arg
     common_arg(
         const std::initializer_list<const char *> & args,

@@ -80,6 +89,10 @@ struct common_arg {
         }
         return strcmp(args[0], other.args[0]) == 0;
     }
+
+    // get all args and env vars (including negated args/env)
+    std::vector<std::string> get_args() const;
+    std::vector<std::string> get_env() const;
 };

 namespace common_arg_utils {
@@ -23,8 +23,14 @@ std::vector<std::string> common_preset::to_args() const {
         if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
             // flag option, no value
             if (common_arg_utils::is_falsey(value)) {
-                // skip the flag
-                args.pop_back();
+                // use negative arg if available
+                if (!opt.args_neg.empty()) {
+                    args.back() = opt.args_neg.back();
+                } else {
+                    // otherwise, skip the flag
+                    // TODO: maybe throw an error instead?
+                    args.pop_back();
+                }
             }
         }
         if (opt.value_hint != nullptr) {
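For presets this means a falsey value can now be expressed through the negated flag instead of silently dropping the option. A hypothetical helper illustrating the branch above (the function name and signature are illustrative, not part of the commit):

    #include <string>
    #include <vector>

    // Given the argv being built for a flag option, apply a falsey preset value:
    // prefer the option's negated spelling (e.g. "--mmap" becomes "--no-mmap"),
    // and only drop the flag when no negated form exists.
    static void apply_falsey_flag(std::vector<std::string> & args,
                                  const std::vector<const char *> & args_neg) {
        if (!args_neg.empty()) {
            args.back() = args_neg.back();
        } else {
            args.pop_back();
        }
    }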
@@ -141,10 +147,10 @@ static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_
 static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
     std::map<std::string, common_arg> mapping;
     for (const auto & opt : ctx_params.options) {
-        if (opt.env != nullptr) {
-            mapping[opt.env] = opt;
+        for (const auto & env : opt.get_env()) {
+            mapping[env] = opt;
         }
-        for (const auto & arg : opt.args) {
+        for (const auto & arg : opt.get_args()) {
             mapping[rm_leading_dashes(arg)] = opt;
         }
     }
@@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) {
 static void write_table_entry(std::ofstream & file, const common_arg & opt) {
     file << "| `";
     // args
-    for (const auto & arg : opt.args) {
-        if (arg == opt.args.front()) {
+    auto all_args = opt.get_args();
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
             file << arg;
-            if (opt.args.size() > 1) file << ", ";
+            if (all_args.size() > 1) file << ", ";
         } else {
-            file << arg << (arg != opt.args.back() ? ", " : "");
+            file << arg << (arg != all_args.back() ? ", " : "");
         }
     }
     // value hint
@@ -20,20 +20,20 @@ int main(void) {
     std::unordered_set<std::string> seen_env_vars;
     for (const auto & opt : ctx_arg.options) {
         // check for args duplications
-        for (const auto & arg : opt.args) {
+        for (const auto & arg : opt.get_args()) {
             if (seen_args.find(arg) == seen_args.end()) {
                 seen_args.insert(arg);
             } else {
-                fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
+                fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg.c_str());
                 exit(1);
             }
         }
         // check for env var duplications
-        if (opt.env) {
-            if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
-                seen_env_vars.insert(opt.env);
+        for (const auto & env : opt.get_env()) {
+            if (seen_env_vars.find(env) == seen_env_vars.end()) {
+                seen_env_vars.insert(env);
             } else {
-                fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env);
+                fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", env.c_str());
                 exit(1);
             }
         }
@@ -115,6 +115,14 @@ int main(void) {
     assert(params.model.path == "blah.gguf");
     assert(params.cpuparams.n_threads == 1010);

+    printf("test-arg-parser: test negated environment variables\n\n");
+
+    setenv("LLAMA_ARG_MMAP", "0", true);
+    setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format
+    argv = {"binary_name"};
+    assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+    assert(params.use_mmap == false);
+    assert(params.no_perf == true);
+
     printf("test-arg-parser: test environment variables being overwritten\n\n");
@ -54,9 +54,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||||
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
|
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
|
||||||
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
|
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
|
||||||
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
|
||||||
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
|
| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_PERF) |
|
||||||
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
| `-e, --escape, --no-escape` | whether to process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
|
||||||
| `--no-escape` | do not process escape sequences |
|
|
||||||
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
|
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
|
||||||
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
|
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
|
||||||
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
|
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
|
||||||
|
|
@ -66,15 +65,15 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||||
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
|
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
|
||||||
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
|
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
|
||||||
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
|
||||||
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
|
| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
|
||||||
| `-nr, --no-repack` | disable weight repacking<br/>(env: LLAMA_ARG_NO_REPACK) |
|
| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
|
||||||
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
|
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_HOST) |
|
||||||
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
||||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||||
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
|
| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
||||||
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
|
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
|
||||||
| `--list-devices` | print list of available devices and exit |
|
| `--list-devices` | print list of available devices and exit |
|
||||||
|
|
@ -87,7 +86,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||||
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
|
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
|
||||||
| `--check-tensors` | check model tensor data for invalid values (default: false) |
|
| `--check-tensors` | check model tensor data for invalid values (default: false) |
|
||||||
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
|
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
|
||||||
| `--no-op-offload` | disable offloading host tensor operations to device (default: false) |
|
| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
|
||||||
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
|
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
|
||||||
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
|
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
|
||||||
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
|
||||||
|
|
@ -157,19 +156,18 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||||
| -------- | ----------- |
|
| -------- | ----------- |
|
||||||
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
|
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
|
||||||
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
|
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
|
||||||
| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
|
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
|
||||||
| `--context-shift` | enables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
|
|
||||||
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
|
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
|
||||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||||
| `--no-warmup` | skip warming up the model with an empty run |
|
| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
|
||||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||||
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
||||||
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||||
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||||
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
|
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
|
||||||
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
|
||||||
| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) |
|
| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
|
||||||
| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
|
| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
|
||||||
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
|
||||||
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
|
||||||
| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
|
| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
|
||||||
|
|
@ -180,7 +178,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
|
||||||
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
||||||
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
||||||
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
|
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
|
||||||
| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
@@ -193,20 +191,19 @@ For the full list of features, please refer to [server's changelog](https://githu
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
| `--slots` | enable slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_PRESET) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)<br/>(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) |
| `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)<br/>(env: LLAMA_ARG_MODELS_AUTOLOAD) |
| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) |
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
| `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
@@ -236,6 +233,11 @@ For the full list of features, please refer to [server's changelog](https://githu
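The paired enable/disable flags listed above can be mixed freely on the command line. A minimal sketch (the model path is a placeholder; the flags are taken from the table above):

```sh
# start the server with the Web UI and the slots endpoint disabled;
# continuous batching stays at its default (enabled)
llama-server -m ./model.gguf --no-webui --no-slots

# the positive forms can re-enable an option that a preset or env var turned off
llama-server -m ./model.gguf --webui --slots
```
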
Note: if both the command line argument and the environment variable are set for the same param, the argument will take precedence over the env var.

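For example, in the hypothetical invocation below (port numbers are illustrative), the server listens on 8080 because the `--port` flag overrides `LLAMA_ARG_PORT`:

```sh
# env var and flag disagree; the command-line flag wins
LLAMA_ARG_PORT=9090 llama-server -m ./model.gguf --port 8080
```
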
For boolean options like `--mmap` or `--kv-offload`, the environment variable is handled as shown in this example:

- `LLAMA_ARG_MMAP=true` means enabled; other accepted values are `1`, `on`, `enabled`
- `LLAMA_ARG_MMAP=false` means disabled; other accepted values are `0`, `off`, `disabled`
- If `LLAMA_ARG_NO_MMAP` is present (regardless of its value), mmap is disabled

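A small sketch of the same behaviour from a shell (the model path is a placeholder; the env var semantics follow the list above):

```sh
# equivalent ways to disable mmap via the environment
LLAMA_ARG_MMAP=off llama-server -m ./model.gguf
LLAMA_ARG_NO_MMAP=1 llama-server -m ./model.gguf   # legacy negated form; the value itself is ignored
```
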
Example usage of docker compose with environment variables:

```yml
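# illustrative sketch only: the image tag, volume layout, model filename and
# LLAMA_ARG_MODEL are assumptions, not taken from this page; adjust for your setup
services:
  llama-server:
    image: ghcr.io/ggml-org/llama.cpp:server
    ports:
      - "8080:8080"
    volumes:
      - ./models:/models
    environment:
      LLAMA_ARG_MODEL: /models/model.gguf
      # documented above: listen port and Web UI toggle
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_WEBUI: "true"
      # boolean env vars accept true/1/on/enabled and false/0/off/disabled
      LLAMA_ARG_MMAP: "true"
```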