arg: fix order to use short form before long form (#18196)
* arg: fix order to use short form before long form
* arg: update doc
* arg: update test-arg-parser
* arg: address review feedback from ngxson
  - simplified to check first.length() <= last.length() only
  - fixed: --sampler-seq, --rerank, --draft ordering
  - note: middle positions in 3+ arg sets are not verified
* arg: update doc
parent: f99ef53d2a
commit: 14931a826e
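The change amounts to a single ordering convention: within each option's list of spellings, the short form is listed before the long form, and the new test enforces this with one front/back length comparison. A minimal standalone sketch of that comparison (illustration only, not the actual llama.cpp test harness; `short_form_first` is a hypothetical helper):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Hypothetical helper mirroring the rule added in test-arg-parser:
// the first spelling of an option must not be longer than the last one,
// i.e. "-cram" is listed before "--cache-ram".
static bool short_form_first(const std::vector<std::string> & names) {
    if (names.size() < 2) {
        return true; // nothing to compare
    }
    return names.front().length() <= names.back().length();
}

int main() {
    assert( short_form_first({"-cram", "--cache-ram"}));  // new order: accepted
    assert(!short_form_first({"--cache-ram", "-cram"}));  // old order: rejected
    assert( short_form_first({"-kvu", "--kv-unified"}));  // new order: accepted
    return 0;
}
```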
@@ -1144,7 +1144,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--cache-ram", "-cram"}, "N",
+        {"-cram", "--cache-ram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)"
             "[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)", params.cache_ram_mib),
         [](common_params & params, int value) {
@@ -1152,7 +1152,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--kv-unified", "-kvu"},
+        {"-kvu", "--kv-unified"},
         "use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)",
         [](common_params & params) {
             params.kv_unified = true;
@@ -1420,7 +1420,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_sparam());
     add_opt(common_arg(
-        {"--sampling-seq", "--sampler-seq"}, "SEQUENCE",
+        {"--sampler-seq", "--sampling-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
             params.sampling.samplers = common_sampler_types_from_chars(value);
@@ -2078,26 +2078,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        {"-ot", "--override-tensor"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.tensor_buft_overrides);
         }
     ));
     add_opt(common_arg(
-        {"--override-tensor-draft", "-otd"}, "<tensor name pattern>=<buffer type>,...",
+        {"-otd", "--override-tensor-draft"}, "<tensor name pattern>=<buffer type>,...",
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--cpu-moe", "-cmoe"},
+        {"-cmoe", "--cpu-moe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
             params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
-        {"--n-cpu-moe", "-ncmoe"}, "N",
+        {"-ncmoe", "--n-cpu-moe"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2112,14 +2112,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
-        {"--cpu-moe-draft", "-cmoed"},
+        {"-cmoed", "--cpu-moe-draft"},
         "keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
         [](common_params & params) {
             params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
-        {"--n-cpu-moe-draft", "-ncmoed"}, "N",
+        {"-ncmoed", "--n-cpu-moe-draft"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
         [](common_params & params, int value) {
             if (value < 0) {
@@ -2647,7 +2647,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
     add_opt(common_arg(
-        {"--reranking", "--rerank"},
+        {"--rerank", "--reranking"},
         string_format("enable reranking endpoint on server (default: %s)", "disabled"),
         [](common_params & params) {
             params.embedding = true;
@@ -3118,7 +3118,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
-        {"--draft-max", "--draft", "--draft-n"}, "N",
+        {"--draft", "--draft-n", "--draft-max"}, "N",
         string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
         [](common_params & params, int value) {
             params.speculative.n_max = value;
test-arg-parser:
@@ -37,6 +37,30 @@ int main(void) {
                 exit(1);
             }
         }
+
+        // ensure shorter argument precedes longer argument
+        if (opt.args.size() > 1) {
+            const std::string first(opt.args.front());
+            const std::string last(opt.args.back());
+
+            if (first.length() > last.length()) {
+                fprintf(stderr, "test-arg-parser: shorter argument should come before longer one: %s, %s\n",
+                        first.c_str(), last.c_str());
+                assert(false);
+            }
+        }
+
+        // same check for negated arguments
+        if (opt.args_neg.size() > 1) {
+            const std::string first(opt.args_neg.front());
+            const std::string last(opt.args_neg.back());
+
+            if (first.length() > last.length()) {
+                fprintf(stderr, "test-arg-parser: shorter negated argument should come before longer one: %s, %s\n",
+                        first.c_str(), last.c_str());
+                assert(false);
+            }
+        }
     }
 } catch (std::exception & e) {
     printf("%s\n", e.what());
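As the commit message notes, the check above only compares `opt.args.front()` with `opt.args.back()`, so for options with three or more spellings the middle entries are not verified. A small standalone illustration of that limitation (`ends_ordered` is a hypothetical stand-in for the test's comparison, not llama.cpp code):

```cpp
#include <string>
#include <vector>

// Stand-in for the front/back length comparison used by the test.
static bool ends_ordered(const std::vector<std::string> & names) {
    return names.size() < 2 || names.front().length() <= names.back().length();
}

int main() {
    // Ordering used in this commit: passes, as expected.
    const bool ok_commit = ends_ordered({"--draft", "--draft-n", "--draft-max"});
    // Hypothetical mis-ordered middle entry: also passes, because only the
    // first and last spellings are compared -- the unverified case noted above.
    const bool ok_middle = ends_ordered({"--draft", "--draft-max", "--draft-n"});
    return (ok_commit && ok_middle) ? 0 : 1;
}
```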
server README:
@@ -75,9 +75,9 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
-| `--override-tensor, -ot <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
-| `--cpu-moe, -cmoe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
-| `--n-cpu-moe, -ncmoe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
+| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
+| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
+| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
@@ -120,7 +120,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | -------- | ----------- |
 | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) |
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
-| `--sampling-seq, --sampler-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
+| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--temp N` | temperature (default: 0.8) |
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
@@ -156,8 +156,8 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | Argument | Explanation |
 | -------- | ----------- |
 | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
-| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
-| `--kv-unified, -kvu` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
+| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
+| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
 | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
 | `-sp, --special` | special tokens output enabled (default: false) |
@@ -172,9 +172,9 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
 | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
-| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
-| `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
-| `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
+| `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
+| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
+| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
 | `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
 | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
@@ -184,7 +184,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
 | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
-| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
+| `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
@@ -212,7 +212,7 @@ For the ful list of features, please refer to [server's changelog](https://githu
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
 | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
 | `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
-| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
 | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
 | `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
 | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |