cli: new CLI experience (#17824)

* wip

* wip

* fix logging, add display info

* handle commands

* add args

* wip

* move old cli to llama-completion

* rm deprecation notice

* move server to a shared library

* move ci to llama-completion

* add loading animation

* add --show-timings arg

* add /read command, improve LOG_ERR

* add args for speculative decoding, enable show timings by default

* add arg --image and --audio

* fix windows build

* support reasoning_content

* fix llama2c workflow

* color default is auto

* fix merge conflicts

* properly fix color problem

Co-authored-by: bandoti <bandoti@users.noreply.github.com>

* better loading spinner

* make sure to clean color on force-exit

* also clear input files on "/clear"

* simplify common_log_flush

* add warning in mtmd-cli

* implement console writer

* fix data race

* add attribute

* fix llama-completion and mtmd-cli

* add some notes about console::log

* fix compilation

---------

Co-authored-by: bandoti <bandoti@users.noreply.github.com>
Author: Xuan-Son Nguyen, 2025-12-10 15:28:59 +01:00 (committed by GitHub)
parent b677721819
commit 6c2131773c
26 changed files with 742 additions and 148 deletions


@ -243,7 +243,7 @@ jobs:
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Test llama2c (s390x)
id: llama2c_test_s390x
@ -252,7 +252,7 @@ jobs:
cd build
echo "Fetch llama2c big-endian model"
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest


@ -347,19 +347,6 @@ To learn more about model quantization, [read this documentation](tools/quantize
</details>
- <details>
<summary>Run simple text completion</summary>
To disable conversation mode explicitly, use `-no-cnv`
```bash
llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
</details>
- <details>
<summary>Constrain the output with a custom grammar</summary>


@ -398,18 +398,18 @@ function gg_run_qwen3_0_6b {
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
if [ -z ${GG_BUILD_NO_BF16} ]; then


@ -51,6 +51,7 @@ using json = nlohmann::ordered_json;
static std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CLI,
};
static std::string read_file(const std::string & fname) {
@ -468,6 +469,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
));
}
common_log_set_verbosity_thold(params.verbosity);
return true;
}
@ -790,7 +793,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.display_prompt = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-co", "--color"}, "[on|off|auto]",
"Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
@ -807,7 +810,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("error: unknown value for --color: '%s'\n", value.c_str()));
}
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
add_opt(common_arg(
{"-t", "--threads"}, "N",
string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
@ -940,7 +943,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-n", "--predict", "--n-predict"}, "N",
string_format(
ex == LLAMA_EXAMPLE_MAIN
ex == LLAMA_EXAMPLE_COMPLETION
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
: "number of tokens to predict (default: %d, -1 = infinity)",
params.n_predict),
@ -984,7 +987,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_ctx_checkpoints = value;
}
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--cache-ram", "-cram"}, "N",
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
@ -992,7 +995,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.cache_ram_mib = value;
}
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--kv-unified", "-kvu"},
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@ -1007,14 +1010,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.ctx_shift = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--context-shift"},
string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
[](common_params & params) {
params.ctx_shift = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--chunks"}, "N",
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@ -1050,7 +1053,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@ -1059,6 +1062,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.no_perf = true;
}
).set_env("LLAMA_ARG_NO_PERF"));
add_opt(common_arg(
{"--no-show-timings"},
string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
[](common_params & params) {
params.show_timings = false;
}
).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
add_opt(common_arg(
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",
@ -1080,7 +1090,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.system_prompt.pop_back();
}
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--in-file"}, "FNAME",
"an input file (repeat to specify multiple files)",
@ -1128,42 +1138,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_print = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache"}, "FNAME",
"file to cache prompt state for faster startup (default: none)",
[](common_params & params, const std::string & value) {
params.path_prompt_cache = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache-all"},
"if specified, saves user input and generations to cache as well\n",
[](common_params & params) {
params.prompt_cache_all = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache-ro"},
"if specified, uses the prompt cache but does not update it",
[](common_params & params) {
params.prompt_cache_ro = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-r", "--reverse-prompt"}, "PROMPT",
"halt generation at PROMPT, return control in interactive mode\n",
[](common_params & params, const std::string & value) {
params.antiprompt.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-sp", "--special"},
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
[](common_params & params) {
params.special = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-cnv", "--conversation"},
"run in conversation mode:\n"
@ -1173,14 +1183,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-no-cnv", "--no-conversation"},
"force disable conversation mode (default: false)",
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-st", "--single-turn"},
"run conversation for a single turn only, then exit when done\n"
@ -1189,28 +1199,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.single_turn = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-i", "--interactive"},
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
[](common_params & params) {
params.interactive = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-if", "--interactive-first"},
string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
[](common_params & params) {
params.interactive_first = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-mli", "--multiline-input"},
"allows you to write or paste multiple lines without ending each in '\\'",
[](common_params & params) {
params.multiline_input = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--in-prefix-bos"},
"prefix BOS to user inputs, preceding the `--in-prefix` string",
@ -1218,7 +1228,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_prefix_bos = true;
params.enable_chat_template = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--in-prefix"}, "STRING",
"string to prefix user inputs with (default: empty)",
@ -1226,7 +1236,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_prefix = value;
params.enable_chat_template = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--in-suffix"}, "STRING",
"string to suffix after user inputs with (default: empty)",
@ -1234,14 +1244,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_suffix = value;
params.enable_chat_template = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--no-warmup"},
"skip warming up the model with an empty run",
[](common_params & params) {
params.warmup = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@ -1632,14 +1642,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.grp_attn_n = value;
}
).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
add_opt(common_arg(
{"-gaw", "--grp-attn-w"}, "N",
string_format("group-attention width (default: %d)", params.grp_attn_w),
[](common_params & params, int value) {
params.grp_attn_w = value;
}
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-nkvo", "--no-kv-offload"},
"disable KV offload",
@ -1829,7 +1839,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MTMD}));
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--image-min-tokens"}, "N",
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@ -1922,7 +1932,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--cpu-moe", "-cmoe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
@ -1951,7 +1961,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"--n-cpu-moe-draft", "-ncmoed"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
@ -1965,7 +1975,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
@ -2444,7 +2454,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@ -2553,14 +2563,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--no-jinja"},
string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
[](common_params & params) {
params.use_jinja = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@ -2571,7 +2581,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.reasoning_format = common_reasoning_format_from_name(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
{"--reasoning-budget"}, "N",
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@ -2579,7 +2589,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
@ -2591,7 +2601,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
@ -2603,7 +2613,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = read_file(value);
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
{"--no-prefill-assistant"},
string_format(
@ -2634,7 +2644,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.simple_io = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--positive-file"}, "FNAME",
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@ -2717,7 +2727,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
[](common_params & params) {
params.verbosity = INT_MAX;
common_log_set_verbosity_thold(INT_MAX);
}
));
add_opt(common_arg(
@ -2738,7 +2747,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: %d)\n", params.verbosity),
[](common_params & params, int value) {
params.verbosity = value;
common_log_set_verbosity_thold(value);
}
).set_env("LLAMA_LOG_VERBOSITY"));
add_opt(common_arg(
@ -2871,14 +2879,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.n_max = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
add_opt(common_arg(
{"--draft-min", "--draft-n-min"}, "N",
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
[](common_params & params, int value) {
params.speculative.n_min = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
add_opt(common_arg(
{"--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@ -2892,14 +2900,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.speculative.p_min = std::stof(value);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
add_opt(common_arg(
{"-cd", "--ctx-size-draft"}, "N",
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
[](common_params & params, int value) {
params.speculative.n_ctx = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
add_opt(common_arg(
{"-devd", "--device-draft"}, "<dev1,dev2,..>",
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@ -2907,7 +2915,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.speculative.devices = parse_device_list(value);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
"number of layers to store in VRAM for the draft model",
@ -2919,21 +2927,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
add_opt(common_arg(
{"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
add_opt(common_arg(
{"--spec-replace"}, "TARGET", "DRAFT",
"translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
[](common_params & params, const std::string & tgt, const std::string & dft) {
params.speculative.replacements.push_back({ tgt, dft });
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(
@ -3197,7 +3205,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_jinja = true;
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--gpt-oss-120b-default"},
@ -3216,7 +3224,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_jinja = true;
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--vision-gemma-4b-default"},
@ -3227,7 +3235,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_ctx = 0;
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--vision-gemma-12b-default"},
@ -3238,7 +3246,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_ctx = 0;
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
return ctx_arg;
}


@ -82,7 +82,8 @@ int32_t cpu_get_num_math();
enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_COMPLETION,
LLAMA_EXAMPLE_CLI,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
@ -406,6 +407,7 @@ struct common_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool no_perf = false; // disable performance metrics
bool show_timings = true; // show timing information on CLI
bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache


@ -1,4 +1,5 @@
#include "console.h"
#include "log.h"
#include <vector>
#include <iostream>
#include <cassert>
@ -6,6 +7,10 @@
#include <cctype>
#include <cwctype>
#include <cstdint>
#include <condition_variable>
#include <mutex>
#include <thread>
#include <stdarg.h>
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
@ -35,6 +40,7 @@
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_GRAY "\x1b[90m"
#define ANSI_COLOR_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
@ -63,7 +69,7 @@ namespace console {
static bool advanced_display = false;
static bool simple_io = true;
static display_t current_display = reset;
static display_type current_display = DISPLAY_TYPE_RESET;
static FILE* out = stdout;
@ -142,7 +148,7 @@ namespace console {
void cleanup() {
// Reset console display
set_display(reset);
set_display(DISPLAY_TYPE_RESET);
#if !defined(_WIN32)
// Restore settings on POSIX systems
@ -162,20 +168,26 @@ namespace console {
//
// Keep track of current display and only emit ANSI code if it changes
void set_display(display_t display) {
void set_display(display_type display) {
if (advanced_display && current_display != display) {
fflush(stdout);
common_log_flush(common_log_main());
switch(display) {
case reset:
case DISPLAY_TYPE_RESET:
fprintf(out, ANSI_COLOR_RESET);
break;
case prompt:
case DISPLAY_TYPE_INFO:
fprintf(out, ANSI_COLOR_MAGENTA);
break;
case DISPLAY_TYPE_PROMPT:
fprintf(out, ANSI_COLOR_YELLOW);
break;
case user_input:
case DISPLAY_TYPE_REASONING:
fprintf(out, ANSI_COLOR_GRAY);
break;
case DISPLAY_TYPE_USER_INPUT:
fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
break;
case error:
case DISPLAY_TYPE_ERROR:
fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
}
current_display = display;
@ -778,7 +790,6 @@ namespace console {
}
if (is_special_char) {
set_display(user_input);
replace_last(line.back());
is_special_char = false;
}
@ -961,7 +972,6 @@ namespace console {
}
if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
set_display(prompt);
replace_last(line.back());
is_special_char = true;
}
@ -1046,12 +1056,82 @@ namespace console {
}
bool readline(std::string & line, bool multiline_input) {
set_display(user_input);
if (simple_io) {
return readline_simple(line, multiline_input);
}
return readline_advanced(line, multiline_input);
}
namespace spinner {
static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
static std::condition_variable cv_stop;
static std::thread th;
static size_t frame = 0; // only modified by one thread
static bool running = false;
static std::mutex mtx;
static auto wait_time = std::chrono::milliseconds(100);
static void draw_next_frame() {
// no extra lock needed: frame is only modified by the spinner thread
frame = (frame + 1) % sizeof(LOADING_CHARS);
replace_last(LOADING_CHARS[frame]);
fflush(out);
}
void start() {
std::unique_lock<std::mutex> lock(mtx);
if (simple_io || running) {
return;
}
common_log_flush(common_log_main());
fprintf(out, "%c", LOADING_CHARS[0]);
fflush(out);
frame = 1;
running = true;
th = std::thread([]() {
std::unique_lock<std::mutex> lock(mtx);
while (true) {
if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
break;
}
draw_next_frame();
}
});
}
void stop() {
{
std::unique_lock<std::mutex> lock(mtx);
if (simple_io || !running) {
return;
}
running = false;
cv_stop.notify_all();
}
if (th.joinable()) {
th.join();
}
replace_last(' ');
pop_cursor();
fflush(out);
}
}
void log(const char * fmt, ...) {
va_list args;
va_start(args, fmt);
vfprintf(out, fmt, args);
va_end(args);
}
void error(const char * fmt, ...) {
va_list args;
va_start(args, fmt);
display_type cur = current_display;
set_display(DISPLAY_TYPE_ERROR);
vfprintf(out, fmt, args);
set_display(cur); // restore previous color
va_end(args);
}
void flush() {
fflush(out);
}
}


@ -2,18 +2,40 @@
#pragma once
#include "common.h"
#include <string>
namespace console {
enum display_t {
reset = 0,
prompt,
user_input,
error
};
enum display_type {
DISPLAY_TYPE_RESET = 0,
DISPLAY_TYPE_INFO,
DISPLAY_TYPE_PROMPT,
DISPLAY_TYPE_REASONING,
DISPLAY_TYPE_USER_INPUT,
DISPLAY_TYPE_ERROR
};
namespace console {
void init(bool use_simple_io, bool use_advanced_display);
void cleanup();
void set_display(display_t display);
void set_display(display_type display);
bool readline(std::string & line, bool multiline_input);
namespace spinner {
void start();
void stop();
}
// note: the logging API below outputs directly to stdout
// it can negatively impact performance if used on the inference thread
// only use it in a dedicated CLI thread
// for logging on the inference thread, use log.h instead
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
void log(const char * fmt, ...);
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
void error(const char * fmt, ...);
void flush();
}
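
The console.h interface above is the whole surface the new CLI uses. Below is a minimal sketch (not part of the diff) of the intended call pattern from a dedicated CLI thread, assuming the declarations shown here; the inference wiring is elided:

```cpp
// sketch: driving the console from a dedicated CLI thread (see note above about console::log)
#include "console.h"

int main() {
    console::init(/*use_simple_io=*/false, /*use_advanced_display=*/true);

    console::set_display(DISPLAY_TYPE_INFO);
    console::log("model : %s\n", "example.gguf");  // direct stdout write, CLI thread only
    console::set_display(DISPLAY_TYPE_RESET);

    console::spinner::start();   // animate while waiting for the first token
    // ... post a task to the inference thread and wait for the first result ...
    console::spinner::stop();

    console::error("example error: %s\n", "file not found"); // printed in red, previous color restored
    console::flush();

    console::cleanup();          // reset colors, restore terminal settings
    return 0;
}
```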


@ -420,6 +420,11 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}
void common_log_flush(struct common_log * log) {
log->pause();
log->resume();
}
static int common_get_verbosity(enum ggml_log_level level) {
switch (level) {
case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;


@ -84,6 +84,7 @@ void common_log_set_file (struct common_log * log, const char * file); // n
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
void common_log_flush (struct common_log * log); // flush all pending log messages
// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold
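
Because the new CLI writes to stdout directly, any queued asynchronous log output has to be drained first. A short illustrative sketch of how common_log_flush is meant to be used before a direct terminal write, mirroring the calls added in console.cpp (assumes log.h as shown here):

```cpp
// sketch: drain pending async log messages before writing to stdout,
// as console.cpp now does before changing colors or starting the spinner
#include "log.h"
#include <cstdio>

static void print_status(const char * msg) {
    common_log_flush(common_log_main()); // block until all queued LOG_* output is written
    std::fprintf(stdout, "%s\n", msg);   // direct write can no longer interleave with log output
    std::fflush(stdout);
}
```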


@ -76,7 +76,7 @@ static void export_md(std::string fname, llama_example ex) {
}
int main(int, char **) {
export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION);
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
return 0;


@ -46,7 +46,7 @@ adb $adbserial shell " \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $experimental $sched $opmask $profile $nhvx $ndev \
./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
-ngl 99 --device $device $cli_opts $@ \


@ -79,19 +79,19 @@ run_conversion_and_inference_lora() {
# Run inference
echo -e "\n\n---------------------------\n\n"
echo "Running llama-cli without lora for $model_name with hidden_size $hidden_size..."
OUTPUT_BASE=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
echo "Running llama-completion without lora for $model_name with hidden_size $hidden_size..."
OUTPUT_BASE=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
-p "$EXPECTED_BASE_FIRST_WORD" -n 50 --seed 42 --temp 0)
echo -e "\n\n---------------------------\n\n"
echo "Running llama-cli with hot lora for $model_name with hidden_size $hidden_size..."
OUTPUT_LORA_HOT=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
echo "Running llama-completion with hot lora for $model_name with hidden_size $hidden_size..."
OUTPUT_LORA_HOT=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
--lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf \
-p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)
echo -e "\n\n---------------------------\n\n"
echo "Running llama-cli with merged lora for $model_name with hidden_size $hidden_size..."
OUTPUT_LORA_MERGED=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
echo "Running llama-completion with merged lora for $model_name with hidden_size $hidden_size..."
OUTPUT_LORA_MERGED=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
-p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)
# Remove any initial white space


@ -18,7 +18,8 @@ else()
add_subdirectory(gguf-split)
add_subdirectory(imatrix)
add_subdirectory(llama-bench)
add_subdirectory(main)
add_subdirectory(cli)
add_subdirectory(completion)
add_subdirectory(perplexity)
add_subdirectory(quantize)
if (LLAMA_BUILD_SERVER)

tools/cli/CMakeLists.txt (new file)

@ -0,0 +1,10 @@
set(TARGET llama-cli)
add_executable(${TARGET} cli.cpp)
target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
include_directories(../server)
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()

tools/cli/cli.cpp (new file)

@ -0,0 +1,395 @@
#include "common.h"
#include "arg.h"
#include "console.h"
// #include "log.h"
#include "server-context.h"
#include "server-task.h"
#include <atomic>
#include <fstream>
#include <thread>
#include <signal.h>
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
const char * LLAMA_ASCII_LOGO = R"(
)";
static std::atomic<bool> g_is_interrupted = false;
static bool should_stop() {
return g_is_interrupted.load();
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void signal_handler(int) {
if (g_is_interrupted.load()) {
// second Ctrl+C - exit immediately
// make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock)
fprintf(stdout, "\033[0m\n");
fflush(stdout);
std::exit(130);
}
g_is_interrupted.store(true);
}
#endif
struct cli_context {
server_context ctx_server;
json messages = json::array();
std::vector<raw_buffer> input_files;
task_params defaults;
// thread for showing "loading" animation
std::atomic<bool> loading_show;
cli_context(const common_params & params) {
defaults.sampling = params.sampling;
defaults.speculative = params.speculative;
defaults.n_keep = params.n_keep;
defaults.n_predict = params.n_predict;
defaults.antiprompt = params.antiprompt;
defaults.stream = true; // make sure we always use streaming mode
defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
// defaults.return_progress = true; // TODO: show progress
defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
}
std::string generate_completion(result_timings & out_timings) {
server_response_reader rd = ctx_server.get_response_reader();
{
// TODO: reduce some copies here in the future
server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
task.id = rd.get_new_id();
task.index = 0;
task.params = defaults; // copy
task.cli_input = messages; // copy
task.cli_files = input_files; // copy
rd.post_task({std::move(task)});
}
// wait for first result
console::spinner::start();
server_task_result_ptr result = rd.next(should_stop);
console::spinner::stop();
std::string curr_content;
bool is_thinking = false;
while (result) {
if (should_stop()) {
break;
}
if (result->is_error()) {
json err_data = result->to_json();
if (err_data.contains("message")) {
console::error("Error: %s\n", err_data["message"].get<std::string>().c_str());
} else {
console::error("Error: %s\n", err_data.dump().c_str());
}
return curr_content;
}
auto res_partial = dynamic_cast<server_task_result_cmpl_partial *>(result.get());
if (res_partial) {
out_timings = std::move(res_partial->timings);
for (const auto & diff : res_partial->oaicompat_msg_diffs) {
if (!diff.content_delta.empty()) {
if (is_thinking) {
console::log("\n[End thinking]\n\n");
console::set_display(DISPLAY_TYPE_RESET);
is_thinking = false;
}
curr_content += diff.content_delta;
console::log("%s", diff.content_delta.c_str());
console::flush();
}
if (!diff.reasoning_content_delta.empty()) {
console::set_display(DISPLAY_TYPE_REASONING);
if (!is_thinking) {
console::log("[Start thinking]\n");
}
is_thinking = true;
console::log("%s", diff.reasoning_content_delta.c_str());
console::flush();
}
}
}
auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get());
if (res_final) {
out_timings = std::move(res_final->timings);
break;
}
result = rd.next(should_stop);
}
g_is_interrupted.store(false);
// server_response_reader automatically cancels pending tasks upon destruction
return curr_content;
}
// TODO: support remote files in the future (http, https, etc)
std::string load_input_file(const std::string & fname, bool is_media) {
std::ifstream file(fname, std::ios::binary);
if (!file) {
return "";
}
if (is_media) {
raw_buffer buf;
buf.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
input_files.push_back(std::move(buf));
return mtmd_default_marker();
} else {
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
return content;
}
}
};
int main(int argc, char ** argv) {
common_params params;
params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) {
return 1;
}
// TODO: maybe support it later?
if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) {
console::error("--no-conversation is not supported by llama-cli\n");
console::error("please use llama-completion instead\n");
}
common_init();
// struct that contains llama context and inference
cli_context ctx_cli(params);
llama_backend_init();
llama_numa_init(params.numa);
// TODO: avoid using atexit() here by making `console` a singleton
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
console::set_display(DISPLAY_TYPE_RESET);
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
console::log("\nLoading model... "); // followed by loading animation
console::spinner::start();
if (!ctx_cli.ctx_server.load_model(params)) {
console::spinner::stop();
console::error("\nFailed to load the model\n");
return 1;
}
ctx_cli.ctx_server.init();
console::spinner::stop();
console::log("\n");
std::thread inference_thread([&ctx_cli]() {
ctx_cli.ctx_server.start_loop();
});
auto inf = ctx_cli.ctx_server.get_info();
std::string modalities = "text";
if (inf.has_inp_image) {
modalities += ", vision";
}
if (inf.has_inp_audio) {
modalities += ", audio";
}
if (!params.system_prompt.empty()) {
ctx_cli.messages.push_back({
{"role", "system"},
{"content", params.system_prompt}
});
}
console::log("\n");
console::log("%s\n", LLAMA_ASCII_LOGO);
console::log("build : %s\n", inf.build_info.c_str());
console::log("model : %s\n", inf.model_name.c_str());
console::log("modalities : %s\n", modalities.c_str());
if (!params.system_prompt.empty()) {
console::log("using custom system prompt\n");
}
console::log("\n");
console::log("available commands:\n");
console::log(" /exit or Ctrl+C stop or exit\n");
console::log(" /regen regenerate the last response\n");
console::log(" /clear clear the chat history\n");
console::log(" /read add a text file\n");
if (inf.has_inp_image) {
console::log(" /image <file> add an image file\n");
}
if (inf.has_inp_audio) {
console::log(" /audio <file> add an audio file\n");
}
console::log("\n");
// interactive loop
std::string cur_msg;
while (true) {
std::string buffer;
console::set_display(DISPLAY_TYPE_USER_INPUT);
if (params.prompt.empty()) {
console::log("\n> ");
std::string line;
bool another_line = true;
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
} else {
// process input prompt from args
for (auto & fname : params.image) {
std::string marker = ctx_cli.load_input_file(fname, true);
if (marker.empty()) {
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
break;
}
console::log("Loaded media from '%s'\n", fname.c_str());
cur_msg += marker;
}
buffer = params.prompt;
if (buffer.size() > 500) {
console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str());
} else {
console::log("\n> %s\n", buffer.c_str());
}
params.prompt.clear(); // only use it once
}
console::set_display(DISPLAY_TYPE_RESET);
console::log("\n");
if (should_stop()) {
g_is_interrupted.store(false);
break;
}
// remove trailing newline
if (!buffer.empty() && buffer.back() == '\n') {
buffer.pop_back();
}
// skip empty messages
if (buffer.empty()) {
continue;
}
bool add_user_msg = true;
// process commands
if (string_starts_with(buffer, "/exit")) {
break;
} else if (string_starts_with(buffer, "/regen")) {
if (ctx_cli.messages.size() >= 2) {
size_t last_idx = ctx_cli.messages.size() - 1;
ctx_cli.messages.erase(last_idx);
add_user_msg = false;
} else {
console::error("No message to regenerate.\n");
continue;
}
} else if (string_starts_with(buffer, "/clear")) {
ctx_cli.messages.clear();
ctx_cli.input_files.clear();
console::log("Chat history cleared.\n");
continue;
} else if (
(string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
(string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) {
// just in case (bad copy-paste for example), we strip all trailing/leading spaces
std::string fname = string_strip(buffer.substr(7));
std::string marker = ctx_cli.load_input_file(fname, true);
if (marker.empty()) {
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
continue;
}
cur_msg += marker;
console::log("Loaded media from '%s'\n", fname.c_str());
continue;
} else if (string_starts_with(buffer, "/read ")) {
std::string fname = string_strip(buffer.substr(6));
std::string marker = ctx_cli.load_input_file(fname, false);
if (marker.empty()) {
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
continue;
}
cur_msg += marker;
console::log("Loaded text from '%s'\n", fname.c_str());
continue;
} else {
// not a command
cur_msg += buffer;
}
// generate response
if (add_user_msg) {
ctx_cli.messages.push_back({
{"role", "user"},
{"content", cur_msg}
});
cur_msg.clear();
}
result_timings timings;
std::string assistant_content = ctx_cli.generate_completion(timings);
ctx_cli.messages.push_back({
{"role", "assistant"},
{"content", assistant_content}
});
console::log("\n");
if (params.show_timings) {
console::set_display(DISPLAY_TYPE_INFO);
console::log("\n");
console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second);
console::set_display(DISPLAY_TYPE_RESET);
}
if (params.single_turn) {
break;
}
}
console::set_display(DISPLAY_TYPE_RESET);
console::log("\nExiting...\n");
ctx_cli.ctx_server.terminate();
inference_thread.join();
// bump the log level to display timings
common_log_set_verbosity_thold(LOG_LEVEL_INFO);
llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());
return 0;
}


@ -1,5 +1,5 @@
set(TARGET llama-cli)
add_executable(${TARGET} main.cpp)
set(TARGET llama-completion)
add_executable(${TARGET} completion.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)


@ -86,7 +86,7 @@ static void sigint_handler(int signo) {
int main(int argc, char ** argv) {
common_params params;
g_params = &params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
return 1;
}
@ -521,12 +521,6 @@ int main(int argc, char ** argv) {
is_interacting = params.interactive_first;
}
LOG_WRN("*****************************\n");
LOG_WRN("IMPORTANT: The current llama-cli will be moved to llama-completion in the near future\n");
LOG_WRN(" New llama-cli will have enhanced features and improved user experience\n");
LOG_WRN(" More info: https://github.com/ggml-org/llama.cpp/discussions/17618\n");
LOG_WRN("*****************************\n");
bool is_antiprompt = false;
bool input_echo = true;
bool display = true;
@ -543,7 +537,7 @@ int main(int argc, char ** argv) {
std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
// the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt);
console::set_display(DISPLAY_TYPE_PROMPT);
display = params.display_prompt;
std::vector<llama_token> embd;
@ -588,9 +582,9 @@ int main(int argc, char ** argv) {
const int skipped_tokens = (int) embd.size() - max_embd_size;
embd.resize(max_embd_size);
console::set_display(console::error);
console::set_display(DISPLAY_TYPE_ERROR);
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
console::set_display(DISPLAY_TYPE_RESET);
}
if (ga_n == 1) {
@ -772,7 +766,7 @@ int main(int argc, char ** argv) {
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
console::set_display(DISPLAY_TYPE_RESET);
display = true;
}
@ -868,7 +862,7 @@ int main(int argc, char ** argv) {
}
// color user input only
console::set_display(console::user_input);
console::set_display(DISPLAY_TYPE_USER_INPUT);
display = params.display_prompt;
std::string line;
@ -879,7 +873,7 @@ int main(int argc, char ** argv) {
} while (another_line);
// done taking input, reset color
console::set_display(console::reset);
console::set_display(DISPLAY_TYPE_RESET);
display = true;
if (buffer.empty()) { // Ctrl+D on empty line exits


@ -19,7 +19,7 @@ fi
set -x
SPLIT=$1/llama-gguf-split
MAIN=$1/llama-cli
MAIN=$1/llama-completion
WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../)


@ -310,6 +310,9 @@ int main(int argc, char ** argv) {
if (g_is_interrupted) return 130;
LOG_WRN("WARN: This is an experimental CLI for testing multimodal capability.\n");
LOG_WRN(" For normal use cases, please use the standard llama-cli\n");
if (is_single_turn) {
g_is_generating = true;
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
@ -349,11 +352,11 @@ int main(int argc, char ** argv) {
while (!g_is_interrupted) {
g_is_generating = false;
LOG("\n> ");
console::set_display(console::user_input);
console::set_display(DISPLAY_TYPE_USER_INPUT);
std::string line;
console::readline(line, false);
if (g_is_interrupted) break;
console::set_display(console::reset);
console::set_display(DISPLAY_TYPE_RESET);
line = string_strip(line);
if (line.empty()) {
continue;


@ -20,7 +20,7 @@ set -x
SPLIT=$1/llama-gguf-split
QUANTIZE=$1/llama-quantize
MAIN=$1/llama-cli
MAIN=$1/llama-completion
WORK_PATH=$TMP_DIR/quantize
ROOT_DIR=$(realpath $(dirname $0)/../../)


@ -1,7 +1,33 @@
set(TARGET llama-server)
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
# server-context containing the core server logic, used by llama-server and CLI
set(TARGET server-context)
add_library(${TARGET} STATIC
server-task.cpp
server-task.h
server-queue.cpp
server-queue.h
server-common.cpp
server-common.h
server-context.cpp
server-context.h
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PRIVATE ../mtmd)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PUBLIC common mtmd ${CMAKE_THREAD_LIBS_INIT})
# llama-server executable
set(TARGET llama-server)
if (NOT LLAMA_HTTPLIB)
message(FATAL_ERROR "LLAMA_HTTPLIB is OFF, cannot build llama-server. Hint: to skip building server, set -DLLAMA_BUILD_SERVER=OFF")
endif()
@ -12,14 +38,6 @@ set(TARGET_SRCS
server-http.h
server-models.cpp
server-models.h
server-task.cpp
server-task.h
server-queue.cpp
server-queue.h
server-common.cpp
server-common.h
server-context.cpp
server-context.h
)
set(PUBLIC_ASSETS
index.html.gz
@ -43,7 +61,7 @@ install(TARGETS ${TARGET} RUNTIME)
target_include_directories(${TARGET} PRIVATE ../mtmd)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common mtmd cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)


@ -1474,6 +1474,44 @@ struct server_context_impl {
// Functions to process the task
//
// tokenize the input if it's set by CLI, return false on error
bool tokenize_cli_input(server_task & task) {
if (task.cli_input == nullptr) {
return true; // nothing to do
}
try {
auto & opt = oai_parser_opt;
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input);
inputs.tools = {}; // TODO
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
inputs.json_schema = ""; // TODO
inputs.grammar = ""; // TODO
inputs.use_jinja = opt.use_jinja;
inputs.parallel_tool_calls = false;
inputs.add_generation_prompt = true;
inputs.reasoning_format = opt.reasoning_format;
inputs.enable_thinking = opt.enable_thinking;
// Apply chat template to the list of messages
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
// tokenize the resulting prompt
auto & prompt = chat_params.prompt;
if (mctx != nullptr) {
task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
} else {
task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
}
task.cli_input.clear();
task.cli_files.clear();
} catch (const std::exception & e) {
send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
return false;
}
return true;
}
void process_single_task(server_task && task) {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
@ -1481,6 +1519,10 @@ struct server_context_impl {
case SERVER_TASK_TYPE_EMBEDDING:
case SERVER_TASK_TYPE_RERANK:
{
if (!tokenize_cli_input(task)) {
break;
}
const int id_slot = task.id_slot;
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
@ -1690,7 +1732,6 @@ struct server_context_impl {
res->id = task.id;
queue_results.send(std::move(res));
} break;
}
}
@ -2626,6 +2667,15 @@ server_response_reader server_context::get_response_reader() {
return impl->get_response_reader();
}
server_context_info server_context::get_info() const {
return server_context_info {
/* build_info */ build_info,
/* model_name */ impl->model_name,
/* has_inp_image */ impl->oai_parser_opt.allow_image,
/* has_inp_audio */ impl->oai_parser_opt.allow_audio,
};
}
// generator-like API for HTTP response generation


@ -9,6 +9,13 @@
struct server_context_impl; // private implementation
struct server_context_info {
std::string build_info;
std::string model_name;
bool has_inp_image;
bool has_inp_audio;
};
struct server_context {
std::unique_ptr<server_context_impl> impl;
@ -33,6 +40,10 @@ struct server_context {
// get a new response reader, used by CLI application
server_response_reader get_response_reader();
// get server info
// used by CLI application
server_context_info get_info() const;
};
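
With the server core now in the shared server-context library, other front ends can embed it the same way the new CLI does. A condensed, hedged sketch of that pattern based on tools/cli/cli.cpp above; it assumes the server_context methods used there (load_model, init, start_loop, terminate) and omits the interactive loop, sampling parameters, and error handling:

```cpp
// sketch: embedding server-context outside llama-server, condensed from cli.cpp
#include "common.h"
#include "arg.h"
#include "server-context.h"
#include "server-task.h"
#include <thread>

int main(int argc, char ** argv) {
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) {
        return 1;
    }
    common_init();
    llama_backend_init();
    llama_numa_init(params.numa);

    server_context ctx_server;
    if (!ctx_server.load_model(params)) {
        return 1;
    }
    ctx_server.init();

    // the server loop runs on its own thread; tasks are posted from the front-end thread
    std::thread inference_thread([&ctx_server]() { ctx_server.start_loop(); });

    server_response_reader rd = ctx_server.get_response_reader();

    json messages = json::array();
    messages.push_back({ {"role", "user"}, {"content", "Hello"} });

    server_task task(SERVER_TASK_TYPE_COMPLETION);
    task.id        = rd.get_new_id();
    task.index     = 0;
    task.cli_input = messages; // tokenization and chat templating happen server-side
    rd.post_task(std::move(task));

    // ... stream results via rd.next(...) as in cli_context::generate_completion ...

    ctx_server.terminate();
    inference_thread.join();
    return 0;
}
```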


@ -135,7 +135,10 @@ struct server_response_reader {
stop();
}
void post_task(server_task && tasks);
int get_new_id() {
return queue_tasks.get_new_id();
}
void post_task(server_task && task);
void post_tasks(std::vector<server_task> && tasks);
bool has_next() const;


@ -120,6 +120,10 @@ struct server_task {
task_params params;
server_tokens tokens;
// only used by the CLI; these fields delegate tokenization to the server
json cli_input = nullptr;
std::vector<raw_buffer> cli_files;
server_task_type type;
// used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE
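
For media input, the raw file bytes travel in cli_files while a marker is inserted into the message text, exactly as cli_context::load_input_file does above. A small hedged sketch of that pairing (assumes server-task.h as shown and mtmd_default_marker() from mtmd.h, as used by cli.cpp):

```cpp
// sketch: attaching an image to a CLI task; bytes go to cli_files, a marker goes into the text
#include "server-task.h"
#include "mtmd.h"

#include <fstream>
#include <iterator>
#include <string>

static bool attach_media(server_task & task, std::string & user_text, const std::string & fname) {
    std::ifstream file(fname, std::ios::binary);
    if (!file) {
        return false; // caller reports the error
    }
    raw_buffer buf;
    buf.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
    task.cli_files.push_back(std::move(buf)); // decoded later by the server (process_mtmd_prompt)
    user_text += mtmd_default_marker();       // placed where the image belongs in the message
    return true;
}
```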