Merge branch 'master' into dev-hexagon-refactoring-part2

This commit is contained in:
chraac 2025-12-11 13:35:28 +08:00
commit 07ea177987
48 changed files with 16844 additions and 4985 deletions

View File

@ -243,7 +243,7 @@ jobs:
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
- name: Test llama2c (s390x)
id: llama2c_test_s390x
@ -252,7 +252,7 @@ jobs:
cd build
echo "Fetch llama2c big-endian model"
wget https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K-be.gguf
./bin/llama-cli -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
./bin/llama-completion -m stories260K-be.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
@ -1770,7 +1770,7 @@ jobs:
echo "Fetch llama2c model"
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin
./bin/llama-convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
./bin/llama-cli -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
./bin/llama-completion -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
ubuntu-cmake-sanitizer-riscv64-native:
runs-on: RISCV64

View File

@ -347,19 +347,6 @@ To learn more about model quantization, [read this documentation](tools/quantize
</details>
- <details>
<summary>Run simple text completion</summary>
To disable conversation mode explicitly, use `-no-cnv`
```bash
llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
```
</details>
- <details>
<summary>Constrain the output with a custom grammar</summary>

View File

@ -398,18 +398,18 @@ function gg_run_qwen3_0_6b {
./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-completion -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/llama-completion -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
(time ./bin/llama-completion -no-cnv --model ${model_q8_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/llama-completion -no-cnv --model ${model_q4_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/llama-completion -no-cnv --model ${model_q4_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/llama-completion -no-cnv --model ${model_q5_0} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/llama-completion -no-cnv --model ${model_q5_1} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/llama-completion -no-cnv --model ${model_q2_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q3_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q4_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q5_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/llama-completion -no-cnv --model ${model_q6_k} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
if [ -z ${GG_BUILD_NO_BF16} ]; then

View File

@ -73,6 +73,8 @@ add_library(${TARGET} STATIC
ngram-cache.h
peg-parser.cpp
peg-parser.h
preset.cpp
preset.h
regex-partial.cpp
regex-partial.h
sampling.cpp

View File

@ -47,10 +47,12 @@
#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
using json = nlohmann::ordered_json;
using namespace common_arg_utils;
static std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_MTMD,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CLI,
};
static std::string read_file(const std::string & fname) {
@ -63,6 +65,15 @@ static std::string read_file(const std::string & fname) {
return content;
}
static const std::vector<common_arg> & get_common_arg_defs() {
static const std::vector<common_arg> options = [] {
common_params params;
auto ctx = common_params_parser_init(params, LLAMA_EXAMPLE_SERVER, nullptr);
return ctx.options;
}();
return options;
}
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = examples;
return *this;
@ -133,7 +144,7 @@ static std::vector<std::string> break_str_into_lines(std::string input, size_t m
return result;
}
std::string common_arg::to_string() {
std::string common_arg::to_string() const {
// params for printing to console
const static int n_leading_spaces = 40;
const static int n_char_per_line_help = 70; // TODO: detect this based on current console
@ -468,6 +479,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
));
}
common_log_set_verbosity_thold(params.verbosity);
return true;
}
@ -644,6 +657,53 @@ static void add_rpc_devices(const std::string & servers) {
}
}
bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
common_params dummy_params;
common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);
std::unordered_map<std::string, common_arg *> arg_to_options;
for (auto & opt : ctx_arg.options) {
for (const auto & arg : opt.args) {
arg_to_options[arg] = &opt;
}
}
// TODO @ngxson : find a way to deduplicate this code
// handle command line arguments
auto check_arg = [&](int i) {
if (i+1 >= argc) {
throw std::invalid_argument("expected value for argument");
}
};
for (int i = 1; i < argc; i++) {
const std::string arg_prefix = "--";
std::string arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
auto opt = *arg_to_options[arg];
std::string val;
if (opt.value_hint != nullptr) {
// arg with single value
check_arg(i);
val = argv[++i];
}
if (opt.value_hint_2 != nullptr) {
// TODO: support arg with 2 values
throw std::invalid_argument("error: argument with 2 values is not yet supported\n");
}
out_map[opt] = val;
}
return true;
}
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
const common_params params_org = ctx_arg.params; // the example can modify the default params
@ -689,25 +749,19 @@ static std::string list_builtin_chat_templates() {
return msg.str();
}
static bool is_truthy(const std::string & value) {
bool common_arg_utils::is_truthy(const std::string & value) {
return value == "on" || value == "enabled" || value == "1";
}
static bool is_falsey(const std::string & value) {
bool common_arg_utils::is_falsey(const std::string & value) {
return value == "off" || value == "disabled" || value == "0";
}
static bool is_autoy(const std::string & value) {
bool common_arg_utils::is_autoy(const std::string & value) {
return value == "auto" || value == "-1";
}
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
// default values specific to example
// note: we place it here instead of inside server.cpp to allow llama-gen-docs to pick it up
if (ex == LLAMA_EXAMPLE_SERVER) {
params.use_jinja = true;
}
params.use_color = tty_can_use_colors();
// load dynamic backends
@ -790,7 +844,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.display_prompt = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-co", "--color"}, "[on|off|auto]",
"Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
@ -807,7 +861,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("error: unknown value for --color: '%s'\n", value.c_str()));
}
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
add_opt(common_arg(
{"-t", "--threads"}, "N",
string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
@ -940,7 +994,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-n", "--predict", "--n-predict"}, "N",
string_format(
ex == LLAMA_EXAMPLE_MAIN
ex == LLAMA_EXAMPLE_COMPLETION
? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
: "number of tokens to predict (default: %d, -1 = infinity)",
params.n_predict),
@ -984,7 +1038,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_ctx_checkpoints = value;
}
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--cache-ram", "-cram"}, "N",
string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
@ -992,7 +1046,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.cache_ram_mib = value;
}
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--kv-unified", "-kvu"},
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@ -1007,14 +1061,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.ctx_shift = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--context-shift"},
string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
[](common_params & params) {
params.ctx_shift = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--chunks"}, "N",
string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@ -1050,7 +1104,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@ -1059,6 +1113,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.no_perf = true;
}
).set_env("LLAMA_ARG_NO_PERF"));
add_opt(common_arg(
{"--no-show-timings"},
string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
[](common_params & params) {
params.show_timings = false;
}
).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
add_opt(common_arg(
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",
@ -1080,7 +1141,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.system_prompt.pop_back();
}
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--in-file"}, "FNAME",
"an input file (repeat to specify multiple files)",
@ -1128,42 +1189,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.n_print = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache"}, "FNAME",
"file to cache prompt state for faster startup (default: none)",
[](common_params & params, const std::string & value) {
params.path_prompt_cache = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache-all"},
"if specified, saves user input and generations to cache as well\n",
[](common_params & params) {
params.prompt_cache_all = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--prompt-cache-ro"},
"if specified, uses the prompt cache but does not update it",
[](common_params & params) {
params.prompt_cache_ro = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-r", "--reverse-prompt"}, "PROMPT",
"halt generation at PROMPT, return control in interactive mode\n",
[](common_params & params, const std::string & value) {
params.antiprompt.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-sp", "--special"},
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
[](common_params & params) {
params.special = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-cnv", "--conversation"},
"run in conversation mode:\n"
@ -1173,14 +1234,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-no-cnv", "--no-conversation"},
"force disable conversation mode (default: false)",
[](common_params & params) {
params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-st", "--single-turn"},
"run conversation for a single turn only, then exit when done\n"
@ -1189,28 +1250,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.single_turn = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-i", "--interactive"},
string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
[](common_params & params) {
params.interactive = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-if", "--interactive-first"},
string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
[](common_params & params) {
params.interactive_first = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-mli", "--multiline-input"},
"allows you to write or paste multiple lines without ending each in '\\'",
[](common_params & params) {
params.multiline_input = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--in-prefix-bos"},
"prefix BOS to user inputs, preceding the `--in-prefix` string",
@ -1218,7 +1279,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_prefix_bos = true;
params.enable_chat_template = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--in-prefix"}, "STRING",
"string to prefix user inputs with (default: empty)",
@ -1226,7 +1287,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_prefix = value;
params.enable_chat_template = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--in-suffix"}, "STRING",
"string to suffix after user inputs with (default: empty)",
@ -1234,14 +1295,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.input_suffix = value;
params.enable_chat_template = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"--no-warmup"},
"skip warming up the model with an empty run",
[](common_params & params) {
params.warmup = false;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
@ -1632,14 +1693,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.grp_attn_n = value;
}
).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
add_opt(common_arg(
{"-gaw", "--grp-attn-w"}, "N",
string_format("group-attention width (default: %d)", params.grp_attn_w),
[](common_params & params, int value) {
params.grp_attn_w = value;
}
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
{"-nkvo", "--no-kv-offload"},
"disable KV offload",
@ -1829,7 +1890,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_MTMD}));
).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--image-min-tokens"}, "N",
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@ -1922,7 +1983,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--cpu-moe", "-cmoe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
@ -1951,7 +2012,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"--n-cpu-moe-draft", "-ncmoed"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
@ -1965,7 +2026,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
@ -2444,7 +2505,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.default_template_kwargs[item.key()] = item.value().dump();
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
add_opt(common_arg(
{"-to", "--timeout"}, "N",
string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@ -2533,6 +2594,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.models_dir = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
add_opt(common_arg(
{"--models-preset"}, "PATH",
"path to INI file containing model presets for the router server (default: disabled)",
[](common_params & params, const std::string & value) {
params.models_preset = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_PRESET"));
add_opt(common_arg(
{"--models-max"}, "N",
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
@ -2549,18 +2617,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
add_opt(common_arg(
{"--jinja"},
string_format("use jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
[](common_params & params) {
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--no-jinja"},
string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
[](common_params & params) {
params.use_jinja = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@ -2571,7 +2639,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.reasoning_format = common_reasoning_format_from_name(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(
{"--reasoning-budget"}, "N",
"controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@ -2579,7 +2647,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
@ -2591,7 +2659,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
@ -2603,7 +2671,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.chat_template = read_file(value);
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
{"--no-prefill-assistant"},
string_format(
@ -2634,7 +2702,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.simple_io = true;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--positive-file"}, "FNAME",
string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@ -2717,7 +2785,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
[](common_params & params) {
params.verbosity = INT_MAX;
common_log_set_verbosity_thold(INT_MAX);
}
));
add_opt(common_arg(
@ -2738,7 +2805,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: %d)\n", params.verbosity),
[](common_params & params, int value) {
params.verbosity = value;
common_log_set_verbosity_thold(value);
}
).set_env("LLAMA_LOG_VERBOSITY"));
add_opt(common_arg(
@ -2871,14 +2937,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, int value) {
params.speculative.n_max = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
add_opt(common_arg(
{"--draft-min", "--draft-n-min"}, "N",
string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
[](common_params & params, int value) {
params.speculative.n_min = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
add_opt(common_arg(
{"--draft-p-split"}, "P",
string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@ -2892,14 +2958,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.speculative.p_min = std::stof(value);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
add_opt(common_arg(
{"-cd", "--ctx-size-draft"}, "N",
string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
[](common_params & params, int value) {
params.speculative.n_ctx = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
add_opt(common_arg(
{"-devd", "--device-draft"}, "<dev1,dev2,..>",
"comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@ -2907,7 +2973,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.speculative.devices = parse_device_list(value);
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
"number of layers to store in VRAM for the draft model",
@ -2919,21 +2985,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
add_opt(common_arg(
{"-md", "--model-draft"}, "FNAME",
"draft model for speculative decoding (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.model.path = value;
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
add_opt(common_arg(
{"--spec-replace"}, "TARGET", "DRAFT",
"translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
[](common_params & params, const std::string & tgt, const std::string & dft) {
params.speculative.replacements.push_back({ tgt, dft });
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"-ctkd", "--cache-type-k-draft"}, "TYPE",
string_format(
@ -3197,7 +3263,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_jinja = true;
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--gpt-oss-120b-default"},
@ -3216,7 +3282,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_jinja = true;
//params.default_template_kwargs["reasoning_effort"] = "\"high\"";
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--vision-gemma-4b-default"},
@ -3227,7 +3293,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_ctx = 0;
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
{"--vision-gemma-12b-default"},
@ -3238,7 +3304,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_ctx = 0;
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
return ctx_arg;
}

View File

@ -3,8 +3,10 @@
#include "common.h"
#include <set>
#include <map>
#include <string>
#include <vector>
#include <cstring>
//
// CLI argument parsing
@ -24,6 +26,8 @@ struct common_arg {
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (common_params & params, int) = nullptr;
common_arg() = default;
common_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
@ -61,9 +65,29 @@ struct common_arg {
bool is_exclude(enum llama_example ex);
bool get_value_from_env(std::string & output) const;
bool has_value_from_env() const;
std::string to_string();
std::string to_string() const;
// for using as key in std::map
bool operator<(const common_arg& other) const {
if (args.empty() || other.args.empty()) {
return false;
}
return strcmp(args[0], other.args[0]) < 0;
}
bool operator==(const common_arg& other) const {
if (args.empty() || other.args.empty()) {
return false;
}
return strcmp(args[0], other.args[0]) == 0;
}
};
namespace common_arg_utils {
bool is_truthy(const std::string & value);
bool is_falsey(const std::string & value);
bool is_autoy(const std::string & value);
}
struct common_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
common_params & params;
@ -76,7 +100,11 @@ struct common_params_context {
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// function to be used by test-arg-parser
// parse input arguments from CLI into a map
// TODO: support repeated args in the future
bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
// initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
struct common_remote_params {

View File

@ -82,7 +82,8 @@ int32_t cpu_get_num_math();
enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_COMPLETION,
LLAMA_EXAMPLE_CLI,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
@ -406,6 +407,7 @@ struct common_params {
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
bool no_perf = false; // disable performance metrics
bool show_timings = true; // show timing information on CLI
bool ctx_shift = false; // context shift on infinite text generation
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
bool kv_unified = false; // enable unified KV cache
@ -462,7 +464,7 @@ struct common_params {
std::string public_path = ""; // NOLINT
std::string api_prefix = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool use_jinja = true; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
@ -482,9 +484,10 @@ struct common_params {
bool endpoint_metrics = false;
// router server configs
std::string models_dir = ""; // directory containing models for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
std::string models_dir = ""; // directory containing models for the router server
std::string models_preset = ""; // directory containing model presets for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
bool log_json = false;

View File

@ -1,4 +1,5 @@
#include "console.h"
#include "log.h"
#include <vector>
#include <iostream>
#include <cassert>
@ -6,6 +7,10 @@
#include <cctype>
#include <cwctype>
#include <cstdint>
#include <condition_variable>
#include <mutex>
#include <thread>
#include <stdarg.h>
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
@ -35,6 +40,7 @@
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_GRAY "\x1b[90m"
#define ANSI_COLOR_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
@ -61,17 +67,17 @@ namespace console {
//
#endif
static bool advanced_display = false;
static bool simple_io = true;
static display_t current_display = reset;
static bool advanced_display = false;
static bool simple_io = true;
static display_type current_display = DISPLAY_TYPE_RESET;
static FILE* out = stdout;
static FILE* out = stdout;
#if defined (_WIN32)
static void* hConsole;
static void* hConsole;
#else
static FILE* tty = nullptr;
static termios initial_state;
static FILE* tty = nullptr;
static termios initial_state;
#endif
//
@ -142,7 +148,7 @@ namespace console {
void cleanup() {
// Reset console display
set_display(reset);
set_display(DISPLAY_TYPE_RESET);
#if !defined(_WIN32)
// Restore settings on POSIX systems
@ -162,20 +168,26 @@ namespace console {
//
// Keep track of current display and only emit ANSI code if it changes
void set_display(display_t display) {
void set_display(display_type display) {
if (advanced_display && current_display != display) {
fflush(stdout);
common_log_flush(common_log_main());
switch(display) {
case reset:
case DISPLAY_TYPE_RESET:
fprintf(out, ANSI_COLOR_RESET);
break;
case prompt:
case DISPLAY_TYPE_INFO:
fprintf(out, ANSI_COLOR_MAGENTA);
break;
case DISPLAY_TYPE_PROMPT:
fprintf(out, ANSI_COLOR_YELLOW);
break;
case user_input:
case DISPLAY_TYPE_REASONING:
fprintf(out, ANSI_COLOR_GRAY);
break;
case DISPLAY_TYPE_USER_INPUT:
fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
break;
case error:
case DISPLAY_TYPE_ERROR:
fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
}
current_display = display;
@ -778,7 +790,6 @@ namespace console {
}
if (is_special_char) {
set_display(user_input);
replace_last(line.back());
is_special_char = false;
}
@ -961,7 +972,6 @@ namespace console {
}
if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
set_display(prompt);
replace_last(line.back());
is_special_char = true;
}
@ -1046,12 +1056,82 @@ namespace console {
}
bool readline(std::string & line, bool multiline_input) {
set_display(user_input);
if (simple_io) {
return readline_simple(line, multiline_input);
}
return readline_advanced(line, multiline_input);
}
namespace spinner {
static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
static std::condition_variable cv_stop;
static std::thread th;
static size_t frame = 0; // only modified by one thread
static bool running = false;
static std::mutex mtx;
static auto wait_time = std::chrono::milliseconds(100);
static void draw_next_frame() {
// don't need lock because only one thread modifies running
frame = (frame + 1) % sizeof(LOADING_CHARS);
replace_last(LOADING_CHARS[frame]);
fflush(out);
}
void start() {
std::unique_lock<std::mutex> lock(mtx);
if (simple_io || running) {
return;
}
common_log_flush(common_log_main());
fprintf(out, "%c", LOADING_CHARS[0]);
fflush(out);
frame = 1;
running = true;
th = std::thread([]() {
std::unique_lock<std::mutex> lock(mtx);
while (true) {
if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
break;
}
draw_next_frame();
}
});
}
void stop() {
{
std::unique_lock<std::mutex> lock(mtx);
if (simple_io || !running) {
return;
}
running = false;
cv_stop.notify_all();
}
if (th.joinable()) {
th.join();
}
replace_last(' ');
pop_cursor();
fflush(out);
}
}
void log(const char * fmt, ...) {
va_list args;
va_start(args, fmt);
vfprintf(out, fmt, args);
va_end(args);
}
void error(const char * fmt, ...) {
va_list args;
va_start(args, fmt);
display_type cur = current_display;
set_display(DISPLAY_TYPE_ERROR);
vfprintf(out, fmt, args);
set_display(cur); // restore previous color
va_end(args);
}
void flush() {
fflush(out);
}
}

View File

@ -2,18 +2,40 @@
#pragma once
#include "common.h"
#include <string>
namespace console {
enum display_t {
reset = 0,
prompt,
user_input,
error
};
enum display_type {
DISPLAY_TYPE_RESET = 0,
DISPLAY_TYPE_INFO,
DISPLAY_TYPE_PROMPT,
DISPLAY_TYPE_REASONING,
DISPLAY_TYPE_USER_INPUT,
DISPLAY_TYPE_ERROR
};
namespace console {
void init(bool use_simple_io, bool use_advanced_display);
void cleanup();
void set_display(display_t display);
void set_display(display_type display);
bool readline(std::string & line, bool multiline_input);
namespace spinner {
void start();
void stop();
}
// note: the logging API below output directly to stdout
// it can negatively impact performance if used on inference thread
// only use in in a dedicated CLI thread
// for logging in inference thread, use log.h instead
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
void log(const char * fmt, ...);
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
void error(const char * fmt, ...);
void flush();
}

View File

@ -420,6 +420,11 @@ void common_log_set_timestamps(struct common_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}
void common_log_flush(struct common_log * log) {
log->pause();
log->resume();
}
static int common_get_verbosity(enum ggml_log_level level) {
switch (level) {
case GGML_LOG_LEVEL_DEBUG: return LOG_LEVEL_DEBUG;

View File

@ -84,6 +84,7 @@ void common_log_set_file (struct common_log * log, const char * file); // n
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
void common_log_flush (struct common_log * log); // flush all pending log messages
// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold

180
common/preset.cpp Normal file
View File

@ -0,0 +1,180 @@
#include "arg.h"
#include "preset.h"
#include "peg-parser.h"
#include "log.h"
#include <fstream>
#include <sstream>
#include <filesystem>
static std::string rm_leading_dashes(const std::string & str) {
size_t pos = 0;
while (pos < str.size() && str[pos] == '-') {
++pos;
}
return str.substr(pos);
}
std::vector<std::string> common_preset::to_args() const {
std::vector<std::string> args;
for (const auto & [opt, value] : options) {
args.push_back(opt.args.back()); // use the last arg as the main arg
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
// flag option, no value
if (common_arg_utils::is_falsey(value)) {
// skip the flag
args.pop_back();
}
}
if (opt.value_hint != nullptr) {
// single value
args.push_back(value);
}
if (opt.value_hint != nullptr && opt.value_hint_2 != nullptr) {
throw std::runtime_error(string_format(
"common_preset::to_args(): option '%s' has two values, which is not supported yet",
opt.args.back()
));
}
}
return args;
}
std::string common_preset::to_ini() const {
std::ostringstream ss;
ss << "[" << name << "]\n";
for (const auto & [opt, value] : options) {
auto espaced_value = value;
string_replace_all(espaced_value, "\n", "\\\n");
ss << rm_leading_dashes(opt.args.back()) << " = ";
ss << espaced_value << "\n";
}
ss << "\n";
return ss.str();
}
static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_file(const std::string & path) {
std::map<std::string, std::map<std::string, std::string>> parsed;
if (!std::filesystem::exists(path)) {
throw std::runtime_error("preset file does not exist: " + path);
}
std::ifstream file(path);
if (!file.good()) {
throw std::runtime_error("failed to open server preset file: " + path);
}
std::string contents((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
static const auto parser = build_peg_parser([](auto & p) {
// newline ::= "\r\n" / "\n" / "\r"
auto newline = p.rule("newline", p.literal("\r\n") | p.literal("\n") | p.literal("\r"));
// ws ::= [ \t]*
auto ws = p.rule("ws", p.chars("[ \t]", 0, -1));
// comment ::= [;#] (!newline .)*
auto comment = p.rule("comment", p.chars("[;#]", 1, 1) + p.zero_or_more(p.negate(newline) + p.any()));
// eol ::= ws comment? (newline / EOF)
auto eol = p.rule("eol", ws + p.optional(comment) + (newline | p.end()));
// ident ::= [a-zA-Z_] [a-zA-Z0-9_.-]*
auto ident = p.rule("ident", p.chars("[a-zA-Z_]", 1, 1) + p.chars("[a-zA-Z0-9_.-]", 0, -1));
// value ::= (!eol-start .)*
auto eol_start = p.rule("eol-start", ws + (p.chars("[;#]", 1, 1) | newline | p.end()));
auto value = p.rule("value", p.zero_or_more(p.negate(eol_start) + p.any()));
// header-line ::= "[" ws ident ws "]" eol
auto header_line = p.rule("header-line", "[" + ws + p.tag("section-name", p.chars("[^]]")) + ws + "]" + eol);
// kv-line ::= ident ws "=" ws value eol
auto kv_line = p.rule("kv-line", p.tag("key", ident) + ws + "=" + ws + p.tag("value", value) + eol);
// comment-line ::= ws comment (newline / EOF)
auto comment_line = p.rule("comment-line", ws + comment + (newline | p.end()));
// blank-line ::= ws (newline / EOF)
auto blank_line = p.rule("blank-line", ws + (newline | p.end()));
// line ::= header-line / kv-line / comment-line / blank-line
auto line = p.rule("line", header_line | kv_line | comment_line | blank_line);
// ini ::= line* EOF
auto ini = p.rule("ini", p.zero_or_more(line) + p.end());
return ini;
});
common_peg_parse_context ctx(contents);
const auto result = parser.parse(ctx);
if (!result.success()) {
throw std::runtime_error("failed to parse server config file: " + path);
}
std::string current_section = COMMON_PRESET_DEFAULT_NAME;
std::string current_key;
ctx.ast.visit(result, [&](const auto & node) {
if (node.tag == "section-name") {
const std::string section = std::string(node.text);
current_section = section;
parsed[current_section] = {};
} else if (node.tag == "key") {
const std::string key = std::string(node.text);
current_key = key;
} else if (node.tag == "value" && !current_key.empty() && !current_section.empty()) {
parsed[current_section][current_key] = std::string(node.text);
current_key.clear();
}
});
return parsed;
}
static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
std::map<std::string, common_arg> mapping;
for (const auto & opt : ctx_params.options) {
if (opt.env != nullptr) {
mapping[opt.env] = opt;
}
for (const auto & arg : opt.args) {
mapping[rm_leading_dashes(arg)] = opt;
}
}
return mapping;
}
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
common_presets out;
auto key_to_opt = get_map_key_opt(ctx_params);
auto ini_data = parse_ini_from_file(path);
for (auto section : ini_data) {
common_preset preset;
if (section.first.empty()) {
preset.name = COMMON_PRESET_DEFAULT_NAME;
} else {
preset.name = section.first;
}
LOG_DBG("loading preset: %s\n", preset.name.c_str());
for (const auto & [key, value] : section.second) {
LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
if (key_to_opt.find(key) != key_to_opt.end()) {
preset.options[key_to_opt[key]] = value;
LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str());
} else {
// TODO: maybe warn about unknown key?
}
}
out[preset.name] = preset;
}
return out;
}

32
common/preset.h Normal file
View File

@ -0,0 +1,32 @@
#pragma once
#include "common.h"
#include "arg.h"
#include <string>
#include <vector>
#include <map>
//
// INI preset parser and writer
//
constexpr const char * COMMON_PRESET_DEFAULT_NAME = "default";
struct common_preset {
std::string name;
// TODO: support repeated args in the future
std::map<common_arg, std::string> options;
// convert preset to CLI argument list
std::vector<std::string> to_args() const;
// convert preset to INI format string
std::string to_ini() const;
// TODO: maybe implement to_env() if needed
};
// interface for multiple presets in one file
using common_presets = std::map<std::string, common_preset>;
common_presets common_presets_load(const std::string & path, common_params_context & ctx_params);

View File

@ -16,12 +16,12 @@ Legend:
|-----------|------|------|------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | | ✅ | ✅ | ✅ | ❌ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | | ❌ | ✅ | ❌ | ❌ | ❌ |
| ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | | ❌ | ✅ | ❌ | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ❌ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
@ -39,13 +39,13 @@ Legend:
| CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | | ✅ | ✅ | ✅ | ❌ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | | ❌ | 🟡 | ❌ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@ -65,8 +65,8 @@ Legend:
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ❌ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | | ✅ | ✅ | ✅ | ❌ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@ -75,7 +75,7 @@ Legend:
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | | 🟡 | ✅ | ❌ | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
@ -84,7 +84,7 @@ Legend:
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | | ❌ | ❌ | ❌ | ❌ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | ❌ | | ❌ | ❌ | ❌ | ❌ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
@ -104,20 +104,20 @@ Legend:
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | | ✅ | ✅ | ❌ | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | | ✅ | ✅ | ❌ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | | ✅ | ✅ | ✅ | ❌ | ❌ |
| SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | | ❌ | 🟡 | ✅ | ❌ | ❌ |
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | | ❌ | 🟡 | ✅ | ❌ | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |

File diff suppressed because it is too large Load Diff

View File

@ -76,7 +76,7 @@ static void export_md(std::string fname, llama_example ex) {
}
int main(int, char **) {
export_md("autogen-main.md", LLAMA_EXAMPLE_MAIN);
export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION);
export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER);
return 0;

View File

@ -2305,13 +2305,11 @@ extern "C" {
float stop,
float step);
#define GGML_KQ_MASK_PAD 1
// q: [n_embd_k, n_batch, n_head, ne3 ]
// k: [n_embd_k, n_kv, n_head_kv, ne3 ]
// v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
// mask: [n_kv, n_batch_pad, ne32, ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
// res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
// q: [n_embd_k, n_batch, n_head, ne3 ]
// k: [n_embd_k, n_kv, n_head_kv, ne3 ]
// v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !!
// mask: [n_kv, n_batch, ne32, ne33]
// res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !!
//
// broadcast:
// n_head % n_head_kv == 0

View File

@ -187,6 +187,9 @@ typedef void * thread_ret_t;
typedef pthread_t ggml_thread_t;
#define GGML_THREADPOOL_N_THREADS_MASK (0xffffU)
#define GGML_THREADPOOL_N_THREADS_BITS (16)
#if defined(__APPLE__)
#include <unistd.h>
#include <mach/mach.h>
@ -449,7 +452,7 @@ struct ggml_threadpool {
struct ggml_cplan * cplan;
// synchronization primitives
atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
atomic_int n_graph; // updated when there is work to be done (i.e each graph) holds graph and active thread counts.
atomic_int GGML_CACHE_ALIGN n_barrier;
atomic_int GGML_CACHE_ALIGN n_barrier_passed;
atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
@ -457,12 +460,10 @@ struct ggml_threadpool {
// these are atomic as an annotation for thread-sanitizer
atomic_bool stop; // Used for stopping the threadpool altogether
atomic_bool pause; // Used for pausing the threadpool or individual threads
atomic_int abort; // Used for aborting processing of a graph
atomic_int abort; // Used for aborting processing of a graph
struct ggml_compute_state * workers; // per thread state
int n_threads_max; // number of threads in the pool
atomic_int n_threads_cur; // number of threads used in the current graph
int n_threads; // Number of threads in the pool
int32_t prio; // Scheduling priority
uint32_t poll; // Polling level (0 - no polling)
@ -539,7 +540,7 @@ struct ggml_state {
static struct ggml_state g_state = {0};
void ggml_barrier(struct ggml_threadpool * tp) {
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
int n_threads = atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK;
if (n_threads == 1) {
return;
}
@ -556,7 +557,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
// last thread
atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
// exit barrier (fill seq-cst fence)
// exit barrier (full seq-cst fence)
atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst);
return;
}
@ -2628,7 +2629,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
void ggml_threadpool_free(struct ggml_threadpool* threadpool) {
if (!threadpool) return;
const int n_threads = threadpool->n_threads_max;
const int n_threads = threadpool->n_threads;
#ifndef GGML_USE_OPENMP
struct ggml_compute_state* workers = threadpool->workers;
@ -2704,7 +2705,7 @@ struct ggml_cplan ggml_graph_plan(
//GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
}
if (n_threads <= 0) {
n_threads = threadpool ? threadpool->n_threads_max : GGML_DEFAULT_N_THREADS;
n_threads = threadpool ? threadpool->n_threads : GGML_DEFAULT_N_THREADS;
}
#if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__)
@ -2912,12 +2913,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_params params = {
/*.ith =*/ state->ith,
/*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
/*.nth =*/ atomic_load_explicit(&tp->n_graph, memory_order_relaxed) & GGML_THREADPOOL_N_THREADS_MASK,
/*.wsize =*/ cplan->work_size,
/*.wdata =*/ cplan->work_data,
/*.threadpool=*/ tp,
};
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];
@ -2939,6 +2942,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
}
GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
ggml_barrier(state->threadpool);
return 0;
@ -2946,27 +2951,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
#ifndef GGML_USE_OPENMP
// check if thread is active
static inline bool ggml_graph_compute_thread_active(struct ggml_compute_state * state) {
struct ggml_threadpool * threadpool = state->threadpool;
int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
return (state->ith < n_threads);
}
// check if thread is ready to proceed (exit from polling or sleeping)
// returns true if loops should exit, sets state->pending to indicate new work
static inline bool ggml_graph_compute_thread_ready(struct ggml_compute_state * state) {
struct ggml_threadpool * threadpool = state->threadpool;
if (state->pending || threadpool->stop || threadpool->pause) { return true; }
// check for new graph/work
int new_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
if (new_graph != state->last_graph) {
state->pending = ggml_graph_compute_thread_active(state);
state->last_graph = new_graph;
int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed);
int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK;
if (n_graph != state->last_graph) {
state->pending = (state->ith < n_threads);
state->last_graph = n_graph;
return true;
}
return state->pending;
return false;
}
// sync thread state after polling
@ -2983,11 +2984,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
struct ggml_threadpool * threadpool = state->threadpool;
// Skip polling for unused threads
if (!ggml_graph_compute_thread_active(state)) {
return state->pending;
}
// This seems to make 0 ... 100 a decent range for polling level across modern processors.
// Perhaps, we can adjust it dynamically based on load and things.
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
@ -3049,7 +3045,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
ggml_graph_compute_check_for_work(state);
if (state->pending) {
state->pending = false;
ggml_graph_compute_thread(state);
}
}
@ -3064,14 +3059,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
ggml_mutex_lock(&threadpool->mutex);
GGML_PRINT_DEBUG("threadpool: n_threads_cur %d n_threads %d\n", threadpool->n_threads_cur, n_threads);
// Update the number of active threads and the graph count
int n_graph = atomic_load_explicit(&threadpool->n_graph, memory_order_relaxed) >> GGML_THREADPOOL_N_THREADS_BITS;
n_graph = ((n_graph + 1) << GGML_THREADPOOL_N_THREADS_BITS) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK);
// Update the number of active threads
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
GGML_PRINT_DEBUG("compute-kickoff: n_threads %d n_graph %d\n", n_threads, n_graph);
// Indicate the graph is ready to be processed
// We need the full seq-cst fence here because of the polling threads (used in thread_sync)
atomic_fetch_add_explicit(&threadpool->n_graph, 1, memory_order_seq_cst);
atomic_store_explicit(&threadpool->n_graph, n_graph, memory_order_seq_cst);
if (threadpool->pause) {
// Update main thread prio and affinity to match the threadpool settings
@ -3109,8 +3105,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
threadpool->pause = tpp->paused;
threadpool->abort = -1;
threadpool->workers = NULL;
threadpool->n_threads_max = tpp->n_threads;
threadpool->n_threads_cur = tpp->n_threads;
threadpool->n_threads = tpp->n_threads;
threadpool->poll = tpp->poll;
threadpool->prio = tpp->prio;
threadpool->ec = GGML_STATUS_SUCCESS;
@ -3205,7 +3200,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
{
// update the number of threads from the actual number of threads that we got from OpenMP
n_threads = omp_get_num_threads();
atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed);
atomic_store_explicit(&threadpool->n_graph, n_threads, memory_order_relaxed);
}
// Apply thread CPU mask and priority
@ -3218,13 +3213,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
ggml_graph_compute_thread(&threadpool->workers[ith]);
}
} else {
atomic_store_explicit(&threadpool->n_threads_cur, 1, memory_order_relaxed);
atomic_store_explicit(&threadpool->n_graph, 1, memory_order_relaxed);
ggml_graph_compute_thread(&threadpool->workers[0]);
}
#else
if (n_threads > threadpool->n_threads_max) {
GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
n_threads = threadpool->n_threads_max;
if (n_threads > threadpool->n_threads) {
GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads);
n_threads = threadpool->n_threads;
}
// Kick all threads to start the new graph

View File

@ -4313,6 +4313,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_EXPM1:
case GGML_UNARY_OP_SOFTPLUS:
case GGML_UNARY_OP_ELU:
case GGML_UNARY_OP_XIELU:
case GGML_UNARY_OP_FLOOR:
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_ROUND:

View File

@ -73,15 +73,15 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
return (1 - MIN(1, MAX(0, y)));
}
static void rope_cache_init(const float theta_base,
float freq_scale,
const float * freq_factors,
float * corr_dims,
uint32_t ne0,
float ext_factor,
float mscale,
float * cache,
float theta_scale) {
static void rope_cache_init(const float theta_base,
const float freq_scale,
const float * freq_factors,
float * corr_dims,
const uint32_t ne0,
const float ext_factor,
const float mscale,
float * cache,
const float theta_scale) {
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
float theta = theta_base;
@ -92,18 +92,19 @@ static void rope_cache_init(const float theta_base,
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap;
float theta2 = theta_interp;
float theta_final = theta_interp;
float mscale_final = mscale;
if (ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
theta2 = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
theta_final = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
mscale_final *= 1.0f + 0.1f * logf(1.0f / freq_scale);
}
cache[i0 + 0] = cosf(theta2) * mscale;
cache[i0 + 1] = sinf(theta2) * mscale;
cache[i0 + 0] = cosf(theta_final) * mscale_final;
cache[i0 + 1] = sinf(theta_final) * mscale_final;
theta *= theta_scale;
}
@ -151,9 +152,9 @@ static void init_rope_ctx(struct rope_th_ctx * rope_ctx, struct htp_ops_context
}
static void hvx_calc_rope_neox_f32(const float * restrict src0,
float * restrict dst,
const int num_elems,
const float * restrict theta_cache) {
float * restrict dst,
const int num_elems,
const float * restrict theta_cache) {
// for (int i = 0; i < num_elems; i += 2) {
//const float cos_theta = theta_cache[i + 0];
//const float sin_theta = theta_cache[i + 1];
@ -192,7 +193,7 @@ static void hvx_calc_rope_neox_f32(const float * restrict src0,
HVX_Vector v4 = Q6_Vqf32_vsub_Vqf32Vqf32(vx0_c, vx1_s);
HVX_Vector v5 = Q6_Vqf32_vadd_Vqf32Vqf32(vx0_s, vx1_c);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
*(HVX_Vector *) dst_curr = Q6_Vsf_equals_Vqf32(v4);
*(HVX_Vector *) (dst_curr + half_size) = Q6_Vsf_equals_Vqf32(v5);
src0_curr += VLEN;
@ -259,7 +260,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
const uint32_t ir1,
int nth,
int ith,
int opt_path) {
const int opt_path) {
struct htp_ops_context * octx = rope_ctx->octx;
const struct htp_tensor * src0 = &octx->src0;
@ -267,8 +268,8 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
const struct htp_tensor * src2 = &octx->src2;
struct htp_tensor * dst = &octx->dst;
const int32_t mode = rope_ctx->mode;
const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
const int32_t mode = rope_ctx->mode;
const bool is_neox = mode & HTP_ROPE_TYPE_NEOX;
htp_rope_preamble;
@ -281,8 +282,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
freq_factors = (const float *) src2->data;
}
int ir = 0;
const uint32_t i1_end = MIN(ir1, ne1);
const int32_t half_dims = rope_ctx->n_dims / 2;
const size_t remain_bytes = (ne0 - rope_ctx->n_dims) * sizeof(float);
for (uint32_t i3 = 0; i3 < ne3; i3++) { // batch
for (uint32_t i2 = 0; i2 < ne2; i2++) { // seq-len
const int32_t p = pos[i2];
@ -290,14 +292,7 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
rope_cache_init(p, rope_ctx->freq_scale, freq_factors, rope_ctx->corr_dims, ne0, rope_ctx->ext_factor,
rope_ctx->attn_factor, wp0, rope_ctx->theta_scale);
for (uint32_t i1 = 0; i1 < ne1; i1++) { // attn-heads
if (ir++ < ir0) {
continue;
}
if (ir > ir1) {
break;
}
for (uint32_t i1 = ir0; i1 < i1_end; i1++) { // attn-heads
const float * src = (float *) ((char *) src0->data + i3 * nb03 + i2 * nb02 + i1 * nb01);
float * dst_data = (float *) ((char *) dst->data + i3 * nb3 + i2 * nb2 + i1 * nb1);
@ -310,6 +305,9 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
} else {
hvx_calc_rope_f32(src_loc, dst_data_loc, rope_ctx->n_dims, wp0);
}
src_loc += rope_ctx->n_dims;
dst_data_loc += rope_ctx->n_dims;
} else {
for (uint32_t i0 = 0; i0 < rope_ctx->n_dims; i0 += 2) {
const float cos_theta = wp0[i0 + 0];
@ -317,10 +315,10 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
if (is_neox) {
const float x0 = src_loc[0];
const float x1 = src_loc[rope_ctx->n_dims/2];
const float x1 = src_loc[half_dims];
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
dst_data_loc[rope_ctx->n_dims/2] = x0 * sin_theta + x1 * cos_theta;
dst_data_loc[0] = x0 * cos_theta - x1 * sin_theta;
dst_data_loc[half_dims] = x0 * sin_theta + x1 * cos_theta;
src_loc += 1;
dst_data_loc += 1;
@ -335,15 +333,13 @@ static void rope_hex_f32(struct rope_th_ctx * rope_ctx,
dst_data_loc += 2;
}
}
src_loc += (is_neox ? half_dims : 0);
dst_data_loc += (is_neox ? half_dims : 0);
}
for (uint32_t i0 = rope_ctx->n_dims; i0 < ne0; i0 += 2) {
dst_data_loc[0] = src_loc[0];
dst_data_loc[1] = src_loc[1];
src_loc += 2;
dst_data_loc += 2;
}
// TODO: use simd to speed up the remaining elements copy
memcpy(dst_data_loc, src_loc, remain_bytes);
}
}
}

View File

@ -5260,8 +5260,6 @@ struct ggml_tensor * ggml_flash_attn_ext(
if (mask) {
GGML_ASSERT(ggml_is_contiguous(mask));
GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
"the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
//GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
GGML_ASSERT(q->ne[2] % mask->ne[2] == 0);

View File

@ -46,7 +46,7 @@ adb $adbserial shell " \
LD_LIBRARY_PATH=$basedir/$branch/lib \
ADSP_LIBRARY_PATH=$basedir/$branch/lib \
$verbose $experimental $sched $opmask $profile $nhvx $ndev \
./$branch/bin/llama-cli --no-mmap -m $basedir/../gguf/$model \
./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \
--poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \
--ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on \
-ngl 99 --device $device $cli_opts $@ \

View File

@ -93,14 +93,6 @@ llama_context::llama_context(
// with causal attention, the batch size is limited by the context size
cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
// ref: https://github.com/ggerganov/llama.cpp/pull/5021
// TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
if (cparams.n_batch < GGML_KQ_MASK_PAD) {
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
cparams.n_batch = GGML_KQ_MASK_PAD;
}
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
cparams.op_offload = params.op_offload;

View File

@ -385,7 +385,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
//res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_kq_mask->ne[0] == mctx->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
return res;
}
@ -416,10 +416,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
//res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
return res;
}
@ -452,7 +452,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
}
}
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
for (int i = n_tokens; i < n_tokens; ++i) {
for (int j = 0; j < n_enc; ++j) {
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
}
@ -1470,13 +1470,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
ggml_set_input(inp->self_kq_mask_swa);
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
@ -1558,7 +1558,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@ -1701,7 +1701,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
ggml_set_input(inp->cross_kq_mask);
inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
@ -1767,7 +1767,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@ -1781,7 +1781,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
ggml_set_input(inp->self_kq_mask_swa);
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;

View File

@ -1232,8 +1232,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
GGML_ASSERT(n_tokens%n_stream == 0);
// n_tps == n_tokens_per_stream
const int64_t n_tps = n_tokens/n_stream;
const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
const int64_t n_tps = n_tokens/n_stream;
std::fill(data, data + ggml_nelements(dst), -INFINITY);
@ -1266,7 +1265,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
for (uint32_t j = 0; j < n_kv; ++j) {
if (cells.is_empty(j)) {

View File

@ -120,6 +120,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_16B_A1B: return "16B.A1B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_80B_A3B: return "80B.A3B";
case LLM_TYPE_100B_A6B: return "100B.A6B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_230B_A10B: return "230B.A10B";
@ -2257,7 +2258,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
switch (hparams.n_layer) {
case 80: type = LLM_TYPE_80B_A3B; break;
case 48: type = LLM_TYPE_80B_A3B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;

View File

@ -5875,7 +5875,7 @@ struct test_flash_attn_ext : public test_case {
ggml_tensor * m = nullptr;
if (mask) {
m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, nr23[1]);
m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, nb, 1, nr23[1]);
ggml_set_name(m, "m");
}

View File

@ -11,19 +11,7 @@
#define MAX_NARGS 2
int main(int argc, char *argv[]) {
int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
int n_rounds = 100;
if (argc > 1) {
n_threads = std::atoi(argv[1]);
}
if (argc > 2) {
n_rounds = std::atoi(argv[2]);
}
static void test_barrier(int n_threads, int n_rounds) {
struct ggml_init_params params = {
/* .mem_size = */ 1024*1024*1024,
/* .mem_buffer = */ NULL,
@ -56,7 +44,7 @@ int main(int argc, char *argv[]) {
exit(1);
}
// Create compute plan
// The test runs with constant number of threads
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
std::vector<uint8_t> work_data(cplan.work_size);
@ -89,6 +77,160 @@ int main(int argc, char *argv[]) {
ggml_threadpool_free(threadpool);
ggml_free(ctx);
}
static void test_active(int n_threads, int n_rounds) {
struct ggml_init_params params = {
/* .mem_size = */ 1024*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(params);
// Create graph
struct ggml_cgraph * gf = ggml_new_graph(ctx);
// Small graph with, parallel ops with barriers
struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
for (int i = 0; i < 2; i++) {
struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
out = ggml_mul_mat(ctx, a, out);
struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
out = ggml_mul_mat(ctx, d, out);
}
ggml_build_forward_expand(gf, out);
int n_nodes = ggml_graph_n_nodes(gf);
// Create threadpool
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
exit(1);
}
std::cerr << "graph-compute with"
<< "\n n_threads: " << n_threads
<< "\n n_nodes: " << n_nodes
<< "\n n_rounds: " << n_rounds
<< "\n";
// ggml_graph_print(gf);
// In this test we keep changing the number of threads every 4th iteration
// to test for race conditions in that path
for (int i=0; i < n_rounds; i++) {
struct ggml_cplan cplan = ggml_graph_plan(gf, (i % 4) == 0 ? 1 : n_threads, threadpool);
std::vector<uint8_t> work_data(cplan.work_size);
cplan.work_data = work_data.data();
ggml_graph_compute(gf, &cplan);
}
ggml_threadpool_free(threadpool);
ggml_free(ctx);
}
static void test_multi_graph(int n_threads, int n_rounds) {
struct ggml_init_params params = {
/* .mem_size = */ 1024*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(params);
// Create graphs
struct ggml_cgraph * gf0 = ggml_new_graph(ctx);
{
// Small graph with parallel ops with barriers
struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
for (int i = 0; i < 2; i++) {
struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
out = ggml_mul_mat(ctx, a, out);
struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
out = ggml_mul_mat(ctx, d, out);
}
ggml_build_forward_expand(gf0, out);
}
struct ggml_cgraph * gf1 = ggml_new_graph(ctx);
{
// Small graph with parallel ops with barriers
// Use larger tensors to make sure work_data size is larger than gf0
struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);
for (int i = 0; i < 4; i++) {
struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 256, 128);
out = ggml_mul_mat(ctx, a, out);
struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 256);
out = ggml_mul_mat(ctx, d, out);
}
ggml_build_forward_expand(gf1, out);
}
// Create threadpool
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
if (!threadpool) {
fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
exit(1);
}
std::cerr << "graph-compute with"
<< "\n gf0 n_nodes: " << ggml_graph_n_nodes(gf0)
<< "\n gf1 n_nodes: " << ggml_graph_n_nodes(gf1)
<< "\n n_threads: " << n_threads
<< "\n n_rounds: " << n_rounds
<< "\n";
// In this test we keep changing the number of threads every 4th iteration
// and we compute two graphs back to back to test graph frequent graph switching
for (int i=0; i < n_rounds; i++) {
struct ggml_cplan cplan0 = ggml_graph_plan(gf0, (i % 4) == 0 ? 1 : n_threads, threadpool);
std::vector<uint8_t> work_data0(cplan0.work_size);
cplan0.work_data = work_data0.data();
struct ggml_cplan cplan1 = ggml_graph_plan(gf1, (i % 4) == 0 ? 1 : n_threads, threadpool);
std::vector<uint8_t> work_data1(cplan1.work_size);
cplan1.work_data = work_data1.data();
ggml_graph_compute(gf0, &cplan0);
ggml_graph_compute(gf1, &cplan1);
}
ggml_threadpool_free(threadpool);
ggml_free(ctx);
}
int main(int argc, char *argv[]) {
int n_threads = std::max(1, std::min(4, (int) std::thread::hardware_concurrency()));
int n_rounds = 100;
if (argc > 1) {
n_threads = std::atoi(argv[1]);
}
if (argc > 2) {
n_rounds = std::atoi(argv[2]);
}
test_barrier(n_threads, n_rounds);
test_active(n_threads, n_rounds * 100);
test_multi_graph(n_threads, n_rounds * 10);
return 0;
}

View File

@ -79,19 +79,19 @@ run_conversion_and_inference_lora() {
# Run inference
echo -e "\n\n---------------------------\n\n"
echo "Running llama-cli without lora for $model_name with hidden_size $hidden_size..."
OUTPUT_BASE=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
echo "Running llama-completion without lora for $model_name with hidden_size $hidden_size..."
OUTPUT_BASE=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
-p "$EXPECTED_BASE_FIRST_WORD" -n 50 --seed 42 --temp 0)
echo -e "\n\n---------------------------\n\n"
echo "Running llama-cli with hot lora for $model_name with hidden_size $hidden_size..."
OUTPUT_LORA_HOT=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
echo "Running llama-completion with hot lora for $model_name with hidden_size $hidden_size..."
OUTPUT_LORA_HOT=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32.gguf \
--lora $MODELS_REPO/$model_name/hidden_size=$hidden_size/lora/Lora-F32-LoRA.gguf \
-p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)
echo -e "\n\n---------------------------\n\n"
echo "Running llama-cli with merged lora for $model_name with hidden_size $hidden_size..."
OUTPUT_LORA_MERGED=$(./llama-cli -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
echo "Running llama-completion with merged lora for $model_name with hidden_size $hidden_size..."
OUTPUT_LORA_MERGED=$(./llama-completion -no-cnv -m $MODELS_REPO/$model_name/hidden_size=$hidden_size/base/Base-F32-lora-merged.gguf \
-p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)
# Remove any initial white space

View File

@ -18,7 +18,8 @@ else()
add_subdirectory(gguf-split)
add_subdirectory(imatrix)
add_subdirectory(llama-bench)
add_subdirectory(main)
add_subdirectory(cli)
add_subdirectory(completion)
add_subdirectory(perplexity)
add_subdirectory(quantize)
if (LLAMA_BUILD_SERVER)

10
tools/cli/CMakeLists.txt Normal file
View File

@ -0,0 +1,10 @@
set(TARGET llama-cli)
add_executable(${TARGET} cli.cpp)
target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
include_directories(../server)
if(LLAMA_TOOLS_INSTALL)
install(TARGETS ${TARGET} RUNTIME)
endif()

395
tools/cli/cli.cpp Normal file
View File

@ -0,0 +1,395 @@
#include "common.h"
#include "arg.h"
#include "console.h"
// #include "log.h"
#include "server-context.h"
#include "server-task.h"
#include <atomic>
#include <fstream>
#include <thread>
#include <signal.h>
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#endif
const char * LLAMA_ASCII_LOGO = R"(
)";
static std::atomic<bool> g_is_interrupted = false;
static bool should_stop() {
return g_is_interrupted.load();
}
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void signal_handler(int) {
if (g_is_interrupted.load()) {
// second Ctrl+C - exit immediately
// make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock)
fprintf(stdout, "\033[0m\n");
fflush(stdout);
std::exit(130);
}
g_is_interrupted.store(true);
}
#endif
struct cli_context {
server_context ctx_server;
json messages = json::array();
std::vector<raw_buffer> input_files;
task_params defaults;
// thread for showing "loading" animation
std::atomic<bool> loading_show;
cli_context(const common_params & params) {
defaults.sampling = params.sampling;
defaults.speculative = params.speculative;
defaults.n_keep = params.n_keep;
defaults.n_predict = params.n_predict;
defaults.antiprompt = params.antiprompt;
defaults.stream = true; // make sure we always use streaming mode
defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
// defaults.return_progress = true; // TODO: show progress
defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
}
std::string generate_completion(result_timings & out_timings) {
server_response_reader rd = ctx_server.get_response_reader();
{
// TODO: reduce some copies here in the future
server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
task.id = rd.get_new_id();
task.index = 0;
task.params = defaults; // copy
task.cli_input = messages; // copy
task.cli_files = input_files; // copy
rd.post_task({std::move(task)});
}
// wait for first result
console::spinner::start();
server_task_result_ptr result = rd.next(should_stop);
console::spinner::stop();
std::string curr_content;
bool is_thinking = false;
while (result) {
if (should_stop()) {
break;
}
if (result->is_error()) {
json err_data = result->to_json();
if (err_data.contains("message")) {
console::error("Error: %s\n", err_data["message"].get<std::string>().c_str());
} else {
console::error("Error: %s\n", err_data.dump().c_str());
}
return curr_content;
}
auto res_partial = dynamic_cast<server_task_result_cmpl_partial *>(result.get());
if (res_partial) {
out_timings = std::move(res_partial->timings);
for (const auto & diff : res_partial->oaicompat_msg_diffs) {
if (!diff.content_delta.empty()) {
if (is_thinking) {
console::log("\n[End thinking]\n\n");
console::set_display(DISPLAY_TYPE_RESET);
is_thinking = false;
}
curr_content += diff.content_delta;
console::log("%s", diff.content_delta.c_str());
console::flush();
}
if (!diff.reasoning_content_delta.empty()) {
console::set_display(DISPLAY_TYPE_REASONING);
if (!is_thinking) {
console::log("[Start thinking]\n");
}
is_thinking = true;
console::log("%s", diff.reasoning_content_delta.c_str());
console::flush();
}
}
}
auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get());
if (res_final) {
out_timings = std::move(res_final->timings);
break;
}
result = rd.next(should_stop);
}
g_is_interrupted.store(false);
// server_response_reader automatically cancels pending tasks upon destruction
return curr_content;
}
// TODO: support remote files in the future (http, https, etc)
std::string load_input_file(const std::string & fname, bool is_media) {
std::ifstream file(fname, std::ios::binary);
if (!file) {
return "";
}
if (is_media) {
raw_buffer buf;
buf.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
input_files.push_back(std::move(buf));
return mtmd_default_marker();
} else {
std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
return content;
}
}
};
int main(int argc, char ** argv) {
common_params params;
params.verbosity = LOG_LEVEL_ERROR; // by default, less verbose logs
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CLI)) {
return 1;
}
// TODO: maybe support it later?
if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) {
console::error("--no-conversation is not supported by llama-cli\n");
console::error("please use llama-completion instead\n");
}
common_init();
// struct that contains llama context and inference
cli_context ctx_cli(params);
llama_backend_init();
llama_numa_init(params.numa);
// TODO: avoid using atexit() here by making `console` a singleton
console::init(params.simple_io, params.use_color);
atexit([]() { console::cleanup(); });
console::set_display(DISPLAY_TYPE_RESET);
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
console::log("\nLoading model... "); // followed by loading animation
console::spinner::start();
if (!ctx_cli.ctx_server.load_model(params)) {
console::spinner::stop();
console::error("\nFailed to load the model\n");
return 1;
}
ctx_cli.ctx_server.init();
console::spinner::stop();
console::log("\n");
std::thread inference_thread([&ctx_cli]() {
ctx_cli.ctx_server.start_loop();
});
auto inf = ctx_cli.ctx_server.get_info();
std::string modalities = "text";
if (inf.has_inp_image) {
modalities += ", vision";
}
if (inf.has_inp_audio) {
modalities += ", audio";
}
if (!params.system_prompt.empty()) {
ctx_cli.messages.push_back({
{"role", "system"},
{"content", params.system_prompt}
});
}
console::log("\n");
console::log("%s\n", LLAMA_ASCII_LOGO);
console::log("build : %s\n", inf.build_info.c_str());
console::log("model : %s\n", inf.model_name.c_str());
console::log("modalities : %s\n", modalities.c_str());
if (!params.system_prompt.empty()) {
console::log("using custom system prompt\n");
}
console::log("\n");
console::log("available commands:\n");
console::log(" /exit or Ctrl+C stop or exit\n");
console::log(" /regen regenerate the last response\n");
console::log(" /clear clear the chat history\n");
console::log(" /read add a text file\n");
if (inf.has_inp_image) {
console::log(" /image <file> add an image file\n");
}
if (inf.has_inp_audio) {
console::log(" /audio <file> add an audio file\n");
}
console::log("\n");
// interactive loop
std::string cur_msg;
while (true) {
std::string buffer;
console::set_display(DISPLAY_TYPE_USER_INPUT);
if (params.prompt.empty()) {
console::log("\n> ");
std::string line;
bool another_line = true;
do {
another_line = console::readline(line, params.multiline_input);
buffer += line;
} while (another_line);
} else {
// process input prompt from args
for (auto & fname : params.image) {
std::string marker = ctx_cli.load_input_file(fname, true);
if (marker.empty()) {
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
break;
}
console::log("Loaded media from '%s'\n", fname.c_str());
cur_msg += marker;
}
buffer = params.prompt;
if (buffer.size() > 500) {
console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str());
} else {
console::log("\n> %s\n", buffer.c_str());
}
params.prompt.clear(); // only use it once
}
console::set_display(DISPLAY_TYPE_RESET);
console::log("\n");
if (should_stop()) {
g_is_interrupted.store(false);
break;
}
// remove trailing newline
if (!buffer.empty() &&buffer.back() == '\n') {
buffer.pop_back();
}
// skip empty messages
if (buffer.empty()) {
continue;
}
bool add_user_msg = true;
// process commands
if (string_starts_with(buffer, "/exit")) {
break;
} else if (string_starts_with(buffer, "/regen")) {
if (ctx_cli.messages.size() >= 2) {
size_t last_idx = ctx_cli.messages.size() - 1;
ctx_cli.messages.erase(last_idx);
add_user_msg = false;
} else {
console::error("No message to regenerate.\n");
continue;
}
} else if (string_starts_with(buffer, "/clear")) {
ctx_cli.messages.clear();
ctx_cli.input_files.clear();
console::log("Chat history cleared.\n");
continue;
} else if (
(string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
(string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) {
// just in case (bad copy-paste for example), we strip all trailing/leading spaces
std::string fname = string_strip(buffer.substr(7));
std::string marker = ctx_cli.load_input_file(fname, true);
if (marker.empty()) {
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
continue;
}
cur_msg += marker;
console::log("Loaded media from '%s'\n", fname.c_str());
continue;
} else if (string_starts_with(buffer, "/read ")) {
std::string fname = string_strip(buffer.substr(6));
std::string marker = ctx_cli.load_input_file(fname, false);
if (marker.empty()) {
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
continue;
}
cur_msg += marker;
console::log("Loaded text from '%s'\n", fname.c_str());
continue;
} else {
// not a command
cur_msg += buffer;
}
// generate response
if (add_user_msg) {
ctx_cli.messages.push_back({
{"role", "user"},
{"content", cur_msg}
});
cur_msg.clear();
}
result_timings timings;
std::string assistant_content = ctx_cli.generate_completion(timings);
ctx_cli.messages.push_back({
{"role", "assistant"},
{"content", assistant_content}
});
console::log("\n");
if (params.show_timings) {
console::set_display(DISPLAY_TYPE_INFO);
console::log("\n");
console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second);
console::set_display(DISPLAY_TYPE_RESET);
}
if (params.single_turn) {
break;
}
}
console::set_display(DISPLAY_TYPE_RESET);
console::log("\nExiting...\n");
ctx_cli.ctx_server.terminate();
inference_thread.join();
// bump the log level to display timings
common_log_set_verbosity_thold(LOG_LEVEL_INFO);
llama_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());
return 0;
}

View File

@ -1,5 +1,5 @@
set(TARGET llama-cli)
add_executable(${TARGET} main.cpp)
set(TARGET llama-completion)
add_executable(${TARGET} completion.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -86,7 +86,11 @@ static void sigint_handler(int signo) {
int main(int argc, char ** argv) {
common_params params;
g_params = &params;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
// disable jinja by default
params.use_jinja = false;
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMPLETION, print_usage)) {
return 1;
}
@ -521,12 +525,6 @@ int main(int argc, char ** argv) {
is_interacting = params.interactive_first;
}
LOG_WRN("*****************************\n");
LOG_WRN("IMPORTANT: The current llama-cli will be moved to llama-completion in the near future\n");
LOG_WRN(" New llama-cli will have enhanced features and improved user experience\n");
LOG_WRN(" More info: https://github.com/ggml-org/llama.cpp/discussions/17618\n");
LOG_WRN("*****************************\n");
bool is_antiprompt = false;
bool input_echo = true;
bool display = true;
@ -543,7 +541,7 @@ int main(int argc, char ** argv) {
std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
// the first thing we will do is to output the prompt, so set color accordingly
console::set_display(console::prompt);
console::set_display(DISPLAY_TYPE_PROMPT);
display = params.display_prompt;
std::vector<llama_token> embd;
@ -588,9 +586,9 @@ int main(int argc, char ** argv) {
const int skipped_tokens = (int) embd.size() - max_embd_size;
embd.resize(max_embd_size);
console::set_display(console::error);
console::set_display(DISPLAY_TYPE_ERROR);
LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
console::set_display(console::reset);
console::set_display(DISPLAY_TYPE_RESET);
}
if (ga_n == 1) {
@ -772,7 +770,7 @@ int main(int argc, char ** argv) {
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
console::set_display(DISPLAY_TYPE_RESET);
display = true;
}
@ -868,7 +866,7 @@ int main(int argc, char ** argv) {
}
// color user input only
console::set_display(console::user_input);
console::set_display(DISPLAY_TYPE_USER_INPUT);
display = params.display_prompt;
std::string line;
@ -879,7 +877,7 @@ int main(int argc, char ** argv) {
} while (another_line);
// done taking input, reset color
console::set_display(console::reset);
console::set_display(DISPLAY_TYPE_RESET);
display = true;
if (buffer.empty()) { // Ctrl+D on empty line exits

View File

@ -19,7 +19,7 @@ fi
set -x
SPLIT=$1/llama-gguf-split
MAIN=$1/llama-cli
MAIN=$1/llama-completion
WORK_PATH=$TMP_DIR/gguf-split
ROOT_DIR=$(realpath $(dirname $0)/../../)

View File

@ -595,11 +595,12 @@ struct clip_graph {
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
cur = ggml_add(ctx0, cur, model.mm_1_b);
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
cur = ggml_add(ctx0, cur, model.mm_2_b);
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU,
-1);
} else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
cur = build_ffn(cur,
@ -667,16 +668,12 @@ struct clip_graph {
// LlavaMultiModalProjector (always using GELU activation)
{
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
if (model.mm_1_b) {
cur = ggml_add(ctx0, cur, model.mm_1_b);
}
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
if (model.mm_2_b) {
cur = ggml_add(ctx0, cur, model.mm_2_b);
}
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU,
-1);
}
// arrangement of the [IMG_BREAK] token
@ -775,10 +772,6 @@ struct clip_graph {
// if flash attn is used, we need to pad the mask and cast to f16
if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
int n_pad = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD) - window_mask->ne[1];
if (n_pad > 0) {
window_mask = ggml_pad(ctx0, window_mask, 0, n_pad, 0, 0);
}
window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
}
@ -791,7 +784,7 @@ struct clip_graph {
// loop over layers
for (int il = 0; il < n_layer; il++) {
auto & layer = model.layers[il];
const auto & layer = model.layers[il];
const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
@ -870,16 +863,12 @@ struct clip_graph {
// multimodal projection
ggml_tensor * embeddings = inpL;
embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// GELU activation
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
embeddings = build_ffn(embeddings,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_1_w, model.mm_1_b,
FFN_GELU,
-1);
if (use_window_attn) {
window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
@ -1257,11 +1246,12 @@ struct clip_graph {
// projector LayerNorm uses pytorch's default eps = 1e-5
// ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
cur = ggml_add(ctx0, cur, model.mm_1_b);
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
cur = ggml_add(ctx0, cur, model.mm_3_b);
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_3_w, model.mm_3_b,
FFN_GELU,
-1);
}
// build the graph
@ -1412,11 +1402,12 @@ struct clip_graph {
cb(cur, "proj_inp_normed", -1);
// projection mlp
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
cur = ggml_add(ctx0, cur, model.mm_1_b);
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
cur = ggml_add(ctx0, cur, model.mm_2_b);
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU,
-1);
cb(cur, "proj_out", -1);
}
@ -1887,9 +1878,12 @@ struct clip_graph {
} else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) {
// projector
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
cur = ggml_gelu_erf(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU_ERF,
-1);
} else {
GGML_ABORT("%s: unknown projector type", __func__);
@ -2074,34 +2068,66 @@ private:
// self-attention
{
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
if (layer.q_b) {
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
}
ggml_tensor * Qcur = nullptr;
ggml_tensor * Kcur = nullptr;
ggml_tensor * Vcur = nullptr;
if (layer.qkv_w != nullptr) {
// fused qkv
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
if (layer.qkv_b != nullptr) {
cur = ggml_add(ctx0, cur, layer.qkv_b);
}
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
if (layer.k_b) {
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
}
Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ 0);
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
if (layer.v_b) {
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
}
Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, n_embd));
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm", il);
}
Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
/* nb1 */ ggml_row_size(cur->type, d_head),
/* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, 2 * n_embd));
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm", il);
}
// TODO: q/k norm requires row size == n_embd, while here it's d_head
// we can add support in the future if needed
GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
} else {
// separate q, k, v
Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
if (layer.q_b) {
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
}
Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
if (layer.k_b) {
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
}
Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
if (layer.v_b) {
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
}
if (layer.q_norm) {
Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm", il);
}
if (layer.k_norm) {
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm", il);
}
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);

View File

@ -270,6 +270,7 @@ int main(int argc, char ** argv) {
ggml_time_init();
common_params params;
params.use_jinja = false; // disable jinja by default
params.sampling.temp = 0.2; // lower temp by default for better quality
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
@ -310,11 +311,16 @@ int main(int argc, char ** argv) {
if (g_is_interrupted) return 130;
LOG_WRN("WARN: This is an experimental CLI for testing multimodal capability.\n");
LOG_WRN(" For normal use cases, please use the standard llama-cli\n");
if (is_single_turn) {
g_is_generating = true;
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
for (size_t i = 0; i < params.image.size(); i++) {
params.prompt += mtmd_default_marker();
// most models require the marker before each image
// ref: https://github.com/ggml-org/llama.cpp/pull/17616
params.prompt = mtmd_default_marker() + params.prompt;
}
}
common_chat_msg msg;
@ -349,11 +355,11 @@ int main(int argc, char ** argv) {
while (!g_is_interrupted) {
g_is_generating = false;
LOG("\n> ");
console::set_display(console::user_input);
console::set_display(DISPLAY_TYPE_USER_INPUT);
std::string line;
console::readline(line, false);
if (g_is_interrupted) break;
console::set_display(console::reset);
console::set_display(DISPLAY_TYPE_RESET);
line = string_strip(line);
if (line.empty()) {
continue;

View File

@ -32,23 +32,32 @@ fi
arr_prefix=()
arr_hf=()
arr_tmpl=() # chat template
arr_extra_args=()
arr_file=()
add_test_vision() {
local hf=$1
local tmpl=${2:-""} # default to empty string if not provided
shift
local extra_args=""
if [ $# -gt 0 ]; then
extra_args=$(printf " %q" "$@")
fi
arr_prefix+=("[vision]")
arr_hf+=("$hf")
arr_tmpl+=("$tmpl")
arr_extra_args+=("$extra_args")
arr_file+=("test-1.jpeg")
}
add_test_audio() {
local hf=$1
shift
local extra_args=""
if [ $# -gt 0 ]; then
extra_args=$(printf " %q" "$@")
fi
arr_prefix+=("[audio] ")
arr_hf+=("$hf")
arr_tmpl+=("") # no need for chat tmpl
arr_extra_args+=("$extra_args")
arr_file+=("test-2.mp3")
}
@ -56,9 +65,9 @@ add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -p "name of the newspaper?<__media__>"
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" --chat-template vicuna
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" --chat-template vicuna
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
@ -79,7 +88,7 @@ add_test_audio "ggml-org/Voxtral-Mini-3B-2507-GGUF:Q4_K_M"
# to test the big models, run: ./tests.sh big
if [ "$RUN_BIG_TESTS" = true ]; then
add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" --chat-template mistral-v7
add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
@ -89,7 +98,7 @@ if [ "$RUN_BIG_TESTS" = true ]; then
add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
# add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M"
# add_test_vision "ggml-org/Kimi-VL-A3B-Thinking-2506-GGUF:Q4_K_M" # not always working
add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
@ -122,21 +131,25 @@ for i in "${!arr_hf[@]}"; do
bin="llama-mtmd-cli"
prefix="${arr_prefix[$i]}"
hf="${arr_hf[$i]}"
tmpl="${arr_tmpl[$i]}"
extra_args="${arr_extra_args[$i]}"
inp_file="${arr_file[$i]}"
echo "Running test with binary: $bin and HF model: $hf"
echo ""
echo ""
output=$(\
"$PROJ_ROOT/build/bin/$bin" \
-hf "$hf" \
--image $SCRIPT_DIR/$inp_file \
-p "what is the publisher name of the newspaper?" \
cmd="$(printf %q "$PROJ_ROOT/build/bin/$bin") \
-hf $(printf %q "$hf") \
--image $(printf %q "$SCRIPT_DIR/$inp_file") \
--temp 0 -n 128 \
${tmpl:+--chat-template "$tmpl"} \
2>&1 | tee /dev/tty)
${extra_args}"
# if extra_args does not contain -p, we add a default prompt
if ! [[ "$extra_args" =~ "-p" ]]; then
cmd+=" -p \"what is the publisher name of the newspaper?\""
fi
output=$(eval "$cmd" 2>&1 | tee /dev/tty)
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
@ -144,9 +157,9 @@ for i in "${!arr_hf[@]}"; do
if echo "$output" | grep -iq "new york" \
|| (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
then
result="$prefix \033[32mOK\033[0m: $bin $hf"
result="$prefix \033[32mOK\033[0m: $hf"
else
result="$prefix \033[31mFAIL\033[0m: $bin $hf"
result="$prefix \033[31mFAIL\033[0m: $hf"
fi
echo -e "$result"
arr_res+=("$result")

View File

@ -20,7 +20,7 @@ set -x
SPLIT=$1/llama-gguf-split
QUANTIZE=$1/llama-quantize
MAIN=$1/llama-cli
MAIN=$1/llama-completion
WORK_PATH=$TMP_DIR/quantize
ROOT_DIR=$(realpath $(dirname $0)/../../)

View File

@ -1,7 +1,33 @@
set(TARGET llama-server)
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
# server-context containing the core server logic, used by llama-server and CLI
set(TARGET server-context)
add_library(${TARGET} STATIC
server-task.cpp
server-task.h
server-queue.cpp
server-queue.h
server-common.cpp
server-common.h
server-context.cpp
server-context.h
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PRIVATE ../mtmd)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PUBLIC common mtmd ${CMAKE_THREAD_LIBS_INIT})
# llama-server executable
set(TARGET llama-server)
if (NOT LLAMA_HTTPLIB)
message(FATAL_ERROR "LLAMA_HTTPLIB is OFF, cannot build llama-server. Hint: to skip building server, set -DLLAMA_BUILD_SERVER=OFF")
endif()
@ -43,7 +69,7 @@ install(TARGETS ${TARGET} RUNTIME)
target_include_directories(${TARGET} PRIVATE ../mtmd)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(${TARGET} PRIVATE common mtmd cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE server-context PUBLIC common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)

View File

@ -1369,6 +1369,11 @@ llama-server
### Model sources
There are 3 possible sources for model files:
1. Cached models (controlled by the `LLAMA_CACHE` environment variable)
2. Custom model directory (set via the `--models-dir` argument)
3. Custom preset (set via the `--models-preset` argument)
By default, the router looks for models in the cache. You can add Hugging Face models to the cache with:
```sh
@ -1413,6 +1418,51 @@ llama-server -ctx 8192 -n 1024 -np 2
Note: model instances inherit both command line arguments and environment variables from the router server.
Alternatively, you can also add GGUF based preset (see next section)
### Model presets
Model presets allow advanced users to define custom configurations using an `.ini` file:
```sh
llama-server --models-preset ./my-models.ini
```
Each section in the file defines a new preset. Keys within a section correspond to command-line arguments (without leading dashes). For example, the argument `--n-gpu-layer 123` is written as `n-gpu-layer = 123`.
Short argument forms (e.g., `c`, `ngl`) and environment variable names (e.g., `LLAMA_ARG_N_GPU_LAYERS`) are also supported as keys.
Example:
```ini
version = 1
; If the key corresponds to an existing model on the server,
; this will be used as the default config for that model
[ggml-org/MY-MODEL-GGUF:Q8_0]
; string value
chat-template = chatml
; numeric value
n-gpu-layer = 123
; flag value (for certain flags, you need to use the "no-" prefix for negation)
jinja = true
; shorthand argument (for example, context size)
c = 4096
; environment variable name
LLAMA_ARG_CACHE_RAM = 0
; file paths are relative to server's CWD
model-draft = ./my-models/draft.gguf
; but it's RECOMMENDED to use absolute path
model-draft = /Users/abc/my-models/draft.gguf
; If the key does NOT correspond to an existing model,
; you need to specify at least the model path
[custom_model]
model = /Users/abc/my-awesome-model-Q4_K_M.gguf
```
Note: some arguments are controlled by router (e.g., host, port, API key, HF repo, model alias). They will be removed or overwritten upload loading.
### Routing requests
Requests are routed according to the requested model name.

View File

@ -1474,6 +1474,44 @@ struct server_context_impl {
// Functions to process the task
//
// tokenize the input if it's set by CLI, return false on error
bool tokenize_cli_input(server_task & task) {
if (task.cli_input == nullptr) {
return true; // nothing to do
}
try {
auto & opt = oai_parser_opt;
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input);
inputs.tools = {}; // TODO
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
inputs.json_schema = ""; // TODO
inputs.grammar = ""; // TODO
inputs.use_jinja = opt.use_jinja;
inputs.parallel_tool_calls = false;
inputs.add_generation_prompt = true;
inputs.reasoning_format = opt.reasoning_format;
inputs.enable_thinking = opt.enable_thinking;
// Apply chat template to the list of messages
auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
// tokenize the resulting prompt
auto & prompt = chat_params.prompt;
if (mctx != nullptr) {
task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
} else {
task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
}
task.cli_input.clear();
task.cli_files.clear();
} catch (const std::exception & e) {
send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
return false;
}
return true;
}
void process_single_task(server_task && task) {
switch (task.type) {
case SERVER_TASK_TYPE_COMPLETION:
@ -1481,6 +1519,10 @@ struct server_context_impl {
case SERVER_TASK_TYPE_EMBEDDING:
case SERVER_TASK_TYPE_RERANK:
{
if (!tokenize_cli_input(task)) {
break;
}
const int id_slot = task.id_slot;
server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
@ -1690,7 +1732,6 @@ struct server_context_impl {
res->id = task.id;
queue_results.send(std::move(res));
} break;
}
}
@ -2626,6 +2667,15 @@ server_response_reader server_context::get_response_reader() {
return impl->get_response_reader();
}
server_context_info server_context::get_info() const {
return server_context_info {
/* build_info */ build_info,
/* model_name */ impl->model_name,
/* has_inp_image */ impl->oai_parser_opt.allow_image,
/* has_inp_audio */ impl->oai_parser_opt.allow_audio,
};
}
// generator-like API for HTTP response generation

View File

@ -9,6 +9,13 @@
struct server_context_impl; // private implementation
struct server_context_info {
std::string build_info;
std::string model_name;
bool has_inp_image;
bool has_inp_audio;
};
struct server_context {
std::unique_ptr<server_context_impl> impl;
@ -33,6 +40,10 @@ struct server_context {
// get a new response reader, used by CLI application
server_response_reader get_response_reader();
// get server info
// used by CLI application
server_context_info get_info() const;
};

View File

@ -1,6 +1,7 @@
#include "server-common.h"
#include "server-models.h"
#include "preset.h"
#include "download.h"
#include <cpp-httplib/httplib.h> // TODO: remove this once we use HTTP client from download.h
@ -33,6 +34,10 @@
#define CMD_EXIT "exit"
// address for child process, this is needed because router may run on 0.0.0.0
// ref: https://github.com/ggml-org/llama.cpp/issues/17862
#define CHILD_ADDR "127.0.0.1"
static std::filesystem::path get_server_exec_path() {
#if defined(_WIN32)
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
@ -132,6 +137,93 @@ static std::vector<local_model> list_local_models(const std::string & dir) {
return models;
}
//
// server_presets
//
server_presets::server_presets(int argc, char ** argv, common_params & base_params, const std::string & presets_path)
: ctx_params(common_params_parser_init(base_params, LLAMA_EXAMPLE_SERVER)) {
if (!presets_path.empty()) {
presets = common_presets_load(presets_path, ctx_params);
SRV_INF("Loaded %zu presets from %s\n", presets.size(), presets_path.c_str());
}
// populate reserved args (will be appended by the router)
for (auto & opt : ctx_params.options) {
if (opt.env == nullptr) {
continue;
}
std::string env = opt.env;
if (env == "LLAMA_ARG_PORT" ||
env == "LLAMA_ARG_HOST" ||
env == "LLAMA_ARG_ALIAS" ||
env == "LLAMA_ARG_API_KEY" ||
env == "LLAMA_ARG_MODELS_DIR" ||
env == "LLAMA_ARG_MODELS_MAX" ||
env == "LLAMA_ARG_MODELS_PRESET" ||
env == "LLAMA_ARG_MODEL" ||
env == "LLAMA_ARG_MMPROJ" ||
env == "LLAMA_ARG_HF_REPO" ||
env == "LLAMA_ARG_NO_MODELS_AUTOLOAD") {
control_args[env] = opt;
}
}
// read base args from router's argv
common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
// remove any router-controlled args from base_args
for (const auto & cargs : control_args) {
auto it = base_args.find(cargs.second);
if (it != base_args.end()) {
base_args.erase(it);
}
}
}
common_preset server_presets::get_preset(const std::string & name) {
auto it = presets.find(name);
if (it != presets.end()) {
return it->second;
}
return common_preset();
}
void server_presets::render_args(server_model_meta & meta) {
common_preset preset = meta.preset; // copy
// merging 3 kinds of args:
// 1. model-specific args (from preset)
// force removing control args if any
for (auto & cargs : control_args) {
if (preset.options.find(cargs.second) != preset.options.end()) {
SRV_WRN("Preset '%s' contains reserved arg '%s', removing it\n", preset.name.c_str(), cargs.second.args[0]);
preset.options.erase(cargs.second);
}
}
// 2. base args (from router)
// inherit from base args
for (const auto & [arg, value] : base_args) {
preset.options[arg] = value;
}
// 3. control args (from router)
// set control values
preset.options[control_args["LLAMA_ARG_HOST"]] = CHILD_ADDR;
preset.options[control_args["LLAMA_ARG_PORT"]] = std::to_string(meta.port);
preset.options[control_args["LLAMA_ARG_ALIAS"]] = meta.name;
if (meta.in_cache) {
preset.options[control_args["LLAMA_ARG_HF_REPO"]] = meta.name;
} else {
preset.options[control_args["LLAMA_ARG_MODEL"]] = meta.path;
if (!meta.path_mmproj.empty()) {
preset.options[control_args["LLAMA_ARG_MMPROJ"]] = meta.path_mmproj;
}
}
meta.args = preset.to_args();
// add back the binary path at the front
meta.args.insert(meta.args.begin(), get_server_exec_path().string());
}
//
// server_models
//
@ -140,7 +232,7 @@ server_models::server_models(
const common_params & params,
int argc,
char ** argv,
char ** envp) : base_params(params) {
char ** envp) : base_params(params), presets(argc, argv, base_params, params.models_preset) {
for (int i = 0; i < argc; i++) {
base_args.push_back(std::string(argv[i]));
}
@ -155,11 +247,58 @@ server_models::server_models(
LOG_WRN("failed to get server executable path: %s\n", e.what());
LOG_WRN("using original argv[0] as fallback: %s\n", base_args[0].c_str());
}
// TODO: allow refreshing cached model list
// add cached models
load_models();
}
void server_models::add_model(server_model_meta && meta) {
if (mapping.find(meta.name) != mapping.end()) {
throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
}
presets.render_args(meta); // populate meta.args
std::string name = meta.name;
mapping[name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
/* th */ std::thread(),
/* meta */ std::move(meta)
};
}
static std::vector<local_model> list_custom_path_models(server_presets & presets) {
// detect any custom-path models in presets
std::vector<local_model> custom_models;
for (auto & [model_name, preset] : presets.presets) {
local_model model;
model.name = model_name;
std::vector<common_arg> to_erase;
for (auto & [arg, value] : preset.options) {
std::string env(arg.env ? arg.env : "");
if (env == "LLAMA_ARG_MODEL") {
model.path = value;
to_erase.push_back(arg);
}
if (env == "LLAMA_ARG_MMPROJ") {
model.path_mmproj = value;
to_erase.push_back(arg);
}
}
for (auto & arg : to_erase) {
preset.options.erase(arg);
}
if (!model.name.empty() && !model.path.empty()) {
custom_models.push_back(model);
}
}
return custom_models;
}
// TODO: allow refreshing cached model list
void server_models::load_models() {
// loading models from 3 sources:
// 1. cached models
auto cached_models = common_list_cached_models();
for (const auto & model : cached_models) {
server_model_meta meta{
/* preset */ presets.get_preset(model.to_string()),
/* name */ model.to_string(),
/* path */ model.manifest_path,
/* path_mmproj */ "", // auto-detected when loading
@ -170,21 +309,18 @@ server_models::server_models(
/* args */ std::vector<std::string>(),
/* exit_code */ 0
};
mapping[meta.name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
/* th */ std::thread(),
/* meta */ meta
};
add_model(std::move(meta));
}
// add local models specificed via --models-dir
if (!params.models_dir.empty()) {
auto local_models = list_local_models(params.models_dir);
// 2. local models specificed via --models-dir
if (!base_params.models_dir.empty()) {
auto local_models = list_local_models(base_params.models_dir);
for (const auto & model : local_models) {
if (mapping.find(model.name) != mapping.end()) {
// already exists in cached models, skip
continue;
}
server_model_meta meta{
/* preset */ presets.get_preset(model.name),
/* name */ model.name,
/* path */ model.path,
/* path_mmproj */ model.path_mmproj,
@ -195,13 +331,31 @@ server_models::server_models(
/* args */ std::vector<std::string>(),
/* exit_code */ 0
};
mapping[meta.name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
/* th */ std::thread(),
/* meta */ meta
};
add_model(std::move(meta));
}
}
// 3. custom-path models specified in presets
auto custom_models = list_custom_path_models(presets);
for (const auto & model : custom_models) {
server_model_meta meta{
/* preset */ presets.get_preset(model.name),
/* name */ model.name,
/* path */ model.path,
/* path_mmproj */ model.path_mmproj,
/* in_cache */ false,
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0
};
add_model(std::move(meta));
}
// log available models
SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
for (const auto & [name, inst] : mapping) {
SRV_INF(" %c %s\n", inst.meta.preset.name.empty() ? ' ' : '*', name.c_str());
}
}
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
@ -335,19 +489,7 @@ void server_models::unload_lru() {
}
}
static void add_or_replace_arg(std::vector<std::string> & args, const std::string & key, const std::string & value) {
for (size_t i = 0; i < args.size(); i++) {
if (args[i] == key && i + 1 < args.size()) {
args[i + 1] = value;
return;
}
}
// not found, append
args.push_back(key);
args.push_back(value);
}
void server_models::load(const std::string & name, bool auto_load) {
void server_models::load(const std::string & name) {
if (!has_model(name)) {
throw std::runtime_error("model name=" + name + " is not found");
}
@ -376,26 +518,10 @@ void server_models::load(const std::string & name, bool auto_load) {
{
SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);
std::vector<std::string> child_args;
if (auto_load && !meta.args.empty()) {
child_args = meta.args; // copy previous args
} else {
child_args = base_args; // copy
if (inst.meta.in_cache) {
add_or_replace_arg(child_args, "-hf", inst.meta.name);
} else {
add_or_replace_arg(child_args, "-m", inst.meta.path);
if (!inst.meta.path_mmproj.empty()) {
add_or_replace_arg(child_args, "--mmproj", inst.meta.path_mmproj);
}
}
}
presets.render_args(inst.meta); // update meta.args
// set model args
add_or_replace_arg(child_args, "--port", std::to_string(inst.meta.port));
add_or_replace_arg(child_args, "--alias", inst.meta.name);
std::vector<std::string> child_env = base_env; // copy
std::vector<std::string> child_args = inst.meta.args; // copy
std::vector<std::string> child_env = base_env; // copy
child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port));
SRV_INF("%s", "spawning server instance with args:\n");
@ -541,7 +667,7 @@ bool server_models::ensure_model_loaded(const std::string & name) {
}
if (meta->status == SERVER_MODEL_STATUS_UNLOADED) {
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
load(name, true);
load(name);
}
SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
@ -571,7 +697,7 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
auto proxy = std::make_unique<server_http_proxy>(
method,
base_params.hostname,
CHILD_ADDR,
meta->port,
req.path,
req.headers,
@ -724,38 +850,6 @@ void server_models_routes::init_routes() {
return models.proxy_request(req, method, name, true); // update last usage for POST request only
};
this->get_router_models = [this](const server_http_req &) {
auto res = std::make_unique<server_http_res>();
json models_json = json::array();
auto all_models = models.get_all_meta();
std::time_t t = std::time(0);
for (const auto & meta : all_models) {
json status {
{"value", server_model_status_to_string(meta.status)},
{"args", meta.args},
};
if (meta.is_failed()) {
status["exit_code"] = meta.exit_code;
status["failed"] = true;
}
models_json.push_back(json {
{"id", meta.name},
{"object", "model"}, // for OAI-compat
{"owned_by", "llamacpp"}, // for OAI-compat
{"created", t}, // for OAI-compat
{"in_cache", meta.in_cache},
{"path", meta.path},
{"status", status},
// TODO: add other fields, may require reading GGUF metadata
});
}
res_ok(res, {
{"data", models_json},
{"object", "list"},
});
return res;
};
this->post_router_models_load = [this](const server_http_req & req) {
auto res = std::make_unique<server_http_res>();
json body = json::parse(req.body);
@ -769,7 +863,7 @@ void server_models_routes::init_routes() {
res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
return res;
}
models.load(name, false);
models.load(name);
res_ok(res, {{"success", true}});
return res;
};
@ -793,9 +887,12 @@ void server_models_routes::init_routes() {
std::time_t t = std::time(0);
for (const auto & meta : all_models) {
json status {
{"value", server_model_status_to_string(meta.status)},
{"args", meta.args},
{"value", server_model_status_to_string(meta.status)},
{"args", meta.args},
};
if (!meta.preset.name.empty()) {
status["preset"] = meta.preset.to_ini();
}
if (meta.is_failed()) {
status["exit_code"] = meta.exit_code;
status["failed"] = true;

View File

@ -1,6 +1,7 @@
#pragma once
#include "common.h"
#include "preset.h"
#include "server-http.h"
#include <mutex>
@ -47,6 +48,7 @@ static std::string server_model_status_to_string(server_model_status status) {
}
struct server_model_meta {
common_preset preset;
std::string name;
std::string path;
std::string path_mmproj; // only available if in_cache=false
@ -54,7 +56,7 @@ struct server_model_meta {
int port = 0;
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0; // for LRU unloading
std::vector<std::string> args; // additional args passed to the model instance (used for debugging)
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
bool is_active() const {
@ -66,6 +68,19 @@ struct server_model_meta {
}
};
// the server_presets struct holds the presets read from presets.ini
// as well as base args from the router server
struct server_presets {
common_presets presets;
common_params_context ctx_params;
std::map<common_arg, std::string> base_args;
std::map<std::string, common_arg> control_args; // args reserved for server control
server_presets(int argc, char ** argv, common_params & base_params, const std::string & models_dir);
common_preset get_preset(const std::string & name);
void render_args(server_model_meta & meta);
};
struct subprocess_s;
struct server_models {
@ -85,14 +100,21 @@ private:
std::vector<std::string> base_args;
std::vector<std::string> base_env;
server_presets presets;
void update_meta(const std::string & name, const server_model_meta & meta);
// unload least recently used models if the limit is reached
void unload_lru();
// not thread-safe, caller must hold mutex
void add_model(server_model_meta && meta);
public:
server_models(const common_params & params, int argc, char ** argv, char ** envp);
void load_models();
// check if a model instance exists
bool has_model(const std::string & name);
@ -102,8 +124,7 @@ public:
// return a copy of all model metadata
std::vector<server_model_meta> get_all_meta();
// if auto_load is true, load the model with previous args if any
void load(const std::string & name, bool auto_load);
void load(const std::string & name);
void unload(const std::string & name);
void unload_all();

View File

@ -135,7 +135,10 @@ struct server_response_reader {
stop();
}
void post_task(server_task && tasks);
int get_new_id() {
return queue_tasks.get_new_id();
}
void post_task(server_task && task);
void post_tasks(std::vector<server_task> && tasks);
bool has_next() const;

View File

@ -120,6 +120,10 @@ struct server_task {
task_params params;
server_tokens tokens;
// only used by CLI, this delegates the tokenization to the server
json cli_input = nullptr;
std::vector<raw_buffer> cli_files;
server_task_type type;
// used by SERVER_TASK_TYPE_SLOT_SAVE, SERVER_TASK_TYPE_SLOT_RESTORE, SERVER_TASK_TYPE_SLOT_ERASE