diff --git a/common/arg.cpp b/common/arg.cpp index c6a2dcbf2d..f61fcaa27b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3,7 +3,9 @@ #include "chat.h" #include "common.h" #include "download.h" +#include "ggml.h" #include "json-schema-to-grammar.h" +#include "llama.h" #include "log.h" #include "sampling.h" #include "speculative.h" @@ -2204,27 +2206,45 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } add_opt(common_arg( {"--mlock"}, - "force system to keep model in RAM rather than swapping or compressing", + "DEPRECATED: force system to keep model in RAM rather than swapping or compressing", [](common_params & params) { - params.use_mlock = true; + LOG_WRN("DEPRECATED: --mlock is deprecated. use --load-mode mlock instead"); + params.load_mode = LLAMA_LOAD_MODE_MLOCK; } ).set_env("LLAMA_ARG_MLOCK")); add_opt(common_arg( {"--mmap"}, {"--no-mmap"}, - string_format("whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"), + "DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)", [](common_params & params, bool value) { - params.use_mmap = value; + LOG_WRN("DEPRECATED: --mmap and --no-mmap are deprecated. use --load-mode mmap instead"); + params.load_mode = value ? LLAMA_LOAD_MODE_MMAP : LLAMA_LOAD_MODE_NONE; } ).set_env("LLAMA_ARG_MMAP")); add_opt(common_arg( {"-dio", "--direct-io"}, {"-ndio", "--no-direct-io"}, - string_format("use DirectIO if available. (default: %s)", params.use_direct_io ? "enabled" : "disabled"), + "DEPRECATED: use DirectIO if available", [](common_params & params, bool value) { - params.use_direct_io = value; + LOG_WRN("DEPRECATED: -dio/--direct-io and -ndio/--no-direct-io are deprecated. use --load-mode dio instead"); + params.load_mode = value ? 
LLAMA_LOAD_MODE_DIRECT_IO : LLAMA_LOAD_MODE_NONE; } ).set_env("LLAMA_ARG_DIO")); + add_opt(common_arg( + {"-lm", "--load-mode"}, "MODE", + "model loading mode (default: mmap)\n" + "- none: no special loading mode\n" + "- mmap: memory-map model (if mmap disabled, slower load but may reduce pageouts if not using mlock)\n" + "- mlock: force system to keep model in RAM rather than swapping or compressing\n" + "- dio: use DirectIO if available\n", + [](common_params & params, const std::string & value) { + /**/ if (value == "none") { params.load_mode = LLAMA_LOAD_MODE_NONE; } + else if (value == "mmap") { params.load_mode = LLAMA_LOAD_MODE_MMAP; } + else if (value == "mlock") { params.load_mode = LLAMA_LOAD_MODE_MLOCK; } + else if (value == "dio") { params.load_mode = LLAMA_LOAD_MODE_DIRECT_IO; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_env("LLAMA_ARG_LOAD_MODE")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" diff --git a/common/common.cpp b/common/common.cpp index 59d75a3b95..f5c6163235 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1320,9 +1320,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; - mparams.use_mmap = params.use_mmap; - mparams.use_direct_io = params.use_direct_io; - mparams.use_mlock = params.use_mlock; + mparams.load_mode = params.load_mode; mparams.check_tensors = params.check_tensors; mparams.use_extra_bufts = !params.no_extra_bufts; mparams.no_host = params.no_host; diff --git a/common/common.h b/common/common.h index 62201ea1ad..ddcb510e10 100644 --- a/common/common.h +++ b/common/common.h @@ -443,6 +443,7 @@ struct common_params { std::vector fit_params_target = std::vector(llama_max_devices(), 1024 * 1024*1024); enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs + enum llama_load_mode load_mode = LLAMA_LOAD_MODE_MMAP; // how to load the model struct cpu_params cpuparams; struct cpu_params cpuparams_batch; @@ -532,9 +533,6 @@ struct common_params { bool kv_unified = false; // enable unified KV cache bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool use_mmap = true; // enable mmap to use filesystem cache - bool use_direct_io = false; // read from disk without buffering - bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool no_kv_offload = false; // disable KV offloading diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index d38bfe7f82..061db8899a 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -555,9 +555,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = params.n_gpu_layers; model_params.devices = params.devices.data(); - model_params.use_mmap = params.use_mmap; - model_params.use_direct_io = params.use_direct_io; - model_params.use_mlock = params.use_mlock; + model_params.load_mode = params.load_mode; model_params.check_tensors = params.check_tensors; llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index 
e20f89488f..4b71f2006e 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -24,10 +24,10 @@ int main(int argc, char ** argv) { return 1; } - if (params.use_mmap) { + if (params.load_mode == LLAMA_LOAD_MODE_MMAP) { LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__); - params.use_mmap = false; + params.load_mode = LLAMA_LOAD_MODE_NONE; } if (params.cache_type_k != GGML_TYPE_F32) { LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__); diff --git a/include/llama.h b/include/llama.h index 6e72db7e3c..e22959cd8e 100644 --- a/include/llama.h +++ b/include/llama.h @@ -196,6 +196,13 @@ extern "C" { LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported }; + enum llama_load_mode { + LLAMA_LOAD_MODE_NONE = 0, // no special loading mode + LLAMA_LOAD_MODE_MMAP = 1, // memory map the model + LLAMA_LOAD_MODE_MLOCK = 2, // force system to keep model in RAM rather than swapping or compressing + LLAMA_LOAD_MODE_DIRECT_IO = 3, // use direct I/O if available + }; + // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) typedef struct llama_token_data { llama_token id; // token id @@ -290,6 +297,7 @@ extern "C" { int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers enum llama_split_mode split_mode; // how to split the model across multiple GPUs + enum llama_load_mode load_mode; // how to load the model into memory // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE int32_t main_gpu; @@ -310,9 +318,9 @@ extern "C" { // Keep the booleans together to avoid misalignment during copy-by-value. 
bool vocab_only; // only load the vocabulary, no weights - bool use_mmap; // use mmap if possible - bool use_direct_io; // use direct io, takes precedence over use_mmap when supported - bool use_mlock; // force system to keep model in RAM + // bool use_mmap; // DEPRECATED: use mmap if possible + // bool use_direct_io; // DEPRECATED: use direct io, takes precedence over use_mmap when supported + // bool use_mlock; // DEPRECATED: force system to keep model in RAM bool check_tensors; // validate model tensor data bool use_extra_bufts; // use extra buffer types (used for weight repacking) bool no_host; // bypass host buffer allowing extra buffers to be used diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 413f34c226..119f826766 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -4,6 +4,7 @@ #include "ggml.h" #include "gguf.h" #include "llama-hparams.h" +#include "llama.h" #include #include @@ -511,8 +512,7 @@ llama_model_loader::llama_model_loader( void * set_tensor_data_ud, const std::string & fname, std::vector & splits, - bool use_mmap, - bool use_direct_io, + llama_load_mode load_mode, bool check_tensors, bool no_alloc, const llama_model_kv_override * param_overrides_p, @@ -551,20 +551,6 @@ llama_model_loader::llama_model_loader( files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io)); contexts.emplace_back(ctx); - if (use_mmap && use_direct_io) { - if (files.back()->has_direct_io()) { - LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__); - use_mmap = false; - } else { - LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__); - use_direct_io = false; - - // reopen file using std::fopen for mmap - files.pop_back(); - files.emplace_back(new llama_file(fname.c_str(), "rb", false)); - } - } - // Save tensors data offset of the main file. // For subsidiary files, `meta` tensor data offset must not be used, // so we build a unified tensors index for weights. 
@@ -778,8 +764,8 @@ llama_model_loader::llama_model_loader( use_mmap = false; } - this->use_mmap = use_mmap; - this->use_direct_io = use_direct_io; + this->use_mmap = load_mode == LLAMA_LOAD_MODE_MMAP; + this->use_direct_io = load_mode == LLAMA_LOAD_MODE_DIRECT_IO; this->check_tensors = check_tensors; this->no_alloc = no_alloc; } diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index ed5de729ca..512ea715b1 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -125,8 +125,7 @@ struct llama_model_loader { void * set_tensor_data_ud, const std::string & fname, std::vector & splits, // optional, only need if the split does not follow naming scheme - bool use_mmap, - bool use_direct_io, + enum llama_load_mode load_mode, bool check_tensors, bool no_alloc, const llama_model_kv_override * param_overrides_p, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f8caad2889..1bbb35b6ac 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14,6 +14,7 @@ #include "ggml-cpp.h" +#include "llama.h" #include "models/models.h" #include @@ -2581,7 +2582,7 @@ void llama_model::load_vocab(llama_model_loader & ml) { bool llama_model::load_tensors(llama_model_loader & ml) { const auto & split_mode = params.split_mode; - const auto & use_mlock = params.use_mlock; + const auto & use_mlock = params.load_mode == LLAMA_LOAD_MODE_MLOCK; const auto & tensor_split = params.tensor_split; const int n_layer = hparams.n_layer; @@ -8698,15 +8699,13 @@ llama_model_params llama_model_default_params() { /*.tensor_buft_overrides =*/ nullptr, /*.n_gpu_layers =*/ -1, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, + /*.load_mode =*/ LLAMA_LOAD_MODE_MMAP, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.vocab_only =*/ false, - /*.use_mmap =*/ true, - /*.use_direct_io =*/ false, - /*.use_mlock =*/ false, /*.check_tensors =*/ false, /*.use_extra_bufts =*/ true, /*.no_host =*/ false, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8e8ce23124..a122baa4da 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -846,9 +846,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // mmap consistently increases speed on Linux, and also increases speed on Windows with // hot cache. It may cause a slowdown on macOS, possibly related to free memory. 
#if defined(__linux__) || defined(_WIN32) - constexpr bool use_mmap = true; + constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_MMAP; #else - constexpr bool use_mmap = false; + constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_NONE; #endif llama_model_kv_override * kv_overrides = nullptr; @@ -859,7 +859,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::vector splits = {}; llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr, - fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); + fname_inp, splits, load_mode, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 872e659edc..21615f98d7 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -74,8 +74,7 @@ static std::vector llama_get_device_memory_data( llama_model_params mparams_copy = *mparams; mparams_copy.no_alloc = true; - mparams_copy.use_mmap = false; - mparams_copy.use_mlock = false; + mparams_copy.load_mode = LLAMA_LOAD_MODE_NONE; llama_model * model = llama_model_load_from_file(path_model, mparams_copy); if (model == nullptr) { @@ -837,7 +836,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io, + llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.load_mode, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); @@ -1035,7 +1034,7 @@ struct llama_model * llama_model_init_from_user( GGML_ASSERT(metadata != nullptr); std::string path_model; std::vector splits = {}; - params.use_mmap = false; + params.load_mode = LLAMA_LOAD_MODE_NONE; params.use_extra_bufts = false; return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params); } diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 67f8ca632c..8931e3b1de 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -98,11 +98,9 @@ int main(void) { argv = {"binary_name", "--draft", "123"}; assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING)); - // negated arg - argv = {"binary_name", "--no-mmap"}; + argv = {"binary_name", "-lm", "hello"}; assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - printf("test-arg-parser: test valid usage\n\n"); argv = {"binary_name", "-m", "model_file.gguf"}; @@ -128,6 +126,22 @@ int main(void) { assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); assert(params.speculative.n_max == 123); + argv = {"binary_name", "-lm", "none"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_NONE); + + argv = {"binary_name", "-lm", "mmap"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_MMAP); + + argv = {"binary_name", "-lm", "mlock"}; + assert(true == common_params_parse(argv.size(), 
list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_MLOCK); + + argv = {"binary_name", "-lm", "dio"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_DIRECT_IO); + // multi-value args (CSV) argv = {"binary_name", "--lora", "file1.gguf,\"file2,2.gguf\",\"file3\"\"3\"\".gguf\",file4\".gguf"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); @@ -154,13 +168,39 @@ int main(void) { assert(params.model.path == "blah.gguf"); assert(params.cpuparams.n_threads == 1010); - printf("test-arg-parser: test negated environment variables\n\n"); + setenv("LLAMA_ARG_LOAD_MODE", "blah", true); + argv = {"binary_name"}; + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - setenv("LLAMA_ARG_MMAP", "0", true); + setenv("LLAMA_ARG_LOAD_MODE", "none", true); + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_NONE); + + setenv("LLAMA_ARG_LOAD_MODE", "mlock", true); + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_MLOCK); + + setenv("LLAMA_ARG_LOAD_MODE", "mmap", true); setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format argv = {"binary_name"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.use_mmap == false); + assert(params.load_mode == LLAMA_LOAD_MODE_MMAP); + assert(params.no_perf == true); + + setenv("LLAMA_ARG_LOAD_MODE", "dio", true); + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_DIRECT_IO); + + printf("test-arg-parser: test negated environment variables\n\n"); + + setenv("LLAMA_ARG_LOAD_MODE", "none", true); + setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_NONE); assert(params.no_perf == true); printf("test-arg-parser: test environment variables being overwritten\n\n"); diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index 9095826fa9..ecc3022711 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -16,7 +16,7 @@ int main(int argc, char *argv[] ) { llama_backend_init(); auto params = llama_model_params{}; - params.use_mmap = false; + params.load_mode = LLAMA_LOAD_MODE_NONE; params.progress_callback = [](float progress, void * ctx){ (void) ctx; return progress > 0.50; diff --git a/tests/test-quantize-stats.cpp b/tests/test-quantize-stats.cpp index de587d456d..f2e736131f 100644 --- a/tests/test-quantize-stats.cpp +++ b/tests/test-quantize-stats.cpp @@ -309,7 +309,9 @@ int main(int argc, char ** argv) { { auto mparams = llama_model_default_params(); - mparams.use_mlock = false; + if (mparams.load_mode == LLAMA_LOAD_MODE_MLOCK) { + mparams.load_mode = LLAMA_LOAD_MODE_MMAP; + } model = llama_model_load_from_file(params.model.c_str(), mparams); diff --git a/tools/cli/README.md 
b/tools/cli/README.md index c344cab2a8..b40028cdf0 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -56,9 +56,10 @@ | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | -| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | +| `--mlock` | DEPRECATED: force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | +| `--mmap, --no-mmap` | DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | DEPRECATED: use DirectIO if available
(env: LLAMA_ARG_DIO) | +| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- none: no special loading mode
- mmap: memory-map model (if mmap disabled, slower load but may reduce pageouts if not using mlock)
- mlock: force system to keep model in RAM rather than swapping or compressing
- dio: use DirectIO if available

(env: LLAMA_ARG_LOAD_MODE) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -134,7 +135,7 @@ | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | @@ -147,7 +148,8 @@ | -------- | ----------- | | `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) | | `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal | -| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) | @@ -172,9 +174,12 @@ | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | -| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_DRAFT_MIN) | diff --git a/tools/completion/README.md b/tools/completion/README.md index b5eeba7334..eea6751d3c 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -139,9 +139,10 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | -| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | +| `--mlock` | DEPRECATED: force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | +| `--mmap, --no-mmap` | DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | DEPRECATED: use DirectIO if available
(env: LLAMA_ARG_DIO) | +| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- none: no special loading mode
- mmap: memory-map model (if mmap disabled, slower load but may reduce pageouts if not using mlock)
- mlock: force system to keep model in RAM rather than swapping or compressing
- dio: use DirectIO if available

(env: LLAMA_ARG_LOAD_MODE) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -217,7 +218,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | @@ -252,9 +253,12 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-gaw, --grp-attn-w N` | group-attention width (default: 512)
(env: LLAMA_ARG_GRP_ATTN_W) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | -| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index c837bb6d26..61eba4b3c0 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -24,44 +24,55 @@ usage: llama-bench [options] options: -h, --help - --numa numa mode (default: disabled) - -r, --repetitions number of times to repeat each test (default: 5) - --prio <0|1|2|3> process/thread priority (default: 0) - --delay <0...N> (seconds) delay between each test (default: 0) - -o, --output output format printed to stdout (default: md) - -oe, --output-err output format printed to stderr (default: none) - --list-devices list available devices and exit - -v, --verbose verbose output - --progress print test progress indicators - -rpc, --rpc register RPC devices (comma separated) + --numa numa mode (default: disabled) + -r, --repetitions number of times to repeat each test (default: 5) + --prio <-1|0|1|2|3> process/thread priority (default: 0) + --delay <0...N> (seconds) delay between each test (default: 0) + -o, --output output format printed to stdout (default: md) + -oe, --output-err output format printed to stderr (default: none) + --list-devices list available devices and exit + -v, --verbose verbose output + --progress print test progress indicators + --no-warmup skip warmup runs before benchmarking test parameters: - -m, --model (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt (default: 512) - -n, --n-gen (default: 128) - -pg (default: ) - -d, --n-depth (default: 0) - -b, --batch-size (default: 2048) - -ub, --ubatch-size (default: 512) - -ctk, --cache-type-k (default: f16) - -ctv, --cache-type-v (default: f16) - -t, --threads (default: system dependent) - -C, --cpu-mask (default: 0x0) - --cpu-strict <0|1> (default: 0) - --poll <0...100> (default: 50) - -ngl, --n-gpu-layers (default: 99) - -ncmoe, --n-cpu-moe (default: 0) - -sm, --split-mode (default: layer) - -mg, --main-gpu (default: 0) - -nkvo, --no-kv-offload <0|1> (default: 0) - -fa, --flash-attn <0|1> (default: 0) - -dev, --device (default: auto) - -mmp, --mmap <0|1> (default: 1) - -embd, --embeddings <0|1> (default: 0) - -ts, --tensor-split (default: 0) - -ot --override-tensors =;... - (default: disabled) - -nopo, --no-op-offload <0|1> (default: 0) + -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -hf, -hfr, --hf-repo /[:quant] Hugging Face model repository; quant is optional, case-insensitive + default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist. + example: unsloth/phi-4-GGUF:Q4_K_M + (default: unused) + -hff, --hf-file Hugging Face model file. 
If specified, it will override the quant in --hf-repo + (default: unused) + -hft, --hf-token Hugging Face access token + (default: value from HF_TOKEN environment variable) + -p, --n-prompt (default: 512) + -n, --n-gen (default: 128) + -pg (default: ) + -d, --n-depth (default: 0) + -b, --batch-size (default: 2048) + -ub, --ubatch-size (default: 512) + -ctk, --cache-type-k (default: f16) + -ctv, --cache-type-v (default: f16) + -t, --threads (default: 8) + -C, --cpu-mask (default: 0x0) + --cpu-strict <0|1> (default: 0) + --poll <0...100> (default: 50) + -ngl, --n-gpu-layers (default: 99) + -ncmoe, --n-cpu-moe (default: 0) + -sm, --split-mode (default: layer) + -mg, --main-gpu (default: 0) + -nkvo, --no-kv-offload <0|1> (default: 0) + -fa, --flash-attn <0|1> (default: 0) + -dev, --device (default: auto) + -mmp, --mmap <0|1> (DEPRECATED) + -dio, --direct-io <0|1> (DEPRECATED) + -lm, --load-mode (default: mmap) + -embd, --embeddings <0|1> (default: 0) + -ts, --tensor-split (default: 0) + -ot --override-tensor =;... + (default: disabled) + -nopo, --no-op-offload <0|1> (default: 0) + --no-host <0|1> (default: 0) Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Ranges can be given as diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 21173576cc..2aa33b6240 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -265,6 +266,21 @@ static const char * split_mode_str(llama_split_mode mode) { } } +static const char * load_mode_str(llama_load_mode mode) { + switch (mode) { + case LLAMA_LOAD_MODE_NONE: + return "none"; + case LLAMA_LOAD_MODE_MLOCK: + return "mlock"; + case LLAMA_LOAD_MODE_MMAP: + return "mmap"; + case LLAMA_LOAD_MODE_DIRECT_IO: + return "dio"; + default: + GGML_ABORT("invalid load mode"); + } +} + static std::string pair_str(const std::pair & p) { static char buf[32]; snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second); @@ -331,14 +347,13 @@ struct cmd_params { std::vector n_gpu_layers; std::vector n_cpu_moe; std::vector split_mode; + std::vector load_mode; std::vector main_gpu; std::vector no_kv_offload; std::vector flash_attn; std::vector> devices; std::vector> tensor_split; std::vector> tensor_buft_overrides; - std::vector use_mmap; - std::vector use_direct_io; std::vector embeddings; std::vector no_op_offload; std::vector no_host; @@ -373,14 +388,13 @@ static const cmd_params cmd_params_defaults = { /* n_gpu_layers */ { 99 }, /* n_cpu_moe */ { 0 }, /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, + /* load_mode */ { LLAMA_LOAD_MODE_MMAP }, /* main_gpu */ { 0 }, /* no_kv_offload */ { false }, /* flash_attn */ { false }, /* devices */ { {} }, /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, /* tensor_buft_overrides*/ { std::vector{ { nullptr, nullptr } } }, - /* use_mmap */ { true }, - /* use_direct_io */ { false }, /* embeddings */ { false }, /* no_op_offload */ { false }, /* no_host */ { false }, @@ -443,8 +457,9 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -dev, --device (default: auto)\n"); - printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); - printf(" -dio, 
--direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str()); + printf(" -mmp, --mmap <0|1> (DEPRECATED)\n"); + printf(" -dio, --direct-io <0|1> (DEPRECATED)\n"); + printf(" -lm, --load-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.load_mode, load_mode_str), ",").c_str()); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -ot --override-tensor =;...\n"); @@ -747,6 +762,34 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); + } else if (arg == "-lm" || arg == "--load-mode") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector modes; + for (const auto & m : p) { + llama_load_mode mode; + if (m == "none") { + mode = LLAMA_LOAD_MODE_NONE; + } else if (m == "mlock") { + mode = LLAMA_LOAD_MODE_MLOCK; + } else if (m == "mmap") { + mode = LLAMA_LOAD_MODE_MMAP; + } else if (m == "dio") { + mode = LLAMA_LOAD_MODE_DIRECT_IO; + } else { + invalid_param = true; + break; + } + modes.push_back(mode); + } + if (invalid_param) { + break; + } + params.load_mode.insert(params.load_mode.end(), modes.begin(), modes.end()); } else if (arg == "-mg" || arg == "--main-gpu") { if (++i >= argc) { invalid_param = true; @@ -788,15 +831,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); - params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); + throw std::invalid_argument("error: -mmp/--mmap option is deprecated; please use -lm/--load-mode mmap instead"); } else if (arg == "-dio" || arg == "--direct-io") { if (++i >= argc) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); - params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end()); + throw std::invalid_argument("error: -dio/--direct-io option is deprecated; please use -lm/--load-mode dio instead"); } else if (arg == "-embd" || arg == "--embeddings") { if (++i >= argc) { invalid_param = true; @@ -1050,6 +1091,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } + if (params.load_mode.empty()) { + params.load_mode = cmd_params_defaults.load_mode; + } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } @@ -1068,12 +1112,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.tensor_buft_overrides.empty()) { params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides; } - if (params.use_mmap.empty()) { - params.use_mmap = cmd_params_defaults.use_mmap; - } - if (params.use_direct_io.empty()) { - params.use_direct_io = cmd_params_defaults.use_direct_io; - } if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } @@ -1115,14 +1153,13 @@ struct cmd_params_instance { int n_gpu_layers; int n_cpu_moe; llama_split_mode split_mode; + llama_load_mode load_mode; int main_gpu; bool no_kv_offload; bool flash_attn; std::vector devices; std::vector tensor_split; std::vector tensor_buft_overrides; - bool use_mmap; - bool use_direct_io; bool embeddings; bool no_op_offload; bool no_host; @@ -1135,10 +1172,9 @@ struct cmd_params_instance { mparams.devices = const_cast(devices.data()); } mparams.split_mode = split_mode; + 
mparams.load_mode = load_mode; mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); - mparams.use_mmap = use_mmap; - mparams.use_direct_io = use_direct_io; mparams.no_host = no_host; if (n_cpu_moe <= 0) { @@ -1184,9 +1220,7 @@ struct cmd_params_instance { return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe && split_mode == other.split_mode && main_gpu == other.main_gpu && tensor_split == other.tensor_split && - use_mmap == other.use_mmap && use_direct_io == other.use_direct_io && - devices == other.devices && - no_host == other.no_host && + load_mode == other.load_mode && devices == other.devices && no_host == other.no_host && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); } @@ -1217,12 +1251,11 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & nl : params.n_gpu_layers) for (const auto & ncmoe : params.n_cpu_moe) for (const auto & sm : params.split_mode) + for (const auto & lm : params.load_mode) for (const auto & mg : params.main_gpu) for (const auto & devs : params.devices) for (const auto & ts : params.tensor_split) for (const auto & ot : params.tensor_buft_overrides) - for (const auto & mmp : params.use_mmap) - for (const auto & dio : params.use_direct_io) for (const auto & noh : params.no_host) for (const auto & embd : params.embeddings) for (const auto & nopo : params.no_op_offload) @@ -1257,14 +1290,15 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_gpu_layers = */ nl, /* .n_cpu_moe = */ ncmoe, /* .split_mode = */ sm, + /* .load_mode = */ lm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, /* .flash_attn = */ fa, /* .devices = */ devs, /* .tensor_split = */ ts, /* .tensor_buft_overrides = */ ot, - /* .use_mmap = */ mmp, - /* .use_direct_io= */ dio, + // /* .use_mmap = */ mmp, + // /* .use_direct_io= */ dio, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, /* .no_host = */ noh, @@ -1292,14 +1326,13 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_gpu_layers = */ nl, /* .n_cpu_moe = */ ncmoe, /* .split_mode = */ sm, + /* .load_mode = */ lm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, /* .flash_attn = */ fa, /* .devices = */ devs, /* .tensor_split = */ ts, /* .tensor_buft_overrides = */ ot, - /* .use_mmap = */ mmp, - /* .use_direct_io= */ dio, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, /* .no_host = */ noh, @@ -1327,14 +1360,13 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_gpu_layers = */ nl, /* .n_cpu_moe = */ ncmoe, /* .split_mode = */ sm, + /* .load_mode = */ lm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, /* .flash_attn = */ fa, /* .devices = */ devs, /* .tensor_split = */ ts, /* .tensor_buft_overrides = */ ot, - /* .use_mmap = */ mmp, - /* .use_direct_io= */ dio, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, /* .no_host = */ noh, @@ -1367,14 +1399,15 @@ struct test { int n_gpu_layers; int n_cpu_moe; llama_split_mode split_mode; + llama_load_mode load_mode; int main_gpu; bool no_kv_offload; bool flash_attn; std::vector devices; std::vector tensor_split; std::vector tensor_buft_overrides; - bool use_mmap; - bool use_direct_io; + // bool use_mmap; + // bool use_direct_io; bool embeddings; bool no_op_offload; bool no_host; @@ -1405,14 +1438,13 @@ struct test { n_gpu_layers = inst.n_gpu_layers; n_cpu_moe = inst.n_cpu_moe; split_mode = inst.split_mode; + load_mode = inst.load_mode; main_gpu = inst.main_gpu; no_kv_offload = inst.no_kv_offload; flash_attn = 
inst.flash_attn; devices = inst.devices; tensor_split = inst.tensor_split; tensor_buft_overrides = inst.tensor_buft_overrides; - use_mmap = inst.use_mmap; - use_direct_io = inst.use_direct_io; embeddings = inst.embeddings; no_op_offload = inst.no_op_offload; no_host = inst.no_host; @@ -1472,7 +1504,7 @@ struct test { "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split", - "tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings", + "tensor_buft_overrides", "load_mode", "embeddings", "no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" }; @@ -1489,9 +1521,12 @@ struct test { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || - field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") { + field == "embeddings" || field == "no_host") { return BOOL; } + if (field == "load_mode") { + return STRING; + } if (field == "avg_ts" || field == "stddev_ts") { return FLOAT; } @@ -1561,8 +1596,7 @@ struct test { devices_to_string(devices), tensor_split_str, tensor_buft_overrides_str, - std::to_string(use_mmap), - std::to_string(use_direct_io), + load_mode_str(load_mode), std::to_string(embeddings), std::to_string(no_op_offload), std::to_string(no_host), @@ -1745,12 +1779,15 @@ struct markdown_printer : public printer { if (field == "devices") { return -12; } - if (field == "use_mmap") { - return 4; - } - if (field == "use_direct_io") { - return 3; + if (field == "load_mode") { + return 5; } + // if (field == "use_mmap") { + // return 4; + // } + // if (field == "use_direct_io") { + // return 3; + // } if (field == "test") { return 15; } @@ -1785,11 +1822,8 @@ struct markdown_printer : public printer { if (field == "flash_attn") { return "fa"; } - if (field == "use_mmap") { - return "mmap"; - } - if (field == "use_direct_io") { - return "dio"; + if (field == "load_mode") { + return "lm"; } if (field == "embeddings") { return "embd"; @@ -1872,11 +1906,8 @@ struct markdown_printer : public printer { if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) { fields.emplace_back("tensor_buft_overrides"); } - if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { - fields.emplace_back("use_mmap"); - } - if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) { - fields.emplace_back("use_direct_io"); + if (params.load_mode.size() > 1 || params.load_mode != cmd_params_defaults.load_mode) { + fields.emplace_back("load_mode"); } if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) { fields.emplace_back("embeddings"); @@ -2102,11 +2133,11 @@ int main(int argc, char ** argv) { fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n"); #endif + cmd_params params = parse_cmd_params(argc, argv); + // initialize backends ggml_backend_load_all(); - cmd_params params = parse_cmd_params(argc, argv); - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); if (!cpu_dev) { fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); diff --git a/tools/server/README.md b/tools/server/README.md index 554444d74b..33af0a0ece 100644 --- a/tools/server/README.md +++ 
b/tools/server/README.md @@ -73,9 +73,10 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | -| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | +| `--mlock` | DEPRECATED: force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | +| `--mmap, --no-mmap` | DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | DEPRECATED: use DirectIO if available
(env: LLAMA_ARG_DIO) | +| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- none: no special loading mode
- mmap: memory-map model (if mmap disabled, slower load but may reduce pageouts if not using mlock)
- mlock: force system to keep model in RAM rather than swapping or compressing
- dio: use DirectIO if available

(env: LLAMA_ARG_LOAD_MODE) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -151,7 +152,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | @@ -164,7 +165,8 @@ For the full list of features, please refer to [server's changelog](https://gith | -------- | ----------- | | `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | | `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | -| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | @@ -192,6 +194,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | | `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG) | | `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG_FILE) | +| `--webui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_WEBUI_MCP_PROXY) | | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | @@ -215,11 +218,12 @@ For the full list of features, please refer to [server's changelog](https://gith | `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)
(env: LLAMA_ARG_MODELS_AUTOLOAD) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | -| `-rea, --resoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | @@ -234,7 +238,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | -| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) | +| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | | `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) | | `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) | | `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) |
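
Usage sketch (illustration only, not part of the patch): with this change the loading behaviour is selected through a single `llama_load_mode` value instead of the three removed booleans. On the CLI that is `--load-mode {none,mmap,mlock,dio}` or the `LLAMA_ARG_LOAD_MODE` environment variable; through the C API it is the new `load_mode` field of `llama_model_params`. A minimal sketch of the API side, assuming a placeholder model path and only the public functions already exposed by `llama.h`:

```cpp
// Minimal sketch of the new load_mode API surface introduced by this patch.
// "model.gguf" is a placeholder path; error handling is reduced to the essentials.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    // default is LLAMA_LOAD_MODE_MMAP; previously this would have been
    //   mparams.use_mlock = true;
    mparams.load_mode = LLAMA_LOAD_MODE_MLOCK; // keep weights resident in RAM

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```

Since `use_mmap`, `use_direct_io` and `use_mlock` are removed from `llama_model_params`, downstream code that set any of them needs the equivalent single-enum assignment shown above.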