From 6077971ec2eee27a6797495a06b209899f8f856c Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Mar 2026 13:56:17 +0800 Subject: [PATCH 01/11] args: refactor mlock/mmap/directio into load-mode Signed-off-by: Aaron Teo --- common/arg.cpp | 35 +++++- common/common.cpp | 7 +- common/common.h | 7 +- examples/diffusion/diffusion-cli.cpp | 7 +- examples/training/finetune.cpp | 5 +- include/llama.h | 14 ++- src/llama-model-loader.cpp | 34 ++--- src/llama-model-loader.h | 5 +- src/llama-model.cpp | 7 +- src/llama-quant.cpp | 8 +- src/llama.cpp | 10 +- tests/test-arg-parser.cpp | 26 +++- tests/test-model-load-cancel.cpp | 3 +- tests/test-quantize-stats.cpp | 2 +- tools/llama-bench/llama-bench.cpp | 178 +++++++++++++++++++-------- 15 files changed, 242 insertions(+), 106 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index aad70ec546..80ae6cb202 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3,11 +3,14 @@ #include "chat.h" #include "common.h" #include "download.h" +#include "ggml.h" #include "json-schema-to-grammar.h" +#include "llama.h" #include "log.h" #include "sampling.h" #include "speculative.h" #include "preset.h" +#include // fix problem with std::min and std::max #if defined(_WIN32) @@ -2206,25 +2209,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", [](common_params & params) { - params.use_mlock = true; + throw std::runtime_error("error: --mlock is deprecated. use --load-mode mlock instead"); + + GGML_UNUSED(params); } ).set_env("LLAMA_ARG_MLOCK")); add_opt(common_arg( {"--mmap"}, {"--no-mmap"}, - string_format("whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"), + "whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)", [](common_params & params, bool value) { - params.use_mmap = value; + throw std::runtime_error("error: --mmap and --no-mmap are deprecated. use --load-mode mmap instead"); + + GGML_UNUSED(params); + GGML_UNUSED(value); } ).set_env("LLAMA_ARG_MMAP")); add_opt(common_arg( {"-dio", "--direct-io"}, {"-ndio", "--no-direct-io"}, - string_format("use DirectIO if available. (default: %s)", params.use_direct_io ? "enabled" : "disabled"), + "use DirectIO if available", [](common_params & params, bool value) { - params.use_direct_io = value; + throw std::invalid_argument("error: -dio/--direct-io and -ndio/--no-direct-io are deprecated. use --load-mode dio instead"); + + GGML_UNUSED(params); + GGML_UNUSED(value); } ).set_env("LLAMA_ARG_DIO")); + add_opt(common_arg( + {"-lm", "--load-mode"}, "MODE", + "model loading mode (default: mmap)\n" + "- mlock: force system to keep model in RAM rather than swapping or compressing.\n" + "- mmap: memory-map model. 
(if mmap disabled, slower load but may reduce pageouts if not using mlock)\n" + "- dio: use DirectIO if available.\n", + [](common_params & params, const std::string & value) { + if (value == "") { params.load_mode = LLAMA_LOAD_MODE_MMAP; } + else if (value == "mlock") { params.load_mode = LLAMA_LOAD_MODE_MLOCK; } + else if (value == "mmap") { params.load_mode = LLAMA_LOAD_MODE_MMAP; } + else if (value == "dio") { params.load_mode = LLAMA_LOAD_MODE_DIRECT_IO; } + else { throw std::invalid_argument("invalid value"); } + } + )); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" diff --git a/common/common.cpp b/common/common.cpp index 59d75a3b95..cd7f87ed93 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1320,9 +1320,10 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; - mparams.use_mmap = params.use_mmap; - mparams.use_direct_io = params.use_direct_io; - mparams.use_mlock = params.use_mlock; + // mparams.use_mmap = params.use_mmap; + // mparams.use_direct_io = params.use_direct_io; + // mparams.use_mlock = params.use_mlock; + mparams.load_mode = params.load_mode; mparams.check_tensors = params.check_tensors; mparams.use_extra_bufts = !params.no_extra_bufts; mparams.no_host = params.no_host; diff --git a/common/common.h b/common/common.h index 62201ea1ad..32b894ffeb 100644 --- a/common/common.h +++ b/common/common.h @@ -443,6 +443,7 @@ struct common_params { std::vector fit_params_target = std::vector(llama_max_devices(), 1024 * 1024*1024); enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs + enum llama_load_mode load_mode = LLAMA_LOAD_MODE_MMAP; // how to load the model struct cpu_params cpuparams; struct cpu_params cpuparams_batch; @@ -532,9 +533,9 @@ struct common_params { bool kv_unified = false; // enable unified KV cache bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - bool use_mmap = true; // enable mmap to use filesystem cache - bool use_direct_io = false; // read from disk without buffering - bool use_mlock = false; // use mlock to keep model in memory + // bool use_mmap = true; // enable mmap to use filesystem cache + // bool use_direct_io = false; // read from disk without buffering + // bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool no_kv_offload = false; // disable KV offloading diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index d38bfe7f82..a6280326dd 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -555,9 +555,10 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = params.n_gpu_layers; model_params.devices = params.devices.data(); - model_params.use_mmap = params.use_mmap; - model_params.use_direct_io = params.use_direct_io; - model_params.use_mlock = params.use_mlock; + model_params.load_mode = params.load_mode; + // model_params.use_mmap = params.use_mmap; + // model_params.use_direct_io = params.use_direct_io; + // model_params.use_mlock = params.use_mlock; model_params.check_tensors = params.check_tensors; llama_model * model = 
llama_model_load_from_file(params.model.path.c_str(), model_params); diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index e20f89488f..224961cd64 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -24,10 +24,11 @@ int main(int argc, char ** argv) { return 1; } - if (params.use_mmap) { + if (params.load_mode == LLAMA_LOAD_MODE_MMAP) { LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__); - params.use_mmap = false; + // params.use_mmap = false; + params.load_mode = LLAMA_LOAD_MODE_NONE; } if (params.cache_type_k != GGML_TYPE_F32) { LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__); diff --git a/include/llama.h b/include/llama.h index 6e72db7e3c..81921ff5e1 100644 --- a/include/llama.h +++ b/include/llama.h @@ -196,6 +196,13 @@ extern "C" { LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported }; + enum llama_load_mode { + LLAMA_LOAD_MODE_NONE = 0, + LLAMA_LOAD_MODE_MMAP = 1, + LLAMA_LOAD_MODE_MLOCK = 2, + LLAMA_LOAD_MODE_DIRECT_IO = 3, + }; + // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) typedef struct llama_token_data { llama_token id; // token id @@ -290,6 +297,7 @@ extern "C" { int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers enum llama_split_mode split_mode; // how to split the model across multiple GPUs + enum llama_load_mode load_mode; // how to load the model into memory // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE int32_t main_gpu; @@ -310,9 +318,9 @@ extern "C" { // Keep the booleans together to avoid misalignment during copy-by-value. 
bool vocab_only; // only load the vocabulary, no weights - bool use_mmap; // use mmap if possible - bool use_direct_io; // use direct io, takes precedence over use_mmap when supported - bool use_mlock; // force system to keep model in RAM + // bool use_mmap; // DEPRECATED: use mmap if possible + // bool use_direct_io; // DEPRECATED: use direct io, takes precedence over use_mmap when supported + // bool use_mlock; // DEPRECATED: force system to keep model in RAM bool check_tensors; // validate model tensor data bool use_extra_bufts; // use extra buffer types (used for weight repacking) bool no_host; // bypass host buffer allowing extra buffers to be used diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 413f34c226..62b6331265 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -4,6 +4,7 @@ #include "ggml.h" #include "gguf.h" #include "llama-hparams.h" +#include "llama.h" #include #include @@ -511,8 +512,9 @@ llama_model_loader::llama_model_loader( void * set_tensor_data_ud, const std::string & fname, std::vector & splits, - bool use_mmap, - bool use_direct_io, + llama_load_mode load_mode, + // bool use_mmap, + // bool use_direct_io, bool check_tensors, bool no_alloc, const llama_model_kv_override * param_overrides_p, @@ -551,19 +553,19 @@ llama_model_loader::llama_model_loader( files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io)); contexts.emplace_back(ctx); - if (use_mmap && use_direct_io) { - if (files.back()->has_direct_io()) { - LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__); - use_mmap = false; - } else { - LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__); - use_direct_io = false; + // if (use_mmap && use_direct_io) { + // if (files.back()->has_direct_io()) { + // LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__); + // use_mmap = false; + // } else { + // LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__); + // use_direct_io = false; - // reopen file using std::fopen for mmap - files.pop_back(); - files.emplace_back(new llama_file(fname.c_str(), "rb", false)); - } - } + // // reopen file using std::fopen for mmap + // files.pop_back(); + // files.emplace_back(new llama_file(fname.c_str(), "rb", false)); + // } + // } // Save tensors data offset of the main file. 
// For subsidiary files, `meta` tensor data offset must not be used, @@ -778,8 +780,8 @@ llama_model_loader::llama_model_loader( use_mmap = false; } - this->use_mmap = use_mmap; - this->use_direct_io = use_direct_io; + this->use_mmap = load_mode == LLAMA_LOAD_MODE_MMAP; + this->use_direct_io = load_mode == LLAMA_LOAD_MODE_DIRECT_IO; this->check_tensors = check_tensors; this->no_alloc = no_alloc; } diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index ed5de729ca..e82caf5870 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -125,8 +125,9 @@ struct llama_model_loader { void * set_tensor_data_ud, const std::string & fname, std::vector & splits, // optional, only need if the split does not follow naming scheme - bool use_mmap, - bool use_direct_io, + enum llama_load_mode load_mode, + // bool use_mmap, + // bool use_direct_io, bool check_tensors, bool no_alloc, const llama_model_kv_override * param_overrides_p, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f8caad2889..1bbb35b6ac 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14,6 +14,7 @@ #include "ggml-cpp.h" +#include "llama.h" #include "models/models.h" #include @@ -2581,7 +2582,7 @@ void llama_model::load_vocab(llama_model_loader & ml) { bool llama_model::load_tensors(llama_model_loader & ml) { const auto & split_mode = params.split_mode; - const auto & use_mlock = params.use_mlock; + const auto & use_mlock = params.load_mode == LLAMA_LOAD_MODE_MLOCK; const auto & tensor_split = params.tensor_split; const int n_layer = hparams.n_layer; @@ -8698,15 +8699,13 @@ llama_model_params llama_model_default_params() { /*.tensor_buft_overrides =*/ nullptr, /*.n_gpu_layers =*/ -1, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, + /*.load_mode =*/ LLAMA_LOAD_MODE_MMAP, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.vocab_only =*/ false, - /*.use_mmap =*/ true, - /*.use_direct_io =*/ false, - /*.use_mlock =*/ false, /*.check_tensors =*/ false, /*.use_extra_bufts =*/ true, /*.no_host =*/ false, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8e8ce23124..40dcf15ac1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -846,9 +846,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // mmap consistently increases speed on Linux, and also increases speed on Windows with // hot cache. It may cause a slowdown on macOS, possibly related to free memory. 
#if defined(__linux__) || defined(_WIN32) - constexpr bool use_mmap = true; + // constexpr bool use_mmap = true; + constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_MMAP; #else - constexpr bool use_mmap = false; + constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_NONE; + // constexpr bool use_mmap = false; #endif llama_model_kv_override * kv_overrides = nullptr; @@ -859,7 +861,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::vector splits = {}; llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr, - fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); + fname_inp, splits, load_mode, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 872e659edc..423765b108 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -74,8 +74,9 @@ static std::vector llama_get_device_memory_data( llama_model_params mparams_copy = *mparams; mparams_copy.no_alloc = true; - mparams_copy.use_mmap = false; - mparams_copy.use_mlock = false; + mparams_copy.load_mode = LLAMA_LOAD_MODE_NONE; + // mparams_copy.use_mmap = false; + // mparams_copy.use_mlock = false; llama_model * model = llama_model_load_from_file(path_model, mparams_copy); if (model == nullptr) { @@ -837,7 +838,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io, + llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.load_mode, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); @@ -1035,7 +1036,8 @@ struct llama_model * llama_model_init_from_user( GGML_ASSERT(metadata != nullptr); std::string path_model; std::vector splits = {}; - params.use_mmap = false; + // params.use_mmap = false; + params.load_mode = LLAMA_LOAD_MODE_NONE; params.use_extra_bufts = false; return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params); } diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 67f8ca632c..230b91e9d3 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -156,13 +156,35 @@ int main(void) { printf("test-arg-parser: test negated environment variables\n\n"); - setenv("LLAMA_ARG_MMAP", "0", true); + setenv("LLAMA_ARG_LOAD_MODE", "none", true); + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_NONE); + + setenv("LLAMA_ARG_LOAD_MODE", "mlock", true); + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_MLOCK); + + setenv("LLAMA_ARG_LOAD_MODE", "mmap", true); setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format argv = {"binary_name"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.use_mmap == false); + assert(params.load_mode == LLAMA_LOAD_MODE_MMAP); assert(params.no_perf == true); + setenv("LLAMA_ARG_LOAD_MODE", "dio", true); + 
argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_DIRECT_IO); + + // setenv("LLAMA_ARG_MMAP", "0", true); + // setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format + // argv = {"binary_name"}; + // assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + // assert(params.use_mmap == false); + // assert(params.no_perf == true); + printf("test-arg-parser: test environment variables being overwritten\n\n"); setenv("LLAMA_ARG_MODEL", "blah.gguf", true); diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index 9095826fa9..7a13886c32 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -16,7 +16,8 @@ int main(int argc, char *argv[] ) { llama_backend_init(); auto params = llama_model_params{}; - params.use_mmap = false; + // params.use_mmap = false; + params.load_mode = LLAMA_LOAD_MODE_NONE; params.progress_callback = [](float progress, void * ctx){ (void) ctx; return progress > 0.50; diff --git a/tests/test-quantize-stats.cpp b/tests/test-quantize-stats.cpp index de587d456d..dd6374e4d0 100644 --- a/tests/test-quantize-stats.cpp +++ b/tests/test-quantize-stats.cpp @@ -309,7 +309,7 @@ int main(int argc, char ** argv) { { auto mparams = llama_model_default_params(); - mparams.use_mlock = false; + // mparams.use_mlock = false; model = llama_model_load_from_file(params.model.c_str(), mparams); diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index b0f1d6b936..ec233fd01b 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -265,6 +266,21 @@ static const char * split_mode_str(llama_split_mode mode) { } } +static const char * load_mode_str(llama_load_mode mode) { + switch (mode) { + case LLAMA_LOAD_MODE_NONE: + return "none"; + case LLAMA_LOAD_MODE_MLOCK: + return "mlock"; + case LLAMA_LOAD_MODE_MMAP: + return "mmap"; + case LLAMA_LOAD_MODE_DIRECT_IO: + return "dio"; + default: + GGML_ABORT("invalid load mode"); + } +} + static std::string pair_str(const std::pair & p) { static char buf[32]; snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second); @@ -331,14 +347,15 @@ struct cmd_params { std::vector n_gpu_layers; std::vector n_cpu_moe; std::vector split_mode; + std::vector load_mode; std::vector main_gpu; std::vector no_kv_offload; std::vector flash_attn; std::vector> devices; std::vector> tensor_split; std::vector> tensor_buft_overrides; - std::vector use_mmap; - std::vector use_direct_io; + // std::vector use_mmap; + // std::vector use_direct_io; std::vector embeddings; std::vector no_op_offload; std::vector no_host; @@ -373,14 +390,15 @@ static const cmd_params cmd_params_defaults = { /* n_gpu_layers */ { 99 }, /* n_cpu_moe */ { 0 }, /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, + /* load_mode */ { LLAMA_LOAD_MODE_MMAP }, /* main_gpu */ { 0 }, /* no_kv_offload */ { false }, /* flash_attn */ { false }, /* devices */ { {} }, /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, /* tensor_buft_overrides*/ { std::vector{ { nullptr, nullptr } } }, - /* use_mmap */ { true }, - /* use_direct_io */ { false }, + // /* use_mmap */ { true }, + // /* use_direct_io */ { false }, /* embeddings */ { false }, /* no_op_offload */ { false }, /* no_host */ { false }, @@ -443,8 +461,9 @@ static void 
print_usage(int /* argc */, char ** argv) { printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -dev, --device (default: auto)\n"); - printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); - printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str()); + printf(" -mmp, --mmap <0|1> (DEPRECATED)\n"); + printf(" -dio, --direct-io <0|1> (DEPRECATED)\n"); + printf(" -lm, --load-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.load_mode, load_mode_str), ",").c_str()); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -ot --override-tensor =;...\n"); @@ -747,6 +766,34 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); + } else if (arg == "-lm" || arg == "--load-mode") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + + std::vector modes; + for (const auto & m : p) { + llama_load_mode mode; + if (m == "none") { + mode = LLAMA_LOAD_MODE_NONE; + } else if (m == "mlock") { + mode = LLAMA_LOAD_MODE_MLOCK; + } else if (m == "mmap") { + mode = LLAMA_LOAD_MODE_MMAP; + } else if (m == "dio") { + mode = LLAMA_LOAD_MODE_DIRECT_IO; + } else { + invalid_param = true; + break; + } + modes.push_back(mode); + } + if (invalid_param) { + break; + } + params.load_mode.insert(params.load_mode.end(), modes.begin(), modes.end()); } else if (arg == "-mg" || arg == "--main-gpu") { if (++i >= argc) { invalid_param = true; @@ -788,15 +835,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); - params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); + throw std::invalid_argument("error: -mmp/--mmap option is deprecated; please use -lm/--load-mode mmap instead"); } else if (arg == "-dio" || arg == "--direct-io") { if (++i >= argc) { invalid_param = true; break; } - auto p = string_split(argv[i], split_delim); - params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end()); + throw std::invalid_argument("error: -dio/--direct-io option is deprecated; please use -lm/--load-mode dio instead"); } else if (arg == "-embd" || arg == "--embeddings") { if (++i >= argc) { invalid_param = true; @@ -1050,6 +1095,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } + if (params.load_mode.empty()) { + params.load_mode = cmd_params_defaults.load_mode; + } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } @@ -1068,12 +1116,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.tensor_buft_overrides.empty()) { params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides; } - if (params.use_mmap.empty()) { - params.use_mmap = cmd_params_defaults.use_mmap; - } - if (params.use_direct_io.empty()) { - params.use_direct_io = cmd_params_defaults.use_direct_io; - } + // if (params.use_mmap.empty()) { + // params.use_mmap = cmd_params_defaults.use_mmap; + // } + // if (params.use_direct_io.empty()) { + // params.use_direct_io = 
cmd_params_defaults.use_direct_io; + // } if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } @@ -1115,14 +1163,15 @@ struct cmd_params_instance { int n_gpu_layers; int n_cpu_moe; llama_split_mode split_mode; + llama_load_mode load_mode; int main_gpu; bool no_kv_offload; bool flash_attn; std::vector devices; std::vector tensor_split; std::vector tensor_buft_overrides; - bool use_mmap; - bool use_direct_io; + // bool use_mmap; + // bool use_direct_io; bool embeddings; bool no_op_offload; bool no_host; @@ -1135,10 +1184,11 @@ struct cmd_params_instance { mparams.devices = const_cast(devices.data()); } mparams.split_mode = split_mode; + mparams.load_mode = load_mode; mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); - mparams.use_mmap = use_mmap; - mparams.use_direct_io = use_direct_io; + // mparams.use_mmap = use_mmap; + // mparams.use_direct_io = use_direct_io; mparams.no_host = no_host; if (n_cpu_moe <= 0) { @@ -1184,7 +1234,8 @@ struct cmd_params_instance { return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe && split_mode == other.split_mode && main_gpu == other.main_gpu && tensor_split == other.tensor_split && - use_mmap == other.use_mmap && use_direct_io == other.use_direct_io && + load_mode == other.load_mode && + // use_mmap == other.use_mmap && use_direct_io == other.use_direct_io && devices == other.devices && no_host == other.no_host && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); @@ -1217,12 +1268,13 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & nl : params.n_gpu_layers) for (const auto & ncmoe : params.n_cpu_moe) for (const auto & sm : params.split_mode) + for (const auto & lm : params.load_mode) for (const auto & mg : params.main_gpu) for (const auto & devs : params.devices) for (const auto & ts : params.tensor_split) for (const auto & ot : params.tensor_buft_overrides) - for (const auto & mmp : params.use_mmap) - for (const auto & dio : params.use_direct_io) + // for (const auto & mmp : params.use_mmap) + // for (const auto & dio : params.use_direct_io) for (const auto & noh : params.no_host) for (const auto & embd : params.embeddings) for (const auto & nopo : params.no_op_offload) @@ -1257,14 +1309,15 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_gpu_layers = */ nl, /* .n_cpu_moe = */ ncmoe, /* .split_mode = */ sm, + /* .load_mode = */ lm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, /* .flash_attn = */ fa, /* .devices = */ devs, /* .tensor_split = */ ts, /* .tensor_buft_overrides = */ ot, - /* .use_mmap = */ mmp, - /* .use_direct_io= */ dio, + // /* .use_mmap = */ mmp, + // /* .use_direct_io= */ dio, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, /* .no_host = */ noh, @@ -1292,14 +1345,15 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_gpu_layers = */ nl, /* .n_cpu_moe = */ ncmoe, /* .split_mode = */ sm, + /* .load_mode = */ lm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, /* .flash_attn = */ fa, /* .devices = */ devs, /* .tensor_split = */ ts, /* .tensor_buft_overrides = */ ot, - /* .use_mmap = */ mmp, - /* .use_direct_io= */ dio, + // /* .use_mmap = */ mmp, + // /* .use_direct_io= */ dio, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, /* .no_host = */ noh, @@ -1327,14 +1381,15 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_gpu_layers = */ nl, /* .n_cpu_moe = */ ncmoe, /* .split_mode = */ sm, + /* .load_mode = 
*/ lm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, /* .flash_attn = */ fa, /* .devices = */ devs, /* .tensor_split = */ ts, /* .tensor_buft_overrides = */ ot, - /* .use_mmap = */ mmp, - /* .use_direct_io= */ dio, + // /* .use_mmap = */ mmp, + // /* .use_direct_io= */ dio, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, /* .no_host = */ noh, @@ -1367,14 +1422,15 @@ struct test { int n_gpu_layers; int n_cpu_moe; llama_split_mode split_mode; + llama_load_mode load_mode; int main_gpu; bool no_kv_offload; bool flash_attn; std::vector devices; std::vector tensor_split; std::vector tensor_buft_overrides; - bool use_mmap; - bool use_direct_io; + // bool use_mmap; + // bool use_direct_io; bool embeddings; bool no_op_offload; bool no_host; @@ -1405,14 +1461,15 @@ struct test { n_gpu_layers = inst.n_gpu_layers; n_cpu_moe = inst.n_cpu_moe; split_mode = inst.split_mode; + load_mode = inst.load_mode; main_gpu = inst.main_gpu; no_kv_offload = inst.no_kv_offload; flash_attn = inst.flash_attn; devices = inst.devices; tensor_split = inst.tensor_split; tensor_buft_overrides = inst.tensor_buft_overrides; - use_mmap = inst.use_mmap; - use_direct_io = inst.use_direct_io; + // use_mmap = inst.use_mmap; + // use_direct_io = inst.use_direct_io; embeddings = inst.embeddings; no_op_offload = inst.no_op_offload; no_host = inst.no_host; @@ -1472,7 +1529,7 @@ struct test { "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split", - "tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings", + "tensor_buft_overrides", "load_mode", "embeddings", "no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" }; @@ -1489,9 +1546,12 @@ struct test { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || - field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") { + field == "embeddings" || field == "no_host") { return BOOL; } + if (field == "load_mode") { + return STRING; + } if (field == "avg_ts" || field == "stddev_ts") { return FLOAT; } @@ -1561,8 +1621,9 @@ struct test { devices_to_string(devices), tensor_split_str, tensor_buft_overrides_str, - std::to_string(use_mmap), - std::to_string(use_direct_io), + // std::to_string(use_mmap), + // std::to_string(use_direct_io), + load_mode_str(load_mode), std::to_string(embeddings), std::to_string(no_op_offload), std::to_string(no_host), @@ -1745,12 +1806,15 @@ struct markdown_printer : public printer { if (field == "devices") { return -12; } - if (field == "use_mmap") { - return 4; - } - if (field == "use_direct_io") { - return 3; + if (field == "load_mode") { + return 5; } + // if (field == "use_mmap") { + // return 4; + // } + // if (field == "use_direct_io") { + // return 3; + // } if (field == "test") { return 15; } @@ -1785,11 +1849,14 @@ struct markdown_printer : public printer { if (field == "flash_attn") { return "fa"; } - if (field == "use_mmap") { - return "mmap"; - } - if (field == "use_direct_io") { - return "dio"; + // if (field == "use_mmap") { + // return "mmap"; + // } + // if (field == "use_direct_io") { + // return "dio"; + // } + if (field == "load_mode") { + return "lm"; } if (field == "embeddings") { return "embd"; @@ -1872,12 +1939,15 @@ struct markdown_printer : public printer { if (params.tensor_buft_overrides.size() > 1 || 
!vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) { fields.emplace_back("tensor_buft_overrides"); } - if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { - fields.emplace_back("use_mmap"); - } - if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) { - fields.emplace_back("use_direct_io"); + if (params.load_mode.size() > 1 || params.load_mode != cmd_params_defaults.load_mode) { + fields.emplace_back("load_mode"); } + // if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { + // fields.emplace_back("use_mmap"); + // } + // if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) { + // fields.emplace_back("use_direct_io"); + // } if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) { fields.emplace_back("embeddings"); } @@ -2102,11 +2172,11 @@ int main(int argc, char ** argv) { fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n"); #endif + cmd_params params = parse_cmd_params(argc, argv); + // initialize backends ggml_backend_load_all(); - cmd_params params = parse_cmd_params(argc, argv); - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); if (!cpu_dev) { fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); From bb78676fb709f45935711d156955810a5c9dd177 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Mar 2026 14:11:41 +0800 Subject: [PATCH 02/11] args: add load mode env Signed-off-by: Aaron Teo --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 80ae6cb202..a189567023 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2249,7 +2249,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex else if (value == "dio") { params.load_mode = LLAMA_LOAD_MODE_DIRECT_IO; } else { throw std::invalid_argument("invalid value"); } } - )); + ).set_env("LLAMA_ARG_LOAD_MODE")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" From 27f53013c97e3bc401035fcb0a43cf5ee5c2f4af Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Mar 2026 14:32:32 +0800 Subject: [PATCH 03/11] args: use invalid_argument instead of runtime_error Signed-off-by: Aaron Teo --- common/arg.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index a189567023..a63bc5b9a2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -10,7 +10,6 @@ #include "sampling.h" #include "speculative.h" #include "preset.h" -#include // fix problem with std::min and std::max #if defined(_WIN32) @@ -2207,9 +2206,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } add_opt(common_arg( {"--mlock"}, - "force system to keep model in RAM rather than swapping or compressing", + "DEPRECATED: force system to keep model in RAM rather than swapping or compressing", [](common_params & params) { - throw std::runtime_error("error: --mlock is deprecated. use --load-mode mlock instead"); + throw std::invalid_argument("--mlock is deprecated. use --load-mode mlock instead"); GGML_UNUSED(params); } @@ -2217,9 +2216,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--mmap"}, {"--no-mmap"}, - "whether to memory-map model. 
(if mmap disabled, slower load but may reduce pageouts if not using mlock)", + "DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)", [](common_params & params, bool value) { - throw std::runtime_error("error: --mmap and --no-mmap are deprecated. use --load-mode mmap instead"); + throw std::invalid_argument("--mmap and --no-mmap are deprecated. use --load-mode mmap instead"); GGML_UNUSED(params); GGML_UNUSED(value); @@ -2228,9 +2227,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-dio", "--direct-io"}, {"-ndio", "--no-direct-io"}, - "use DirectIO if available", + "DEPRECATED: use DirectIO if available", [](common_params & params, bool value) { - throw std::invalid_argument("error: -dio/--direct-io and -ndio/--no-direct-io are deprecated. use --load-mode dio instead"); + throw std::invalid_argument("-dio/--direct-io and -ndio/--no-direct-io are deprecated. use --load-mode dio instead"); GGML_UNUSED(params); GGML_UNUSED(value); From 856d73b10662f410325caaed31bcaf51592c2592 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Mar 2026 14:49:46 +0800 Subject: [PATCH 04/11] tests: disable --no-mmap test Signed-off-by: Aaron Teo --- tests/test-arg-parser.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 230b91e9d3..ee13a3e5cb 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -99,8 +99,8 @@ int main(void) { assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING)); // negated arg - argv = {"binary_name", "--no-mmap"}; - assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + // argv = {"binary_name", "--no-mmap"}; + // assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); printf("test-arg-parser: test valid usage\n\n"); From 0935e842b00cbbe32a83137b97181674a24f2e91 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Mar 2026 15:08:31 +0800 Subject: [PATCH 05/11] docs: update docs via gen-docs tool Signed-off-by: Aaron Teo --- tools/cli/README.md | 15 ++++++++++----- tools/completion/README.md | 12 ++++++++---- tools/server/README.md | 14 +++++++++----- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/tools/cli/README.md b/tools/cli/README.md index 22d3fc87e9..139d5ab5b6 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -56,9 +56,10 @@ | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | -| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | +| `--mlock` | DEPRECATED: force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | +| `--mmap, --no-mmap` | DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | DEPRECATED: use DirectIO if available
(env: LLAMA_ARG_DIO) | +| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- mlock: force system to keep model in RAM rather than swapping or compressing.
- mmap: memory-map the model (if mmap is not used, loading is slower but may reduce pageouts when not using mlock)&#10;
- dio: use DirectIO if available.

(env: LLAMA_ARG_LOAD_MODE) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -147,7 +148,8 @@ | -------- | ----------- | | `--display-prompt, --no-display-prompt` | whether to print prompt at generation (default: true) | | `-co, --color [on\|off\|auto]` | Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal | -| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-sys, --system-prompt PROMPT` | system prompt to use with model (if applicable, depending on chat template) | @@ -172,9 +174,12 @@ | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_CHAT_TEMPLATE_KWARGS) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`&#10;
(default: auto)
(env: LLAMA_ARG_THINK) | -| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | | `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX) | | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_DRAFT_MIN) | diff --git a/tools/completion/README.md b/tools/completion/README.md index f868c2c7d7..9a9cd4287e 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -139,9 +139,10 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | -| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | +| `--mlock` | DEPRECATED: force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | +| `--mmap, --no-mmap` | DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | DEPRECATED: use DirectIO if available
(env: LLAMA_ARG_DIO) | +| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- mlock: force system to keep model in RAM rather than swapping or compressing.
- mmap: memory-map the model (if mmap is not used, loading is slower but may reduce pageouts when not using mlock)&#10;
- dio: use DirectIO if available.

(env: LLAMA_ARG_LOAD_MODE) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -252,9 +253,12 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-gaw, --grp-attn-w N` | group-attention width (default: 512)
(env: LLAMA_ARG_GRP_ATTN_W) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`&#10;
(default: auto)
(env: LLAMA_ARG_THINK) | -| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | +| `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | diff --git a/tools/server/README.md b/tools/server/README.md index df59e2d9b7..12cbdbde79 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -73,9 +73,10 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | -| `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | +| `--mlock` | DEPRECATED: force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | +| `--mmap, --no-mmap` | DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | DEPRECATED: use DirectIO if available
(env: LLAMA_ARG_DIO) | +| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- mlock: force system to keep model in RAM rather than swapping or compressing.
- mmap: memory-map the model (if mmap is not used, loading is slower but may reduce pageouts when not using mlock)&#10;
- dio: use DirectIO if available.

(env: LLAMA_ARG_LOAD_MODE) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -164,7 +165,8 @@ For the full list of features, please refer to [server's changelog](https://gith | -------- | ----------- | | `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | | `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | -| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-ctxcp, --ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 32)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `-cpent, --checkpoint-every-n-tokens N` | create a checkpoint every n tokens during prefill (processing), -1 to disable (default: 8192)
(env: LLAMA_ARG_CHECKPOINT_EVERY_NT) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | @@ -192,6 +194,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | | `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG) | | `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG_FILE) | +| `--webui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_WEBUI_MCP_PROXY) | | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | @@ -215,11 +218,12 @@ For the full list of features, please refer to [server's changelog](https://gith | `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)
(env: LLAMA_ARG_MODELS_AUTOLOAD) | | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | -| `-rea, --resoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | +| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))
(env: LLAMA_ARG_REASONING) | | `--reasoning-budget N` | token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--reasoning-budget-message MESSAGE` | message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)
(env: LLAMA_ARG_THINK_BUDGET_MESSAGE) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | From d7fcab8cde698a14192ab97c6bcd93347ab8df5b Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Mar 2026 15:11:19 +0800 Subject: [PATCH 06/11] docs: update llama-bench docs Signed-off-by: Aaron Teo --- tools/llama-bench/README.md | 85 +++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index c837bb6d26..bd6d11a19e 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -20,48 +20,59 @@ Performance testing tool for llama.cpp. ## Syntax ``` -usage: llama-bench [options] +usage: build/bin/llama-bench [options] options: -h, --help - --numa numa mode (default: disabled) - -r, --repetitions number of times to repeat each test (default: 5) - --prio <0|1|2|3> process/thread priority (default: 0) - --delay <0...N> (seconds) delay between each test (default: 0) - -o, --output output format printed to stdout (default: md) - -oe, --output-err output format printed to stderr (default: none) - --list-devices list available devices and exit - -v, --verbose verbose output - --progress print test progress indicators - -rpc, --rpc register RPC devices (comma separated) + --numa numa mode (default: disabled) + -r, --repetitions number of times to repeat each test (default: 5) + --prio <-1|0|1|2|3> process/thread priority (default: 0) + --delay <0...N> (seconds) delay between each test (default: 0) + -o, --output output format printed to stdout (default: md) + -oe, --output-err output format printed to stderr (default: none) + --list-devices list available devices and exit + -v, --verbose verbose output + --progress print test progress indicators + --no-warmup skip warmup runs before benchmarking test parameters: - -m, --model (default: models/7B/ggml-model-q4_0.gguf) - -p, --n-prompt (default: 512) - -n, --n-gen (default: 128) - -pg (default: ) - -d, --n-depth (default: 0) - -b, --batch-size (default: 2048) - -ub, --ubatch-size (default: 512) - -ctk, --cache-type-k (default: f16) - -ctv, --cache-type-v (default: f16) - -t, --threads (default: system dependent) - -C, --cpu-mask (default: 0x0) - --cpu-strict <0|1> (default: 0) - --poll <0...100> (default: 50) - -ngl, --n-gpu-layers (default: 99) - -ncmoe, --n-cpu-moe (default: 0) - -sm, --split-mode (default: layer) - -mg, --main-gpu (default: 0) - -nkvo, --no-kv-offload <0|1> (default: 0) - -fa, --flash-attn <0|1> (default: 0) - -dev, --device (default: auto) - -mmp, --mmap <0|1> (default: 1) - -embd, --embeddings <0|1> (default: 0) - -ts, --tensor-split (default: 0) - -ot --override-tensors =;... - (default: disabled) - -nopo, --no-op-offload <0|1> (default: 0) + -m, --model (default: models/7B/ggml-model-q4_0.gguf) + -hf, -hfr, --hf-repo /[:quant] Hugging Face model repository; quant is optional, case-insensitive + default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist. + example: unsloth/phi-4-GGUF:Q4_K_M + (default: unused) + -hff, --hf-file Hugging Face model file. 
If specified, it will override the quant in --hf-repo + (default: unused) + -hft, --hf-token Hugging Face access token + (default: value from HF_TOKEN environment variable) + -p, --n-prompt (default: 512) + -n, --n-gen (default: 128) + -pg (default: ) + -d, --n-depth (default: 0) + -b, --batch-size (default: 2048) + -ub, --ubatch-size (default: 512) + -ctk, --cache-type-k (default: f16) + -ctv, --cache-type-v (default: f16) + -t, --threads (default: 8) + -C, --cpu-mask (default: 0x0) + --cpu-strict <0|1> (default: 0) + --poll <0...100> (default: 50) + -ngl, --n-gpu-layers (default: 99) + -ncmoe, --n-cpu-moe (default: 0) + -sm, --split-mode (default: layer) + -mg, --main-gpu (default: 0) + -nkvo, --no-kv-offload <0|1> (default: 0) + -fa, --flash-attn <0|1> (default: 0) + -dev, --device (default: auto) + -mmp, --mmap <0|1> (DEPRECATED) + -dio, --direct-io <0|1> (DEPRECATED) + -lm, --load-mode (default: mmap) + -embd, --embeddings <0|1> (default: 0) + -ts, --tensor-split (default: 0) + -ot --override-tensor =;... + (default: disabled) + -nopo, --no-op-offload <0|1> (default: 0) + --no-host <0|1> (default: 0) Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Ranges can be given as From 2e4f0a74e4a1d333e6af3e88ad46ec7369877693 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Mar 2026 15:33:29 +0800 Subject: [PATCH 07/11] args: add missing `none` option handler Signed-off-by: Aaron Teo --- common/arg.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index a63bc5b9a2..5281ad62ef 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2238,13 +2238,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-lm", "--load-mode"}, "MODE", "model loading mode (default: mmap)\n" - "- mlock: force system to keep model in RAM rather than swapping or compressing.\n" - "- mmap: memory-map model. 
(if mmap disabled, slower load but may reduce pageouts if not using mlock)\n" - "- dio: use DirectIO if available.\n", + "- none: no special loading mode\n" + "- mmap: memory-map model (if mmap disabled, slower load but may reduce pageouts if not using mlock)\n" + "- mlock: force system to keep model in RAM rather than swapping or compressing\n" + "- dio: use DirectIO if available\n", [](common_params & params, const std::string & value) { - if (value == "") { params.load_mode = LLAMA_LOAD_MODE_MMAP; } - else if (value == "mlock") { params.load_mode = LLAMA_LOAD_MODE_MLOCK; } + /**/ if (value == "none") { params.load_mode = LLAMA_LOAD_MODE_NONE; } else if (value == "mmap") { params.load_mode = LLAMA_LOAD_MODE_MMAP; } + else if (value == "mlock") { params.load_mode = LLAMA_LOAD_MODE_MLOCK; } else if (value == "dio") { params.load_mode = LLAMA_LOAD_MODE_DIRECT_IO; } else { throw std::invalid_argument("invalid value"); } } From 21603f86dd599325fdc933e346f45cfbd5afaefb Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sat, 21 Mar 2026 15:42:50 +0800 Subject: [PATCH 08/11] docs: update docs again via `llama-gen-docs` Signed-off-by: Aaron Teo --- tools/cli/README.md | 4 ++-- tools/completion/README.md | 4 ++-- tools/server/README.md | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/cli/README.md b/tools/cli/README.md index 139d5ab5b6..cdfa21e17a 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -59,7 +59,7 @@ | `--mlock` | DEPRECATED: force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--mmap, --no-mmap` | DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_MMAP) | | `-dio, --direct-io, -ndio, --no-direct-io` | DEPRECATED: use DirectIO if available
(env: LLAMA_ARG_DIO) | -| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- mlock: force system to keep model in RAM rather than swapping or compressing.
- mmap: memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
- dio: use DirectIO if available.

(env: LLAMA_ARG_LOAD_MODE) | +| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- none: no special loading mode
- mmap: memory-map model (if mmap disabled, slower load but may reduce pageouts if not using mlock)
- mlock: force system to keep model in RAM rather than swapping or compressing
- dio: use DirectIO if available

(env: LLAMA_ARG_LOAD_MODE) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -135,7 +135,7 @@ | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | diff --git a/tools/completion/README.md b/tools/completion/README.md index 9a9cd4287e..73c58703e5 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -142,7 +142,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `--mlock` | DEPRECATED: force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--mmap, --no-mmap` | DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_MMAP) | | `-dio, --direct-io, -ndio, --no-direct-io` | DEPRECATED: use DirectIO if available
(env: LLAMA_ARG_DIO) | -| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- mlock: force system to keep model in RAM rather than swapping or compressing.
- mmap: memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
- dio: use DirectIO if available.

(env: LLAMA_ARG_LOAD_MODE) | +| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- none: no special loading mode
- mmap: memory-map model (if mmap disabled, slower load but may reduce pageouts if not using mlock)
- mlock: force system to keep model in RAM rather than swapping or compressing
- dio: use DirectIO if available

(env: LLAMA_ARG_LOAD_MODE) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -218,7 +218,7 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | diff --git a/tools/server/README.md b/tools/server/README.md index 12cbdbde79..a851d6eae3 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -76,7 +76,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--mlock` | DEPRECATED: force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | | `--mmap, --no-mmap` | DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_MMAP) | | `-dio, --direct-io, -ndio, --no-direct-io` | DEPRECATED: use DirectIO if available
(env: LLAMA_ARG_DIO) | -| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- mlock: force system to keep model in RAM rather than swapping or compressing.
- mmap: memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)
- dio: use DirectIO if available.

(env: LLAMA_ARG_LOAD_MODE) | +| `-lm, --load-mode MODE` | model loading mode (default: mmap)
- none: no special loading mode
- mmap: memory-map model (if mmap disabled, slower load but may reduce pageouts if not using mlock)
- mlock: force system to keep model in RAM rather than swapping or compressing
- dio: use DirectIO if available

(env: LLAMA_ARG_LOAD_MODE) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -152,7 +152,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--mirostat-lr N` | Mirostat learning rate, parameter eta (default: 0.10) | | `--mirostat-ent N` | Mirostat target entropy, parameter tau (default: 5.00) | | `-l, --logit-bias TOKEN_ID(+/-)BIAS` | modifies the likelihood of token appearing in the completion,
i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',
or `--logit-bias 15043-1` to decrease likelihood of token ' Hello' | -| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') | +| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) | | `--grammar-file FNAME` | file to read grammar from | | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | | `-jf, --json-schema-file FILE` | File containing a JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object
For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead | @@ -238,7 +238,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | -| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) | +| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none)

(env: LLAMA_ARG_SPEC_TYPE) | | `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) | | `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) | | `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) | From e777916d2f8744fe1dea3a6f6b185be5926d872a Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 22 Mar 2026 22:57:45 +0800 Subject: [PATCH 09/11] chore: clean up refactor Signed-off-by: Aaron Teo --- common/common.cpp | 3 -- common/common.h | 3 -- examples/diffusion/diffusion-cli.cpp | 3 -- examples/training/finetune.cpp | 1 - include/llama.h | 8 +++--- src/llama-model-loader.cpp | 16 ----------- src/llama-model-loader.h | 2 -- src/llama-quant.cpp | 2 -- src/llama.cpp | 3 -- tests/test-model-load-cancel.cpp | 1 - tests/test-quantize-stats.cpp | 4 ++- tools/llama-bench/README.md | 2 +- tools/llama-bench/llama-bench.cpp | 43 ++-------------------------- 13 files changed, 10 insertions(+), 81 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index cd7f87ed93..f5c6163235 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1320,9 +1320,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; - // mparams.use_mmap = params.use_mmap; - // mparams.use_direct_io = params.use_direct_io; - // mparams.use_mlock = params.use_mlock; mparams.load_mode = params.load_mode; mparams.check_tensors = params.check_tensors; mparams.use_extra_bufts = !params.no_extra_bufts; diff --git a/common/common.h b/common/common.h index 32b894ffeb..ddcb510e10 100644 --- a/common/common.h +++ b/common/common.h @@ -533,9 +533,6 @@ struct common_params { bool kv_unified = false; // enable unified KV cache bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix - // bool use_mmap = true; // enable mmap to use filesystem cache - // bool use_direct_io = false; // read from disk without buffering - // bool use_mlock = false; // use mlock to keep model in memory bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool no_kv_offload = false; // disable KV offloading diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index a6280326dd..061db8899a 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -556,9 +556,6 @@ int main(int argc, char ** argv) { model_params.n_gpu_layers = params.n_gpu_layers; model_params.devices = params.devices.data(); model_params.load_mode = params.load_mode; - // model_params.use_mmap = params.use_mmap; - // model_params.use_direct_io = params.use_direct_io; - // model_params.use_mlock = params.use_mlock; model_params.check_tensors = params.check_tensors; llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp index 224961cd64..4b71f2006e 100644 --- a/examples/training/finetune.cpp +++ b/examples/training/finetune.cpp @@ -27,7 +27,6 @@ int main(int argc, char ** argv) { if (params.load_mode == LLAMA_LOAD_MODE_MMAP) { LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__); - // params.use_mmap = false; 
params.load_mode = LLAMA_LOAD_MODE_NONE; } if (params.cache_type_k != GGML_TYPE_F32) { diff --git a/include/llama.h b/include/llama.h index 81921ff5e1..e22959cd8e 100644 --- a/include/llama.h +++ b/include/llama.h @@ -197,10 +197,10 @@ extern "C" { }; enum llama_load_mode { - LLAMA_LOAD_MODE_NONE = 0, - LLAMA_LOAD_MODE_MMAP = 1, - LLAMA_LOAD_MODE_MLOCK = 2, - LLAMA_LOAD_MODE_DIRECT_IO = 3, + LLAMA_LOAD_MODE_NONE = 0, // no special loading mode + LLAMA_LOAD_MODE_MMAP = 1, // memory map the model + LLAMA_LOAD_MODE_MLOCK = 2, // force system to keep model in RAM rather than swapping or compressing + LLAMA_LOAD_MODE_DIRECT_IO = 3, // use direct I/O if available }; // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 62b6331265..119f826766 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -513,8 +513,6 @@ llama_model_loader::llama_model_loader( const std::string & fname, std::vector & splits, llama_load_mode load_mode, - // bool use_mmap, - // bool use_direct_io, bool check_tensors, bool no_alloc, const llama_model_kv_override * param_overrides_p, @@ -553,20 +551,6 @@ llama_model_loader::llama_model_loader( files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io)); contexts.emplace_back(ctx); - // if (use_mmap && use_direct_io) { - // if (files.back()->has_direct_io()) { - // LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__); - // use_mmap = false; - // } else { - // LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__); - // use_direct_io = false; - - // // reopen file using std::fopen for mmap - // files.pop_back(); - // files.emplace_back(new llama_file(fname.c_str(), "rb", false)); - // } - // } - // Save tensors data offset of the main file. // For subsidiary files, `meta` tensor data offset must not be used, // so we build a unified tensors index for weights. diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index e82caf5870..512ea715b1 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -126,8 +126,6 @@ struct llama_model_loader { const std::string & fname, std::vector & splits, // optional, only need if the split does not follow naming scheme enum llama_load_mode load_mode, - // bool use_mmap, - // bool use_direct_io, bool check_tensors, bool no_alloc, const llama_model_kv_override * param_overrides_p, diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 40dcf15ac1..a122baa4da 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -846,11 +846,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // mmap consistently increases speed on Linux, and also increases speed on Windows with // hot cache. It may cause a slowdown on macOS, possibly related to free memory. 
#if defined(__linux__) || defined(_WIN32) - // constexpr bool use_mmap = true; constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_MMAP; #else constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_NONE; - // constexpr bool use_mmap = false; #endif llama_model_kv_override * kv_overrides = nullptr; diff --git a/src/llama.cpp b/src/llama.cpp index 423765b108..21615f98d7 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -75,8 +75,6 @@ static std::vector llama_get_device_memory_data( llama_model_params mparams_copy = *mparams; mparams_copy.no_alloc = true; mparams_copy.load_mode = LLAMA_LOAD_MODE_NONE; - // mparams_copy.use_mmap = false; - // mparams_copy.use_mlock = false; llama_model * model = llama_model_load_from_file(path_model, mparams_copy); if (model == nullptr) { @@ -1036,7 +1034,6 @@ struct llama_model * llama_model_init_from_user( GGML_ASSERT(metadata != nullptr); std::string path_model; std::vector splits = {}; - // params.use_mmap = false; params.load_mode = LLAMA_LOAD_MODE_NONE; params.use_extra_bufts = false; return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params); diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp index 7a13886c32..ecc3022711 100644 --- a/tests/test-model-load-cancel.cpp +++ b/tests/test-model-load-cancel.cpp @@ -16,7 +16,6 @@ int main(int argc, char *argv[] ) { llama_backend_init(); auto params = llama_model_params{}; - // params.use_mmap = false; params.load_mode = LLAMA_LOAD_MODE_NONE; params.progress_callback = [](float progress, void * ctx){ (void) ctx; diff --git a/tests/test-quantize-stats.cpp b/tests/test-quantize-stats.cpp index dd6374e4d0..f2e736131f 100644 --- a/tests/test-quantize-stats.cpp +++ b/tests/test-quantize-stats.cpp @@ -309,7 +309,9 @@ int main(int argc, char ** argv) { { auto mparams = llama_model_default_params(); - // mparams.use_mlock = false; + if (mparams.load_mode == LLAMA_LOAD_MODE_MLOCK) { + mparams.load_mode = LLAMA_LOAD_MODE_MMAP; + } model = llama_model_load_from_file(params.model.c_str(), mparams); diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index bd6d11a19e..61eba4b3c0 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -20,7 +20,7 @@ Performance testing tool for llama.cpp. 
## Syntax ``` -usage: build/bin/llama-bench [options] +usage: llama-bench [options] options: -h, --help diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index ec233fd01b..b4d06a26e0 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -354,8 +354,6 @@ struct cmd_params { std::vector> devices; std::vector> tensor_split; std::vector> tensor_buft_overrides; - // std::vector use_mmap; - // std::vector use_direct_io; std::vector embeddings; std::vector no_op_offload; std::vector no_host; @@ -397,8 +395,6 @@ static const cmd_params cmd_params_defaults = { /* devices */ { {} }, /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, /* tensor_buft_overrides*/ { std::vector{ { nullptr, nullptr } } }, - // /* use_mmap */ { true }, - // /* use_direct_io */ { false }, /* embeddings */ { false }, /* no_op_offload */ { false }, /* no_host */ { false }, @@ -1116,12 +1112,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.tensor_buft_overrides.empty()) { params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides; } - // if (params.use_mmap.empty()) { - // params.use_mmap = cmd_params_defaults.use_mmap; - // } - // if (params.use_direct_io.empty()) { - // params.use_direct_io = cmd_params_defaults.use_direct_io; - // } if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } @@ -1170,8 +1160,6 @@ struct cmd_params_instance { std::vector devices; std::vector tensor_split; std::vector tensor_buft_overrides; - // bool use_mmap; - // bool use_direct_io; bool embeddings; bool no_op_offload; bool no_host; @@ -1187,8 +1175,6 @@ struct cmd_params_instance { mparams.load_mode = load_mode; mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); - // mparams.use_mmap = use_mmap; - // mparams.use_direct_io = use_direct_io; mparams.no_host = no_host; if (n_cpu_moe <= 0) { @@ -1234,10 +1220,7 @@ struct cmd_params_instance { return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe && split_mode == other.split_mode && main_gpu == other.main_gpu && tensor_split == other.tensor_split && - load_mode == other.load_mode && - // use_mmap == other.use_mmap && use_direct_io == other.use_direct_io && - devices == other.devices && - no_host == other.no_host && + load_mode == other.load_mode && devices == other.devices && no_host == other.no_host && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); } @@ -1273,8 +1256,6 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & devs : params.devices) for (const auto & ts : params.tensor_split) for (const auto & ot : params.tensor_buft_overrides) - // for (const auto & mmp : params.use_mmap) - // for (const auto & dio : params.use_direct_io) for (const auto & noh : params.no_host) for (const auto & embd : params.embeddings) for (const auto & nopo : params.no_op_offload) @@ -1352,8 +1333,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .devices = */ devs, /* .tensor_split = */ ts, /* .tensor_buft_overrides = */ ot, - // /* .use_mmap = */ mmp, - // /* .use_direct_io= */ dio, /* .embeddings = */ embd, /* .no_op_offload= */ nopo, /* .no_host = */ noh, @@ -1388,8 +1367,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .devices = */ devs, /* .tensor_split = */ ts, /* .tensor_buft_overrides = */ ot, - // /* .use_mmap = */ mmp, - // /* .use_direct_io= */ dio, /* .embeddings = */ embd, /* 
.no_op_offload= */ nopo, /* .no_host = */ noh, @@ -1468,8 +1445,6 @@ struct test { devices = inst.devices; tensor_split = inst.tensor_split; tensor_buft_overrides = inst.tensor_buft_overrides; - // use_mmap = inst.use_mmap; - // use_direct_io = inst.use_direct_io; embeddings = inst.embeddings; no_op_offload = inst.no_op_offload; no_host = inst.no_host; @@ -1529,7 +1504,7 @@ struct test { "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split", - "tensor_buft_overrides", "load_mode", "embeddings", + "tensor_buft_overrides", "load_mode", "embeddings", "no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" }; @@ -1621,8 +1596,6 @@ struct test { devices_to_string(devices), tensor_split_str, tensor_buft_overrides_str, - // std::to_string(use_mmap), - // std::to_string(use_direct_io), load_mode_str(load_mode), std::to_string(embeddings), std::to_string(no_op_offload), @@ -1849,12 +1822,6 @@ struct markdown_printer : public printer { if (field == "flash_attn") { return "fa"; } - // if (field == "use_mmap") { - // return "mmap"; - // } - // if (field == "use_direct_io") { - // return "dio"; - // } if (field == "load_mode") { return "lm"; } @@ -1942,12 +1909,6 @@ struct markdown_printer : public printer { if (params.load_mode.size() > 1 || params.load_mode != cmd_params_defaults.load_mode) { fields.emplace_back("load_mode"); } - // if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { - // fields.emplace_back("use_mmap"); - // } - // if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) { - // fields.emplace_back("use_direct_io"); - // } if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) { fields.emplace_back("embeddings"); } From 77aded2bdcd6b3078616e87d8e7bf29702eb5973 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 22 Mar 2026 23:28:30 +0800 Subject: [PATCH 10/11] args: lessen the blow of the deprecation Signed-off-by: Aaron Teo --- common/arg.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 5281ad62ef..aaf0371384 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2208,9 +2208,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--mlock"}, "DEPRECATED: force system to keep model in RAM rather than swapping or compressing", [](common_params & params) { - throw std::invalid_argument("--mlock is deprecated. use --load-mode mlock instead"); - - GGML_UNUSED(params); + LOG_WRN("DEPRECATED: --mlock is deprecated. use --load-mode mlock instead"); + params.load_mode = LLAMA_LOAD_MODE_MLOCK; } ).set_env("LLAMA_ARG_MLOCK")); add_opt(common_arg( @@ -2218,10 +2217,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--no-mmap"}, "DEPRECATED: whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)", [](common_params & params, bool value) { - throw std::invalid_argument("--mmap and --no-mmap are deprecated. use --load-mode mmap instead"); - - GGML_UNUSED(params); - GGML_UNUSED(value); + LOG_WRN("DEPRECATED: --mmap and --no-mmap are deprecated. use --load-mode mmap instead"); + params.load_mode = value ? 
LLAMA_LOAD_MODE_MMAP : LLAMA_LOAD_MODE_NONE; } ).set_env("LLAMA_ARG_MMAP")); add_opt(common_arg( @@ -2229,10 +2226,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-ndio", "--no-direct-io"}, "DEPRECATED: use DirectIO if available", [](common_params & params, bool value) { - throw std::invalid_argument("-dio/--direct-io and -ndio/--no-direct-io are deprecated. use --load-mode dio instead"); - - GGML_UNUSED(params); - GGML_UNUSED(value); + LOG_WRN("DEPRECATED: -dio/--direct-io and -ndio/--no-direct-io are deprecated. use --load-mode dio instead"); + params.load_mode = value ? LLAMA_LOAD_MODE_DIRECT_IO : LLAMA_LOAD_MODE_NONE; } ).set_env("LLAMA_ARG_DIO")); add_opt(common_arg( From 2d1d26c1ca800eb4491fc196ef1dfc24c0a5f088 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Sun, 22 Mar 2026 23:28:44 +0800 Subject: [PATCH 11/11] tests: clean up Signed-off-by: Aaron Teo --- tests/test-arg-parser.cpp | 40 ++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index ee13a3e5cb..8931e3b1de 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -98,10 +98,8 @@ int main(void) { argv = {"binary_name", "--draft", "123"}; assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_EMBEDDING)); - // negated arg - // argv = {"binary_name", "--no-mmap"}; - // assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - + argv = {"binary_name", "-lm", "hello"}; + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); printf("test-arg-parser: test valid usage\n\n"); @@ -128,6 +126,22 @@ int main(void) { assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); assert(params.speculative.n_max == 123); + argv = {"binary_name", "-lm", "none"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_NONE); + + argv = {"binary_name", "-lm", "mmap"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_MMAP); + + argv = {"binary_name", "-lm", "mlock"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_MLOCK); + + argv = {"binary_name", "-lm", "dio"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_DIRECT_IO); + // multi-value args (CSV) argv = {"binary_name", "--lora", "file1.gguf,\"file2,2.gguf\",\"file3\"\"3\"\".gguf\",file4\".gguf"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); @@ -154,7 +168,9 @@ int main(void) { assert(params.model.path == "blah.gguf"); assert(params.cpuparams.n_threads == 1010); - printf("test-arg-parser: test negated environment variables\n\n"); + setenv("LLAMA_ARG_LOAD_MODE", "blah", true); + argv = {"binary_name"}; + assert(false == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); setenv("LLAMA_ARG_LOAD_MODE", "none", true); argv = {"binary_name"}; @@ -178,12 +194,14 @@ int main(void) { assert(true == 
common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.load_mode == LLAMA_LOAD_MODE_DIRECT_IO); - // setenv("LLAMA_ARG_MMAP", "0", true); - // setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format - // argv = {"binary_name"}; - // assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - // assert(params.use_mmap == false); - // assert(params.no_perf == true); + printf("test-arg-parser: test negated environment variables\n\n"); + + setenv("LLAMA_ARG_LOAD_MODE", "none", true); + setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.load_mode == LLAMA_LOAD_MODE_NONE); + assert(params.no_perf == true); printf("test-arg-parser: test environment variables being overwritten\n\n");