diff --git a/common/common.cpp b/common/common.cpp
index cd7f87ed93..f5c6163235 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1320,9 +1320,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
-    // mparams.use_mmap = params.use_mmap;
-    // mparams.use_direct_io = params.use_direct_io;
-    // mparams.use_mlock = params.use_mlock;
     mparams.load_mode = params.load_mode;
     mparams.check_tensors = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
diff --git a/common/common.h b/common/common.h
index 32b894ffeb..ddcb510e10 100644
--- a/common/common.h
+++ b/common/common.h
@@ -533,9 +533,6 @@ struct common_params {
     bool kv_unified = false; // enable unified KV cache
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
-    // bool use_mmap = true; // enable mmap to use filesystem cache
-    // bool use_direct_io = false; // read from disk without buffering
-    // bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
     bool no_kv_offload = false; // disable KV offloading
diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp
index a6280326dd..061db8899a 100644
--- a/examples/diffusion/diffusion-cli.cpp
+++ b/examples/diffusion/diffusion-cli.cpp
@@ -556,9 +556,6 @@ int main(int argc, char ** argv) {
     model_params.n_gpu_layers = params.n_gpu_layers;
     model_params.devices = params.devices.data();
     model_params.load_mode = params.load_mode;
-    // model_params.use_mmap = params.use_mmap;
-    // model_params.use_direct_io = params.use_direct_io;
-    // model_params.use_mlock = params.use_mlock;
     model_params.check_tensors = params.check_tensors;

     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
diff --git a/examples/training/finetune.cpp b/examples/training/finetune.cpp
index 224961cd64..4b71f2006e 100644
--- a/examples/training/finetune.cpp
+++ b/examples/training/finetune.cpp
@@ -27,7 +27,6 @@ int main(int argc, char ** argv) {
     if (params.load_mode == LLAMA_LOAD_MODE_MMAP) {
         LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
-        // params.use_mmap = false;
         params.load_mode = LLAMA_LOAD_MODE_NONE;
     }
     if (params.cache_type_k != GGML_TYPE_F32) {
diff --git a/include/llama.h b/include/llama.h
index 81921ff5e1..e22959cd8e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -197,10 +197,10 @@ extern "C" {
     };

     enum llama_load_mode {
-        LLAMA_LOAD_MODE_NONE      = 0,
-        LLAMA_LOAD_MODE_MMAP      = 1,
-        LLAMA_LOAD_MODE_MLOCK     = 2,
-        LLAMA_LOAD_MODE_DIRECT_IO = 3,
+        LLAMA_LOAD_MODE_NONE      = 0, // no special loading mode
+        LLAMA_LOAD_MODE_MMAP      = 1, // memory map the model
+        LLAMA_LOAD_MODE_MLOCK     = 2, // force system to keep model in RAM rather than swapping or compressing
+        LLAMA_LOAD_MODE_DIRECT_IO = 3, // use direct I/O if available
     };

     // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 62b6331265..119f826766 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -513,8 +513,6 @@ llama_model_loader::llama_model_loader(
         const std::string & fname,
         std::vector<std::string> & splits,
         llama_load_mode load_mode,
-        // bool use_mmap,
-        // bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
@@ -553,20 +551,6 @@ llama_model_loader::llama_model_loader(
         files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
         contexts.emplace_back(ctx);

-        // if (use_mmap && use_direct_io) {
-        //     if (files.back()->has_direct_io()) {
-        //         LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
-        //         use_mmap = false;
-        //     } else {
-        //         LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
-        //         use_direct_io = false;
-
-        //         // reopen file using std::fopen for mmap
-        //         files.pop_back();
-        //         files.emplace_back(new llama_file(fname.c_str(), "rb", false));
-        //     }
-        // }
-
         // Save tensors data offset of the main file.
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
index e82caf5870..512ea715b1 100644
--- a/src/llama-model-loader.h
+++ b/src/llama-model-loader.h
@@ -126,8 +126,6 @@ struct llama_model_loader {
         const std::string & fname,
         std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
         enum llama_load_mode load_mode,
-        // bool use_mmap,
-        // bool use_direct_io,
         bool check_tensors,
         bool no_alloc,
         const llama_model_kv_override * param_overrides_p,
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 40dcf15ac1..a122baa4da 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -846,11 +846,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // mmap consistently increases speed on Linux, and also increases speed on Windows with
     // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
 #if defined(__linux__) || defined(_WIN32)
-    // constexpr bool use_mmap = true;
     constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_MMAP;
 #else
     constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_NONE;
-    // constexpr bool use_mmap = false;
 #endif

     llama_model_kv_override * kv_overrides = nullptr;
diff --git a/src/llama.cpp b/src/llama.cpp
index 423765b108..21615f98d7 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -75,8 +75,6 @@ static std::vector llama_get_device_memory_data(
     llama_model_params mparams_copy = *mparams;
     mparams_copy.no_alloc = true;
     mparams_copy.load_mode = LLAMA_LOAD_MODE_NONE;
-    // mparams_copy.use_mmap = false;
-    // mparams_copy.use_mlock = false;

     llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
     if (model == nullptr) {
@@ -1036,7 +1034,6 @@ struct llama_model * llama_model_init_from_user(
     GGML_ASSERT(metadata != nullptr);
     std::string path_model;
     std::vector<std::string> splits = {};
-    // params.use_mmap = false;
     params.load_mode = LLAMA_LOAD_MODE_NONE;
     params.use_extra_bufts = false;
     return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp
index 7a13886c32..ecc3022711 100644
--- a/tests/test-model-load-cancel.cpp
+++ b/tests/test-model-load-cancel.cpp
@@ -16,7 +16,6 @@ int main(int argc, char *argv[] ) {
     llama_backend_init();

     auto params = llama_model_params{};
-    // params.use_mmap = false;
     params.load_mode = LLAMA_LOAD_MODE_NONE;
     params.progress_callback = [](float progress, void * ctx){
         (void) ctx;
diff --git a/tests/test-quantize-stats.cpp b/tests/test-quantize-stats.cpp
index dd6374e4d0..f2e736131f 100644
--- a/tests/test-quantize-stats.cpp
+++ b/tests/test-quantize-stats.cpp
@@ -309,7 +309,9 @@ int main(int argc, char ** argv) {
     {
         auto mparams = llama_model_default_params();
-        // mparams.use_mlock = false;
+        if (mparams.load_mode == LLAMA_LOAD_MODE_MLOCK) {
+            mparams.load_mode = LLAMA_LOAD_MODE_MMAP;
+        }

         model = llama_model_load_from_file(params.model.c_str(), mparams);
diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md
index bd6d11a19e..61eba4b3c0 100644
--- a/tools/llama-bench/README.md
+++ b/tools/llama-bench/README.md
@@ -20,7 +20,7 @@ Performance testing tool for llama.cpp.
 ## Syntax

 ```
-usage: build/bin/llama-bench [options]
+usage: llama-bench [options]

 options:
   -h, --help
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index ec233fd01b..b4d06a26e0 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -354,8 +354,6 @@ struct cmd_params {
    std::vector<std::vector<ggml_backend_dev_t>> devices;
    std::vector<std::vector<float>> tensor_split;
    std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
-    // std::vector<bool> use_mmap;
-    // std::vector<bool> use_direct_io;
    std::vector<bool> embeddings;
    std::vector<bool> no_op_offload;
    std::vector<bool> no_host;
@@ -397,8 +395,6 @@ static const cmd_params cmd_params_defaults = {
     /* devices              */ { {} },
     /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
     /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
-    // /* use_mmap           */ { true },
-    // /* use_direct_io      */ { false },
     /* embeddings           */ { false },
     /* no_op_offload        */ { false },
     /* no_host              */ { false },
@@ -1116,12 +1112,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.tensor_buft_overrides.empty()) {
         params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
     }
-    // if (params.use_mmap.empty()) {
-    //     params.use_mmap = cmd_params_defaults.use_mmap;
-    // }
-    // if (params.use_direct_io.empty()) {
-    //     params.use_direct_io = cmd_params_defaults.use_direct_io;
-    // }
     if (params.embeddings.empty()) {
         params.embeddings = cmd_params_defaults.embeddings;
     }
@@ -1170,8 +1160,6 @@ struct cmd_params_instance {
     std::vector<ggml_backend_dev_t> devices;
     std::vector<float> tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
-    // bool use_mmap;
-    // bool use_direct_io;
     bool embeddings;
     bool no_op_offload;
     bool no_host;
@@ -1187,8 +1175,6 @@ struct cmd_params_instance {
         mparams.load_mode = load_mode;
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();
-        // mparams.use_mmap = use_mmap;
-        // mparams.use_direct_io = use_direct_io;
         mparams.no_host = no_host;

         if (n_cpu_moe <= 0) {
@@ -1234,10 +1220,7 @@
         return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
                split_mode == other.split_mode && main_gpu == other.main_gpu && tensor_split == other.tensor_split &&
-               load_mode == other.load_mode &&
-               // use_mmap == other.use_mmap && use_direct_io == other.use_direct_io &&
-               devices == other.devices &&
-               no_host == other.no_host &&
+               load_mode == other.load_mode && devices == other.devices && no_host == other.no_host &&
                vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
@@ -1273,8 +1256,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & devs : params.devices)
     for (const auto & ts : params.tensor_split)
     for (const auto & ot : params.tensor_buft_overrides)
-    // for (const auto & mmp : params.use_mmap)
-    // for (const auto & dio : params.use_direct_io)
     for (const auto & noh : params.no_host)
     for (const auto & embd : params.embeddings)
     for (const auto & nopo : params.no_op_offload)
@@ -1352,8 +1333,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .devices       = */ devs,
             /* .tensor_split  = */ ts,
             /* .tensor_buft_overrides = */ ot,
-            // /* .use_mmap     = */ mmp,
-            // /* .use_direct_io= */ dio,
             /* .embeddings    = */ embd,
             /* .no_op_offload= */ nopo,
             /* .no_host       = */ noh,
@@ -1388,8 +1367,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .devices       = */ devs,
             /* .tensor_split  = */ ts,
             /* .tensor_buft_overrides = */ ot,
-            // /* .use_mmap     = */ mmp,
-            // /* .use_direct_io= */ dio,
             /* .embeddings    = */ embd,
             /* .no_op_offload= */ nopo,
             /* .no_host       = */ noh,
@@ -1468,8 +1445,6 @@ struct test {
         devices = inst.devices;
         tensor_split = inst.tensor_split;
         tensor_buft_overrides = inst.tensor_buft_overrides;
-        // use_mmap = inst.use_mmap;
-        // use_direct_io = inst.use_direct_io;
         embeddings = inst.embeddings;
         no_op_offload = inst.no_op_offload;
         no_host = inst.no_host;
@@ -1529,7 +1504,7 @@
             "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
             "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
-            "tensor_buft_overrides", "load_mode",     "embeddings",
+            "tensor_buft_overrides", "load_mode", "embeddings",
             "no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth",
             "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
         };
@@ -1621,8 +1596,6 @@ struct test {
             devices_to_string(devices),
             tensor_split_str,
             tensor_buft_overrides_str,
-            // std::to_string(use_mmap),
-            // std::to_string(use_direct_io),
             load_mode_str(load_mode),
             std::to_string(embeddings),
             std::to_string(no_op_offload),
@@ -1849,12 +1822,6 @@ struct markdown_printer : public printer {
         if (field == "flash_attn") {
             return "fa";
         }
-        // if (field == "use_mmap") {
-        //     return "mmap";
-        // }
-        // if (field == "use_direct_io") {
-        //     return "dio";
-        // }
         if (field == "load_mode") {
             return "lm";
         }
@@ -1942,12 +1909,6 @@
         if (params.load_mode.size() > 1 || params.load_mode != cmd_params_defaults.load_mode) {
             fields.emplace_back("load_mode");
         }
-        // if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
-        //     fields.emplace_back("use_mmap");
-        // }
-        // if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
-        //     fields.emplace_back("use_direct_io");
-        // }
         if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
             fields.emplace_back("embeddings");
         }
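---

For reviewers, a minimal sketch of the caller-side migration this patch implies. This is an illustrative standalone program, not part of the change; `model.gguf` is a placeholder path, and error handling is kept minimal. It uses only API that is visible in the diff (`load_mode`, the `LLAMA_LOAD_MODE_*` values) plus long-standing public entry points from llama.h:

```cpp
// Before this change, callers toggled independent booleans
// (use_mmap / use_mlock / use_direct_io); they now select
// exactly one llama_load_mode value.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    // was: mparams.use_mmap = true;
    mparams.load_mode = LLAMA_LOAD_MODE_MMAP;

    // "model.gguf" is a placeholder path for illustration
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        llama_backend_free();
        return 1;
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```

A side effect of the enum is that the modes become mutually exclusive by construction, which is what the deleted mmap/direct-I/O fallback block in src/llama-model-loader.cpp previously had to resolve at runtime.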