chore: clean up refactor

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
Aaron Teo 2026-03-22 22:57:45 +08:00
parent 21603f86dd
commit e777916d2f
13 changed files with 10 additions and 81 deletions

View File

@@ -1320,9 +1320,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
// mparams.use_mmap = params.use_mmap;
// mparams.use_direct_io = params.use_direct_io;
// mparams.use_mlock = params.use_mlock;
mparams.load_mode = params.load_mode;
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;

View File

@@ -533,9 +533,6 @@ struct common_params {
bool kv_unified = false; // enable unified KV cache
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
// bool use_mmap = true; // enable mmap to use filesystem cache
// bool use_direct_io = false; // read from disk without buffering
// bool use_mlock = false; // use mlock to keep model in memory
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool no_kv_offload = false; // disable KV offloading

View File

@@ -556,9 +556,6 @@ int main(int argc, char ** argv) {
model_params.n_gpu_layers = params.n_gpu_layers;
model_params.devices = params.devices.data();
model_params.load_mode = params.load_mode;
// model_params.use_mmap = params.use_mmap;
// model_params.use_direct_io = params.use_direct_io;
// model_params.use_mlock = params.use_mlock;
model_params.check_tensors = params.check_tensors;
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

View File

@@ -27,7 +27,6 @@ int main(int argc, char ** argv) {
if (params.load_mode == LLAMA_LOAD_MODE_MMAP) {
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n",
__func__);
// params.use_mmap = false;
params.load_mode = LLAMA_LOAD_MODE_NONE;
}
if (params.cache_type_k != GGML_TYPE_F32) {

View File

@@ -197,10 +197,10 @@ extern "C" {
};
enum llama_load_mode {
LLAMA_LOAD_MODE_NONE = 0,
LLAMA_LOAD_MODE_MMAP = 1,
LLAMA_LOAD_MODE_MLOCK = 2,
LLAMA_LOAD_MODE_DIRECT_IO = 3,
LLAMA_LOAD_MODE_NONE = 0, // no special loading mode
LLAMA_LOAD_MODE_MMAP = 1, // memory map the model
LLAMA_LOAD_MODE_MLOCK = 2, // force system to keep model in RAM rather than swapping or compressing
LLAMA_LOAD_MODE_DIRECT_IO = 3, // use direct I/O if available
};
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
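
For context, a minimal illustrative sketch of how a caller picks a loading strategy through the new `llama_load_mode` enum instead of the removed `use_mmap`/`use_direct_io`/`use_mlock` booleans. It only uses public API names that already appear elsewhere in this diff (`llama_model_default_params`, `llama_model_load_from_file`); the model path is a placeholder.

```cpp
// Sketch only: selecting a load mode via the new llama_load_mode enum.
#include "llama.h"

int main(void) {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    // One enum value replaces the old boolean trio:
    // LLAMA_LOAD_MODE_NONE, _MMAP, _MLOCK or _DIRECT_IO.
    mparams.load_mode = LLAMA_LOAD_MODE_MMAP;

    // "model.gguf" is a placeholder path.
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```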

View File

@@ -513,8 +513,6 @@ llama_model_loader::llama_model_loader(
const std::string & fname,
std::vector<std::string> & splits,
llama_load_mode load_mode,
// bool use_mmap,
// bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,
@@ -553,20 +551,6 @@ llama_model_loader::llama_model_loader(
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
contexts.emplace_back(ctx);
// if (use_mmap && use_direct_io) {
// if (files.back()->has_direct_io()) {
// LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
// use_mmap = false;
// } else {
// LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
// use_direct_io = false;
// // reopen file using std::fopen for mmap
// files.pop_back();
// files.emplace_back(new llama_file(fname.c_str(), "rb", false));
// }
// }
// Save tensors data offset of the main file.
// For subsidiary files, `meta` tensor data offset must not be used,
// so we build a unified tensors index for weights.

View File

@@ -126,8 +126,6 @@ struct llama_model_loader {
const std::string & fname,
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
enum llama_load_mode load_mode,
// bool use_mmap,
// bool use_direct_io,
bool check_tensors,
bool no_alloc,
const llama_model_kv_override * param_overrides_p,

View File

@@ -846,11 +846,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// mmap consistently increases speed on Linux, and also increases speed on Windows with
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
// constexpr bool use_mmap = true;
constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_MMAP;
#else
constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_NONE;
// constexpr bool use_mmap = false;
#endif
llama_model_kv_override * kv_overrides = nullptr;

View File

@@ -75,8 +75,6 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
llama_model_params mparams_copy = *mparams;
mparams_copy.no_alloc = true;
mparams_copy.load_mode = LLAMA_LOAD_MODE_NONE;
// mparams_copy.use_mmap = false;
// mparams_copy.use_mlock = false;
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
if (model == nullptr) {
@@ -1036,7 +1034,6 @@ struct llama_model * llama_model_init_from_user(
GGML_ASSERT(metadata != nullptr);
std::string path_model;
std::vector<std::string> splits = {};
// params.use_mmap = false;
params.load_mode = LLAMA_LOAD_MODE_NONE;
params.use_extra_bufts = false;
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);

View File

@@ -16,7 +16,6 @@ int main(int argc, char *argv[] ) {
llama_backend_init();
auto params = llama_model_params{};
// params.use_mmap = false;
params.load_mode = LLAMA_LOAD_MODE_NONE;
params.progress_callback = [](float progress, void * ctx){
(void) ctx;

View File

@@ -309,7 +309,9 @@ int main(int argc, char ** argv) {
{
auto mparams = llama_model_default_params();
// mparams.use_mlock = false;
if (mparams.load_mode == LLAMA_LOAD_MODE_MLOCK) {
mparams.load_mode = LLAMA_LOAD_MODE_MMAP;
}
model = llama_model_load_from_file(params.model.c_str(), mparams);

View File

@@ -20,7 +20,7 @@ Performance testing tool for llama.cpp.
## Syntax
```
usage: build/bin/llama-bench [options]
usage: llama-bench [options]
options:
-h, --help

View File

@@ -354,8 +354,6 @@ struct cmd_params {
std::vector<std::vector<ggml_backend_dev_t>> devices;
std::vector<std::vector<float>> tensor_split;
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
// std::vector<bool> use_mmap;
// std::vector<bool> use_direct_io;
std::vector<bool> embeddings;
std::vector<bool> no_op_offload;
std::vector<bool> no_host;
@@ -397,8 +395,6 @@ static const cmd_params cmd_params_defaults = {
/* devices */ { {} },
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
// /* use_mmap */ { true },
// /* use_direct_io */ { false },
/* embeddings */ { false },
/* no_op_offload */ { false },
/* no_host */ { false },
@@ -1116,12 +1112,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.tensor_buft_overrides.empty()) {
params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
}
// if (params.use_mmap.empty()) {
// params.use_mmap = cmd_params_defaults.use_mmap;
// }
// if (params.use_direct_io.empty()) {
// params.use_direct_io = cmd_params_defaults.use_direct_io;
// }
if (params.embeddings.empty()) {
params.embeddings = cmd_params_defaults.embeddings;
}
@@ -1170,8 +1160,6 @@ struct cmd_params_instance {
std::vector<ggml_backend_dev_t> devices;
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
// bool use_mmap;
// bool use_direct_io;
bool embeddings;
bool no_op_offload;
bool no_host;
@@ -1187,8 +1175,6 @@ struct cmd_params_instance {
mparams.load_mode = load_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
// mparams.use_mmap = use_mmap;
// mparams.use_direct_io = use_direct_io;
mparams.no_host = no_host;
if (n_cpu_moe <= 0) {
@@ -1234,10 +1220,7 @@ struct cmd_params_instance {
return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
split_mode == other.split_mode &&
main_gpu == other.main_gpu && tensor_split == other.tensor_split &&
load_mode == other.load_mode &&
// use_mmap == other.use_mmap && use_direct_io == other.use_direct_io &&
devices == other.devices &&
no_host == other.no_host &&
load_mode == other.load_mode && devices == other.devices && no_host == other.no_host &&
vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
}
@@ -1273,8 +1256,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & devs : params.devices)
for (const auto & ts : params.tensor_split)
for (const auto & ot : params.tensor_buft_overrides)
// for (const auto & mmp : params.use_mmap)
// for (const auto & dio : params.use_direct_io)
for (const auto & noh : params.no_host)
for (const auto & embd : params.embeddings)
for (const auto & nopo : params.no_op_offload)
@@ -1352,8 +1333,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .devices = */ devs,
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
// /* .use_mmap = */ mmp,
// /* .use_direct_io= */ dio,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .no_host = */ noh,
@@ -1388,8 +1367,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .devices = */ devs,
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
// /* .use_mmap = */ mmp,
// /* .use_direct_io= */ dio,
/* .embeddings = */ embd,
/* .no_op_offload= */ nopo,
/* .no_host = */ noh,
@@ -1468,8 +1445,6 @@ struct test {
devices = inst.devices;
tensor_split = inst.tensor_split;
tensor_buft_overrides = inst.tensor_buft_overrides;
// use_mmap = inst.use_mmap;
// use_direct_io = inst.use_direct_io;
embeddings = inst.embeddings;
no_op_offload = inst.no_op_offload;
no_host = inst.no_host;
@@ -1529,7 +1504,7 @@ struct test {
"n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
"tensor_buft_overrides", "load_mode", "embeddings",
"tensor_buft_overrides", "load_mode", "embeddings",
"no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth",
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
};
@@ -1621,8 +1596,6 @@ struct test {
devices_to_string(devices),
tensor_split_str,
tensor_buft_overrides_str,
// std::to_string(use_mmap),
// std::to_string(use_direct_io),
load_mode_str(load_mode),
std::to_string(embeddings),
std::to_string(no_op_offload),
@@ -1849,12 +1822,6 @@ struct markdown_printer : public printer {
if (field == "flash_attn") {
return "fa";
}
// if (field == "use_mmap") {
// return "mmap";
// }
// if (field == "use_direct_io") {
// return "dio";
// }
if (field == "load_mode") {
return "lm";
}
@@ -1942,12 +1909,6 @@ struct markdown_printer : public printer {
if (params.load_mode.size() > 1 || params.load_mode != cmd_params_defaults.load_mode) {
fields.emplace_back("load_mode");
}
// if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
// fields.emplace_back("use_mmap");
// }
// if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
// fields.emplace_back("use_direct_io");
// }
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
fields.emplace_back("embeddings");
}