chore: clean up refactor
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
This commit is contained in:
parent
21603f86dd
commit
e777916d2f
|
|
@ -1320,9 +1320,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
|||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
// mparams.use_mmap = params.use_mmap;
|
||||
// mparams.use_direct_io = params.use_direct_io;
|
||||
// mparams.use_mlock = params.use_mlock;
|
||||
mparams.load_mode = params.load_mode;
|
||||
mparams.check_tensors = params.check_tensors;
|
||||
mparams.use_extra_bufts = !params.no_extra_bufts;
|
||||
|
|
|
|||
|
|
@ -533,9 +533,6 @@ struct common_params {
|
|||
bool kv_unified = false; // enable unified KV cache
|
||||
|
||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||
// bool use_mmap = true; // enable mmap to use filesystem cache
|
||||
// bool use_direct_io = false; // read from disk without buffering
|
||||
// bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
bool display_prompt = true; // print prompt before generation
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
|
|
|
|||
|
|
@ -556,9 +556,6 @@ int main(int argc, char ** argv) {
|
|||
model_params.n_gpu_layers = params.n_gpu_layers;
|
||||
model_params.devices = params.devices.data();
|
||||
model_params.load_mode = params.load_mode;
|
||||
// model_params.use_mmap = params.use_mmap;
|
||||
// model_params.use_direct_io = params.use_direct_io;
|
||||
// model_params.use_mlock = params.use_mlock;
|
||||
model_params.check_tensors = params.check_tensors;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ int main(int argc, char ** argv) {
|
|||
if (params.load_mode == LLAMA_LOAD_MODE_MMAP) {
|
||||
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n",
|
||||
__func__);
|
||||
// params.use_mmap = false;
|
||||
params.load_mode = LLAMA_LOAD_MODE_NONE;
|
||||
}
|
||||
if (params.cache_type_k != GGML_TYPE_F32) {
|
||||
|
|
|
|||
|
|
@ -197,10 +197,10 @@ extern "C" {
|
|||
};
|
||||
|
||||
enum llama_load_mode {
|
||||
LLAMA_LOAD_MODE_NONE = 0,
|
||||
LLAMA_LOAD_MODE_MMAP = 1,
|
||||
LLAMA_LOAD_MODE_MLOCK = 2,
|
||||
LLAMA_LOAD_MODE_DIRECT_IO = 3,
|
||||
LLAMA_LOAD_MODE_NONE = 0, // no special loading mode
|
||||
LLAMA_LOAD_MODE_MMAP = 1, // memory map the model
|
||||
LLAMA_LOAD_MODE_MLOCK = 2, // force system to keep model in RAM rather than swapping or compressing
|
||||
LLAMA_LOAD_MODE_DIRECT_IO = 3, // use direct I/O if available
|
||||
};
|
||||
|
||||
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
||||
|
|
|
|||
|
|
@ -513,8 +513,6 @@ llama_model_loader::llama_model_loader(
|
|||
const std::string & fname,
|
||||
std::vector<std::string> & splits,
|
||||
llama_load_mode load_mode,
|
||||
// bool use_mmap,
|
||||
// bool use_direct_io,
|
||||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
|
|
@ -553,20 +551,6 @@ llama_model_loader::llama_model_loader(
|
|||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||
contexts.emplace_back(ctx);
|
||||
|
||||
// if (use_mmap && use_direct_io) {
|
||||
// if (files.back()->has_direct_io()) {
|
||||
// LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||
// use_mmap = false;
|
||||
// } else {
|
||||
// LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
||||
// use_direct_io = false;
|
||||
|
||||
// // reopen file using std::fopen for mmap
|
||||
// files.pop_back();
|
||||
// files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
||||
// }
|
||||
// }
|
||||
|
||||
// Save tensors data offset of the main file.
|
||||
// For subsidiary files, `meta` tensor data offset must not be used,
|
||||
// so we build a unified tensors index for weights.
|
||||
|
|
|
|||
|
|
@ -126,8 +126,6 @@ struct llama_model_loader {
|
|||
const std::string & fname,
|
||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||
enum llama_load_mode load_mode,
|
||||
// bool use_mmap,
|
||||
// bool use_direct_io,
|
||||
bool check_tensors,
|
||||
bool no_alloc,
|
||||
const llama_model_kv_override * param_overrides_p,
|
||||
|
|
|
|||
|
|
@ -846,11 +846,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
||||
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
||||
#if defined(__linux__) || defined(_WIN32)
|
||||
// constexpr bool use_mmap = true;
|
||||
constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_MMAP;
|
||||
#else
|
||||
constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_NONE;
|
||||
// constexpr bool use_mmap = false;
|
||||
#endif
|
||||
|
||||
llama_model_kv_override * kv_overrides = nullptr;
|
||||
|
|
|
|||
|
|
@ -75,8 +75,6 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
|||
llama_model_params mparams_copy = *mparams;
|
||||
mparams_copy.no_alloc = true;
|
||||
mparams_copy.load_mode = LLAMA_LOAD_MODE_NONE;
|
||||
// mparams_copy.use_mmap = false;
|
||||
// mparams_copy.use_mlock = false;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||
if (model == nullptr) {
|
||||
|
|
@ -1036,7 +1034,6 @@ struct llama_model * llama_model_init_from_user(
|
|||
GGML_ASSERT(metadata != nullptr);
|
||||
std::string path_model;
|
||||
std::vector<std::string> splits = {};
|
||||
// params.use_mmap = false;
|
||||
params.load_mode = LLAMA_LOAD_MODE_NONE;
|
||||
params.use_extra_bufts = false;
|
||||
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
|
||||
|
|
|
|||
|
|
@ -16,7 +16,6 @@ int main(int argc, char *argv[] ) {
|
|||
|
||||
llama_backend_init();
|
||||
auto params = llama_model_params{};
|
||||
// params.use_mmap = false;
|
||||
params.load_mode = LLAMA_LOAD_MODE_NONE;
|
||||
params.progress_callback = [](float progress, void * ctx){
|
||||
(void) ctx;
|
||||
|
|
|
|||
|
|
@ -309,7 +309,9 @@ int main(int argc, char ** argv) {
|
|||
|
||||
{
|
||||
auto mparams = llama_model_default_params();
|
||||
// mparams.use_mlock = false;
|
||||
if (mparams.load_mode == LLAMA_LOAD_MODE_MLOCK) {
|
||||
mparams.load_mode = LLAMA_LOAD_MODE_MMAP;
|
||||
}
|
||||
|
||||
model = llama_model_load_from_file(params.model.c_str(), mparams);
|
||||
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ Performance testing tool for llama.cpp.
|
|||
## Syntax
|
||||
|
||||
```
|
||||
usage: build/bin/llama-bench [options]
|
||||
usage: llama-bench [options]
|
||||
|
||||
options:
|
||||
-h, --help
|
||||
|
|
|
|||
|
|
@ -354,8 +354,6 @@ struct cmd_params {
|
|||
std::vector<std::vector<ggml_backend_dev_t>> devices;
|
||||
std::vector<std::vector<float>> tensor_split;
|
||||
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
|
||||
// std::vector<bool> use_mmap;
|
||||
// std::vector<bool> use_direct_io;
|
||||
std::vector<bool> embeddings;
|
||||
std::vector<bool> no_op_offload;
|
||||
std::vector<bool> no_host;
|
||||
|
|
@ -397,8 +395,6 @@ static const cmd_params cmd_params_defaults = {
|
|||
/* devices */ { {} },
|
||||
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
|
||||
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
|
||||
// /* use_mmap */ { true },
|
||||
// /* use_direct_io */ { false },
|
||||
/* embeddings */ { false },
|
||||
/* no_op_offload */ { false },
|
||||
/* no_host */ { false },
|
||||
|
|
@ -1116,12 +1112,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||
if (params.tensor_buft_overrides.empty()) {
|
||||
params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
|
||||
}
|
||||
// if (params.use_mmap.empty()) {
|
||||
// params.use_mmap = cmd_params_defaults.use_mmap;
|
||||
// }
|
||||
// if (params.use_direct_io.empty()) {
|
||||
// params.use_direct_io = cmd_params_defaults.use_direct_io;
|
||||
// }
|
||||
if (params.embeddings.empty()) {
|
||||
params.embeddings = cmd_params_defaults.embeddings;
|
||||
}
|
||||
|
|
@ -1170,8 +1160,6 @@ struct cmd_params_instance {
|
|||
std::vector<ggml_backend_dev_t> devices;
|
||||
std::vector<float> tensor_split;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
// bool use_mmap;
|
||||
// bool use_direct_io;
|
||||
bool embeddings;
|
||||
bool no_op_offload;
|
||||
bool no_host;
|
||||
|
|
@ -1187,8 +1175,6 @@ struct cmd_params_instance {
|
|||
mparams.load_mode = load_mode;
|
||||
mparams.main_gpu = main_gpu;
|
||||
mparams.tensor_split = tensor_split.data();
|
||||
// mparams.use_mmap = use_mmap;
|
||||
// mparams.use_direct_io = use_direct_io;
|
||||
mparams.no_host = no_host;
|
||||
|
||||
if (n_cpu_moe <= 0) {
|
||||
|
|
@ -1234,10 +1220,7 @@ struct cmd_params_instance {
|
|||
return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
|
||||
split_mode == other.split_mode &&
|
||||
main_gpu == other.main_gpu && tensor_split == other.tensor_split &&
|
||||
load_mode == other.load_mode &&
|
||||
// use_mmap == other.use_mmap && use_direct_io == other.use_direct_io &&
|
||||
devices == other.devices &&
|
||||
no_host == other.no_host &&
|
||||
load_mode == other.load_mode && devices == other.devices && no_host == other.no_host &&
|
||||
vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
|
||||
}
|
||||
|
||||
|
|
@ -1273,8 +1256,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
for (const auto & devs : params.devices)
|
||||
for (const auto & ts : params.tensor_split)
|
||||
for (const auto & ot : params.tensor_buft_overrides)
|
||||
// for (const auto & mmp : params.use_mmap)
|
||||
// for (const auto & dio : params.use_direct_io)
|
||||
for (const auto & noh : params.no_host)
|
||||
for (const auto & embd : params.embeddings)
|
||||
for (const auto & nopo : params.no_op_offload)
|
||||
|
|
@ -1352,8 +1333,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
/* .devices = */ devs,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
// /* .use_mmap = */ mmp,
|
||||
// /* .use_direct_io= */ dio,
|
||||
/* .embeddings = */ embd,
|
||||
/* .no_op_offload= */ nopo,
|
||||
/* .no_host = */ noh,
|
||||
|
|
@ -1388,8 +1367,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
/* .devices = */ devs,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .tensor_buft_overrides = */ ot,
|
||||
// /* .use_mmap = */ mmp,
|
||||
// /* .use_direct_io= */ dio,
|
||||
/* .embeddings = */ embd,
|
||||
/* .no_op_offload= */ nopo,
|
||||
/* .no_host = */ noh,
|
||||
|
|
@ -1468,8 +1445,6 @@ struct test {
|
|||
devices = inst.devices;
|
||||
tensor_split = inst.tensor_split;
|
||||
tensor_buft_overrides = inst.tensor_buft_overrides;
|
||||
// use_mmap = inst.use_mmap;
|
||||
// use_direct_io = inst.use_direct_io;
|
||||
embeddings = inst.embeddings;
|
||||
no_op_offload = inst.no_op_offload;
|
||||
no_host = inst.no_host;
|
||||
|
|
@ -1529,7 +1504,7 @@ struct test {
|
|||
"n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
|
||||
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
|
||||
"main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
|
||||
"tensor_buft_overrides", "load_mode", "embeddings",
|
||||
"tensor_buft_overrides", "load_mode", "embeddings",
|
||||
"no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth",
|
||||
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
|
||||
};
|
||||
|
|
@ -1621,8 +1596,6 @@ struct test {
|
|||
devices_to_string(devices),
|
||||
tensor_split_str,
|
||||
tensor_buft_overrides_str,
|
||||
// std::to_string(use_mmap),
|
||||
// std::to_string(use_direct_io),
|
||||
load_mode_str(load_mode),
|
||||
std::to_string(embeddings),
|
||||
std::to_string(no_op_offload),
|
||||
|
|
@ -1849,12 +1822,6 @@ struct markdown_printer : public printer {
|
|||
if (field == "flash_attn") {
|
||||
return "fa";
|
||||
}
|
||||
// if (field == "use_mmap") {
|
||||
// return "mmap";
|
||||
// }
|
||||
// if (field == "use_direct_io") {
|
||||
// return "dio";
|
||||
// }
|
||||
if (field == "load_mode") {
|
||||
return "lm";
|
||||
}
|
||||
|
|
@ -1942,12 +1909,6 @@ struct markdown_printer : public printer {
|
|||
if (params.load_mode.size() > 1 || params.load_mode != cmd_params_defaults.load_mode) {
|
||||
fields.emplace_back("load_mode");
|
||||
}
|
||||
// if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
||||
// fields.emplace_back("use_mmap");
|
||||
// }
|
||||
// if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
|
||||
// fields.emplace_back("use_direct_io");
|
||||
// }
|
||||
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
|
||||
fields.emplace_back("embeddings");
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue