args: refactor mlock/mmap/directio into load-mode
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
This commit is contained in:
parent
cea560f483
commit
6077971ec2
|
|
@ -3,11 +3,14 @@
|
||||||
#include "chat.h"
|
#include "chat.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "download.h"
|
#include "download.h"
|
||||||
|
#include "ggml.h"
|
||||||
#include "json-schema-to-grammar.h"
|
#include "json-schema-to-grammar.h"
|
||||||
|
#include "llama.h"
|
||||||
#include "log.h"
|
#include "log.h"
|
||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
#include "speculative.h"
|
#include "speculative.h"
|
||||||
#include "preset.h"
|
#include "preset.h"
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
// fix problem with std::min and std::max
|
// fix problem with std::min and std::max
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
|
|
@ -2206,25 +2209,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
{"--mlock"},
|
{"--mlock"},
|
||||||
"force system to keep model in RAM rather than swapping or compressing",
|
"force system to keep model in RAM rather than swapping or compressing",
|
||||||
[](common_params & params) {
|
[](common_params & params) {
|
||||||
params.use_mlock = true;
|
throw std::runtime_error("error: --mlock is deprecated. use --load-mode mlock instead");
|
||||||
|
|
||||||
|
GGML_UNUSED(params);
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_MLOCK"));
|
).set_env("LLAMA_ARG_MLOCK"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--mmap"},
|
{"--mmap"},
|
||||||
{"--no-mmap"},
|
{"--no-mmap"},
|
||||||
string_format("whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
|
"whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)",
|
||||||
[](common_params & params, bool value) {
|
[](common_params & params, bool value) {
|
||||||
params.use_mmap = value;
|
throw std::runtime_error("error: --mmap and --no-mmap are deprecated. use --load-mode mmap instead");
|
||||||
|
|
||||||
|
GGML_UNUSED(params);
|
||||||
|
GGML_UNUSED(value);
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_MMAP"));
|
).set_env("LLAMA_ARG_MMAP"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"-dio", "--direct-io"},
|
{"-dio", "--direct-io"},
|
||||||
{"-ndio", "--no-direct-io"},
|
{"-ndio", "--no-direct-io"},
|
||||||
string_format("use DirectIO if available. (default: %s)", params.use_direct_io ? "enabled" : "disabled"),
|
"use DirectIO if available",
|
||||||
[](common_params & params, bool value) {
|
[](common_params & params, bool value) {
|
||||||
params.use_direct_io = value;
|
throw std::invalid_argument("error: -dio/--direct-io and -ndio/--no-direct-io are deprecated. use --load-mode dio instead");
|
||||||
|
|
||||||
|
GGML_UNUSED(params);
|
||||||
|
GGML_UNUSED(value);
|
||||||
}
|
}
|
||||||
).set_env("LLAMA_ARG_DIO"));
|
).set_env("LLAMA_ARG_DIO"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"-lm", "--load-mode"}, "MODE",
|
||||||
|
"model loading mode (default: mmap)\n"
|
||||||
|
"- mlock: force system to keep model in RAM rather than swapping or compressing.\n"
|
||||||
|
"- mmap: memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock)\n"
|
||||||
|
"- dio: use DirectIO if available.\n",
|
||||||
|
[](common_params & params, const std::string & value) {
|
||||||
|
if (value == "") { params.load_mode = LLAMA_LOAD_MODE_MMAP; }
|
||||||
|
else if (value == "mlock") { params.load_mode = LLAMA_LOAD_MODE_MLOCK; }
|
||||||
|
else if (value == "mmap") { params.load_mode = LLAMA_LOAD_MODE_MMAP; }
|
||||||
|
else if (value == "dio") { params.load_mode = LLAMA_LOAD_MODE_DIRECT_IO; }
|
||||||
|
else { throw std::invalid_argument("invalid value"); }
|
||||||
|
}
|
||||||
|
));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--numa"}, "TYPE",
|
{"--numa"}, "TYPE",
|
||||||
"attempt optimizations that help on some NUMA systems\n"
|
"attempt optimizations that help on some NUMA systems\n"
|
||||||
|
|
|
||||||
|
|
@ -1320,9 +1320,10 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
|
||||||
mparams.main_gpu = params.main_gpu;
|
mparams.main_gpu = params.main_gpu;
|
||||||
mparams.split_mode = params.split_mode;
|
mparams.split_mode = params.split_mode;
|
||||||
mparams.tensor_split = params.tensor_split;
|
mparams.tensor_split = params.tensor_split;
|
||||||
mparams.use_mmap = params.use_mmap;
|
// mparams.use_mmap = params.use_mmap;
|
||||||
mparams.use_direct_io = params.use_direct_io;
|
// mparams.use_direct_io = params.use_direct_io;
|
||||||
mparams.use_mlock = params.use_mlock;
|
// mparams.use_mlock = params.use_mlock;
|
||||||
|
mparams.load_mode = params.load_mode;
|
||||||
mparams.check_tensors = params.check_tensors;
|
mparams.check_tensors = params.check_tensors;
|
||||||
mparams.use_extra_bufts = !params.no_extra_bufts;
|
mparams.use_extra_bufts = !params.no_extra_bufts;
|
||||||
mparams.no_host = params.no_host;
|
mparams.no_host = params.no_host;
|
||||||
|
|
|
||||||
|
|
@ -443,6 +443,7 @@ struct common_params {
|
||||||
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
|
std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
|
||||||
|
|
||||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||||
|
enum llama_load_mode load_mode = LLAMA_LOAD_MODE_MMAP; // how to load the model
|
||||||
|
|
||||||
struct cpu_params cpuparams;
|
struct cpu_params cpuparams;
|
||||||
struct cpu_params cpuparams_batch;
|
struct cpu_params cpuparams_batch;
|
||||||
|
|
@ -532,9 +533,9 @@ struct common_params {
|
||||||
bool kv_unified = false; // enable unified KV cache
|
bool kv_unified = false; // enable unified KV cache
|
||||||
|
|
||||||
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
||||||
bool use_mmap = true; // enable mmap to use filesystem cache
|
// bool use_mmap = true; // enable mmap to use filesystem cache
|
||||||
bool use_direct_io = false; // read from disk without buffering
|
// bool use_direct_io = false; // read from disk without buffering
|
||||||
bool use_mlock = false; // use mlock to keep model in memory
|
// bool use_mlock = false; // use mlock to keep model in memory
|
||||||
bool verbose_prompt = false; // print prompt tokens before generation
|
bool verbose_prompt = false; // print prompt tokens before generation
|
||||||
bool display_prompt = true; // print prompt before generation
|
bool display_prompt = true; // print prompt before generation
|
||||||
bool no_kv_offload = false; // disable KV offloading
|
bool no_kv_offload = false; // disable KV offloading
|
||||||
|
|
|
||||||
|
|
@ -555,9 +555,10 @@ int main(int argc, char ** argv) {
|
||||||
llama_model_params model_params = llama_model_default_params();
|
llama_model_params model_params = llama_model_default_params();
|
||||||
model_params.n_gpu_layers = params.n_gpu_layers;
|
model_params.n_gpu_layers = params.n_gpu_layers;
|
||||||
model_params.devices = params.devices.data();
|
model_params.devices = params.devices.data();
|
||||||
model_params.use_mmap = params.use_mmap;
|
model_params.load_mode = params.load_mode;
|
||||||
model_params.use_direct_io = params.use_direct_io;
|
// model_params.use_mmap = params.use_mmap;
|
||||||
model_params.use_mlock = params.use_mlock;
|
// model_params.use_direct_io = params.use_direct_io;
|
||||||
|
// model_params.use_mlock = params.use_mlock;
|
||||||
model_params.check_tensors = params.check_tensors;
|
model_params.check_tensors = params.check_tensors;
|
||||||
|
|
||||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
|
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
|
||||||
|
|
|
||||||
|
|
@ -24,10 +24,11 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.use_mmap) {
|
if (params.load_mode == LLAMA_LOAD_MODE_MMAP) {
|
||||||
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n",
|
LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n",
|
||||||
__func__);
|
__func__);
|
||||||
params.use_mmap = false;
|
// params.use_mmap = false;
|
||||||
|
params.load_mode = LLAMA_LOAD_MODE_NONE;
|
||||||
}
|
}
|
||||||
if (params.cache_type_k != GGML_TYPE_F32) {
|
if (params.cache_type_k != GGML_TYPE_F32) {
|
||||||
LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
|
LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
|
||||||
|
|
|
||||||
|
|
@ -196,6 +196,13 @@ extern "C" {
|
||||||
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
|
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum llama_load_mode { // how the model file is brought into memory; a single mode is selected at a time
|
||||||
|
LLAMA_LOAD_MODE_NONE = 0, // none of mmap, mlock or direct I/O is requested
|
||||||
|
LLAMA_LOAD_MODE_MMAP = 1, // memory-map the model file
|
||||||
|
LLAMA_LOAD_MODE_MLOCK = 2, // force system to keep model in RAM; NOTE(review): mmap is a separate mode, so selecting mlock does not also request mmap - confirm this is intended
|
||||||
|
LLAMA_LOAD_MODE_DIRECT_IO = 3, // read the model file with direct I/O; NOTE(review): confirm what happens when direct I/O is unavailable
|
||||||
|
};
|
||||||
|
|
||||||
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
||||||
typedef struct llama_token_data {
|
typedef struct llama_token_data {
|
||||||
llama_token id; // token id
|
llama_token id; // token id
|
||||||
|
|
@ -290,6 +297,7 @@ extern "C" {
|
||||||
|
|
||||||
int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
|
int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
|
||||||
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
||||||
|
enum llama_load_mode load_mode; // how to load the model into memory
|
||||||
|
|
||||||
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
|
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
|
||||||
int32_t main_gpu;
|
int32_t main_gpu;
|
||||||
|
|
@ -310,9 +318,9 @@ extern "C" {
|
||||||
|
|
||||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||||
bool vocab_only; // only load the vocabulary, no weights
|
bool vocab_only; // only load the vocabulary, no weights
|
||||||
bool use_mmap; // use mmap if possible
|
// bool use_mmap; // DEPRECATED: use mmap if possible
|
||||||
bool use_direct_io; // use direct io, takes precedence over use_mmap when supported
|
// bool use_direct_io; // DEPRECATED: use direct io, takes precedence over use_mmap when supported
|
||||||
bool use_mlock; // force system to keep model in RAM
|
// bool use_mlock; // DEPRECATED: force system to keep model in RAM
|
||||||
bool check_tensors; // validate model tensor data
|
bool check_tensors; // validate model tensor data
|
||||||
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
||||||
bool no_host; // bypass host buffer allowing extra buffers to be used
|
bool no_host; // bypass host buffer allowing extra buffers to be used
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "gguf.h"
|
#include "gguf.h"
|
||||||
#include "llama-hparams.h"
|
#include "llama-hparams.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <array>
|
#include <array>
|
||||||
|
|
@ -511,8 +512,9 @@ llama_model_loader::llama_model_loader(
|
||||||
void * set_tensor_data_ud,
|
void * set_tensor_data_ud,
|
||||||
const std::string & fname,
|
const std::string & fname,
|
||||||
std::vector<std::string> & splits,
|
std::vector<std::string> & splits,
|
||||||
bool use_mmap,
|
llama_load_mode load_mode,
|
||||||
bool use_direct_io,
|
// bool use_mmap,
|
||||||
|
// bool use_direct_io,
|
||||||
bool check_tensors,
|
bool check_tensors,
|
||||||
bool no_alloc,
|
bool no_alloc,
|
||||||
const llama_model_kv_override * param_overrides_p,
|
const llama_model_kv_override * param_overrides_p,
|
||||||
|
|
@ -551,19 +553,19 @@ llama_model_loader::llama_model_loader(
|
||||||
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
||||||
contexts.emplace_back(ctx);
|
contexts.emplace_back(ctx);
|
||||||
|
|
||||||
if (use_mmap && use_direct_io) {
|
// if (use_mmap && use_direct_io) {
|
||||||
if (files.back()->has_direct_io()) {
|
// if (files.back()->has_direct_io()) {
|
||||||
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
// LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
||||||
use_mmap = false;
|
// use_mmap = false;
|
||||||
} else {
|
// } else {
|
||||||
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
// LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
||||||
use_direct_io = false;
|
// use_direct_io = false;
|
||||||
|
|
||||||
// reopen file using std::fopen for mmap
|
// // reopen file using std::fopen for mmap
|
||||||
files.pop_back();
|
// files.pop_back();
|
||||||
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
// files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
|
||||||
// Save tensors data offset of the main file.
|
// Save tensors data offset of the main file.
|
||||||
// For subsidiary files, `meta` tensor data offset must not be used,
|
// For subsidiary files, `meta` tensor data offset must not be used,
|
||||||
|
|
@ -778,8 +780,8 @@ llama_model_loader::llama_model_loader(
|
||||||
use_mmap = false;
|
use_mmap = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
this->use_mmap = use_mmap;
|
this->use_mmap = load_mode == LLAMA_LOAD_MODE_MMAP;
|
||||||
this->use_direct_io = use_direct_io;
|
this->use_direct_io = load_mode == LLAMA_LOAD_MODE_DIRECT_IO;
|
||||||
this->check_tensors = check_tensors;
|
this->check_tensors = check_tensors;
|
||||||
this->no_alloc = no_alloc;
|
this->no_alloc = no_alloc;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -125,8 +125,9 @@ struct llama_model_loader {
|
||||||
void * set_tensor_data_ud,
|
void * set_tensor_data_ud,
|
||||||
const std::string & fname,
|
const std::string & fname,
|
||||||
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
|
||||||
bool use_mmap,
|
enum llama_load_mode load_mode,
|
||||||
bool use_direct_io,
|
// bool use_mmap,
|
||||||
|
// bool use_direct_io,
|
||||||
bool check_tensors,
|
bool check_tensors,
|
||||||
bool no_alloc,
|
bool no_alloc,
|
||||||
const llama_model_kv_override * param_overrides_p,
|
const llama_model_kv_override * param_overrides_p,
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@
|
||||||
|
|
||||||
#include "ggml-cpp.h"
|
#include "ggml-cpp.h"
|
||||||
|
|
||||||
|
#include "llama.h"
|
||||||
#include "models/models.h"
|
#include "models/models.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
@ -2581,7 +2582,7 @@ void llama_model::load_vocab(llama_model_loader & ml) {
|
||||||
|
|
||||||
bool llama_model::load_tensors(llama_model_loader & ml) {
|
bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||||
const auto & split_mode = params.split_mode;
|
const auto & split_mode = params.split_mode;
|
||||||
const auto & use_mlock = params.use_mlock;
|
const auto & use_mlock = params.load_mode == LLAMA_LOAD_MODE_MLOCK;
|
||||||
const auto & tensor_split = params.tensor_split;
|
const auto & tensor_split = params.tensor_split;
|
||||||
|
|
||||||
const int n_layer = hparams.n_layer;
|
const int n_layer = hparams.n_layer;
|
||||||
|
|
@ -8698,15 +8699,13 @@ llama_model_params llama_model_default_params() {
|
||||||
/*.tensor_buft_overrides =*/ nullptr,
|
/*.tensor_buft_overrides =*/ nullptr,
|
||||||
/*.n_gpu_layers =*/ -1,
|
/*.n_gpu_layers =*/ -1,
|
||||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||||||
|
/*.load_mode =*/ LLAMA_LOAD_MODE_MMAP,
|
||||||
/*.main_gpu =*/ 0,
|
/*.main_gpu =*/ 0,
|
||||||
/*.tensor_split =*/ nullptr,
|
/*.tensor_split =*/ nullptr,
|
||||||
/*.progress_callback =*/ nullptr,
|
/*.progress_callback =*/ nullptr,
|
||||||
/*.progress_callback_user_data =*/ nullptr,
|
/*.progress_callback_user_data =*/ nullptr,
|
||||||
/*.kv_overrides =*/ nullptr,
|
/*.kv_overrides =*/ nullptr,
|
||||||
/*.vocab_only =*/ false,
|
/*.vocab_only =*/ false,
|
||||||
/*.use_mmap =*/ true,
|
|
||||||
/*.use_direct_io =*/ false,
|
|
||||||
/*.use_mlock =*/ false,
|
|
||||||
/*.check_tensors =*/ false,
|
/*.check_tensors =*/ false,
|
||||||
/*.use_extra_bufts =*/ true,
|
/*.use_extra_bufts =*/ true,
|
||||||
/*.no_host =*/ false,
|
/*.no_host =*/ false,
|
||||||
|
|
|
||||||
|
|
@ -846,9 +846,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
||||||
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
||||||
#if defined(__linux__) || defined(_WIN32)
|
#if defined(__linux__) || defined(_WIN32)
|
||||||
constexpr bool use_mmap = true;
|
// constexpr bool use_mmap = true;
|
||||||
|
constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_MMAP;
|
||||||
#else
|
#else
|
||||||
constexpr bool use_mmap = false;
|
constexpr llama_load_mode load_mode = LLAMA_LOAD_MODE_NONE;
|
||||||
|
// constexpr bool use_mmap = false;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
llama_model_kv_override * kv_overrides = nullptr;
|
llama_model_kv_override * kv_overrides = nullptr;
|
||||||
|
|
@ -859,7 +861,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
|
|
||||||
std::vector<std::string> splits = {};
|
std::vector<std::string> splits = {};
|
||||||
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
|
llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
|
||||||
fname_inp, splits, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
fname_inp, splits, load_mode, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
|
||||||
ml.init_mappings(false); // no prefetching
|
ml.init_mappings(false); // no prefetching
|
||||||
|
|
||||||
llama_model model(llama_model_default_params());
|
llama_model model(llama_model_default_params());
|
||||||
|
|
|
||||||
|
|
@ -74,8 +74,9 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
|
||||||
|
|
||||||
llama_model_params mparams_copy = *mparams;
|
llama_model_params mparams_copy = *mparams;
|
||||||
mparams_copy.no_alloc = true;
|
mparams_copy.no_alloc = true;
|
||||||
mparams_copy.use_mmap = false;
|
mparams_copy.load_mode = LLAMA_LOAD_MODE_NONE;
|
||||||
mparams_copy.use_mlock = false;
|
// mparams_copy.use_mmap = false;
|
||||||
|
// mparams_copy.use_mlock = false;
|
||||||
|
|
||||||
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
|
||||||
if (model == nullptr) {
|
if (model == nullptr) {
|
||||||
|
|
@ -837,7 +838,7 @@ static int llama_model_load(struct gguf_context * metadata, llama_model_set_tens
|
||||||
model.t_start_us = tm.t_start_us;
|
model.t_start_us = tm.t_start_us;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.use_mmap, params.use_direct_io,
|
llama_model_loader ml(metadata, set_tensor_data, set_tensor_data_ud, fname, splits, params.load_mode,
|
||||||
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
|
||||||
|
|
||||||
ml.print_info();
|
ml.print_info();
|
||||||
|
|
@ -1035,7 +1036,8 @@ struct llama_model * llama_model_init_from_user(
|
||||||
GGML_ASSERT(metadata != nullptr);
|
GGML_ASSERT(metadata != nullptr);
|
||||||
std::string path_model;
|
std::string path_model;
|
||||||
std::vector<std::string> splits = {};
|
std::vector<std::string> splits = {};
|
||||||
params.use_mmap = false;
|
// params.use_mmap = false;
|
||||||
|
params.load_mode = LLAMA_LOAD_MODE_NONE;
|
||||||
params.use_extra_bufts = false;
|
params.use_extra_bufts = false;
|
||||||
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
|
return llama_model_load_from_file_impl(metadata, set_tensor_data, set_tensor_data_ud, path_model, splits, params);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -156,13 +156,35 @@ int main(void) {
|
||||||
|
|
||||||
printf("test-arg-parser: test negated environment variables\n\n");
|
printf("test-arg-parser: test negated environment variables\n\n");
|
||||||
|
|
||||||
setenv("LLAMA_ARG_MMAP", "0", true);
|
setenv("LLAMA_ARG_LOAD_MODE", "none", true);
|
||||||
|
argv = {"binary_name"};
|
||||||
|
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
|
assert(params.load_mode == LLAMA_LOAD_MODE_NONE);
|
||||||
|
|
||||||
|
setenv("LLAMA_ARG_LOAD_MODE", "mlock", true);
|
||||||
|
argv = {"binary_name"};
|
||||||
|
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
|
assert(params.load_mode == LLAMA_LOAD_MODE_MLOCK);
|
||||||
|
|
||||||
|
setenv("LLAMA_ARG_LOAD_MODE", "mmap", true);
|
||||||
setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format
|
setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format
|
||||||
argv = {"binary_name"};
|
argv = {"binary_name"};
|
||||||
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
assert(params.use_mmap == false);
|
assert(params.load_mode == LLAMA_LOAD_MODE_MMAP);
|
||||||
assert(params.no_perf == true);
|
assert(params.no_perf == true);
|
||||||
|
|
||||||
|
setenv("LLAMA_ARG_LOAD_MODE", "dio", true);
|
||||||
|
argv = {"binary_name"};
|
||||||
|
assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
|
assert(params.load_mode == LLAMA_LOAD_MODE_DIRECT_IO);
|
||||||
|
|
||||||
|
// setenv("LLAMA_ARG_MMAP", "0", true);
|
||||||
|
// setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format
|
||||||
|
// argv = {"binary_name"};
|
||||||
|
// assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
|
||||||
|
// assert(params.use_mmap == false);
|
||||||
|
// assert(params.no_perf == true);
|
||||||
|
|
||||||
printf("test-arg-parser: test environment variables being overwritten\n\n");
|
printf("test-arg-parser: test environment variables being overwritten\n\n");
|
||||||
|
|
||||||
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,8 @@ int main(int argc, char *argv[] ) {
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
auto params = llama_model_params{};
|
auto params = llama_model_params{};
|
||||||
params.use_mmap = false;
|
// params.use_mmap = false;
|
||||||
|
params.load_mode = LLAMA_LOAD_MODE_NONE;
|
||||||
params.progress_callback = [](float progress, void * ctx){
|
params.progress_callback = [](float progress, void * ctx){
|
||||||
(void) ctx;
|
(void) ctx;
|
||||||
return progress > 0.50;
|
return progress > 0.50;
|
||||||
|
|
|
||||||
|
|
@ -309,7 +309,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
{
|
{
|
||||||
auto mparams = llama_model_default_params();
|
auto mparams = llama_model_default_params();
|
||||||
mparams.use_mlock = false;
|
// mparams.use_mlock = false;
|
||||||
|
|
||||||
model = llama_model_load_from_file(params.model.c_str(), mparams);
|
model = llama_model_load_from_file(params.model.c_str(), mparams);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
#include <stdexcept>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
@ -265,6 +266,21 @@ static const char * split_mode_str(llama_split_mode mode) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Maps a load mode to the keyword used for it on the command line
// ("none", "mlock", "mmap", "dio"). Aborts on a value outside the enum.
static const char * load_mode_str(llama_load_mode mode) {
    if (mode == LLAMA_LOAD_MODE_NONE) {
        return "none";
    }
    if (mode == LLAMA_LOAD_MODE_MLOCK) {
        return "mlock";
    }
    if (mode == LLAMA_LOAD_MODE_MMAP) {
        return "mmap";
    }
    if (mode == LLAMA_LOAD_MODE_DIRECT_IO) {
        return "dio";
    }
    // not one of the known enumerators
    GGML_ABORT("invalid load mode");
}
|
||||||
|
|
||||||
static std::string pair_str(const std::pair<int, int> & p) {
|
static std::string pair_str(const std::pair<int, int> & p) {
|
||||||
static char buf[32];
|
static char buf[32];
|
||||||
snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
|
snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
|
||||||
|
|
@ -331,14 +347,15 @@ struct cmd_params {
|
||||||
std::vector<int> n_gpu_layers;
|
std::vector<int> n_gpu_layers;
|
||||||
std::vector<int> n_cpu_moe;
|
std::vector<int> n_cpu_moe;
|
||||||
std::vector<llama_split_mode> split_mode;
|
std::vector<llama_split_mode> split_mode;
|
||||||
|
std::vector<llama_load_mode> load_mode;
|
||||||
std::vector<int> main_gpu;
|
std::vector<int> main_gpu;
|
||||||
std::vector<bool> no_kv_offload;
|
std::vector<bool> no_kv_offload;
|
||||||
std::vector<bool> flash_attn;
|
std::vector<bool> flash_attn;
|
||||||
std::vector<std::vector<ggml_backend_dev_t>> devices;
|
std::vector<std::vector<ggml_backend_dev_t>> devices;
|
||||||
std::vector<std::vector<float>> tensor_split;
|
std::vector<std::vector<float>> tensor_split;
|
||||||
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
|
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
|
||||||
std::vector<bool> use_mmap;
|
// std::vector<bool> use_mmap;
|
||||||
std::vector<bool> use_direct_io;
|
// std::vector<bool> use_direct_io;
|
||||||
std::vector<bool> embeddings;
|
std::vector<bool> embeddings;
|
||||||
std::vector<bool> no_op_offload;
|
std::vector<bool> no_op_offload;
|
||||||
std::vector<bool> no_host;
|
std::vector<bool> no_host;
|
||||||
|
|
@ -373,14 +390,15 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* n_gpu_layers */ { 99 },
|
/* n_gpu_layers */ { 99 },
|
||||||
/* n_cpu_moe */ { 0 },
|
/* n_cpu_moe */ { 0 },
|
||||||
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
|
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
|
||||||
|
/* load_mode */ { LLAMA_LOAD_MODE_MMAP },
|
||||||
/* main_gpu */ { 0 },
|
/* main_gpu */ { 0 },
|
||||||
/* no_kv_offload */ { false },
|
/* no_kv_offload */ { false },
|
||||||
/* flash_attn */ { false },
|
/* flash_attn */ { false },
|
||||||
/* devices */ { {} },
|
/* devices */ { {} },
|
||||||
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
|
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
|
||||||
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
|
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
|
||||||
/* use_mmap */ { true },
|
// /* use_mmap */ { true },
|
||||||
/* use_direct_io */ { false },
|
// /* use_direct_io */ { false },
|
||||||
/* embeddings */ { false },
|
/* embeddings */ { false },
|
||||||
/* no_op_offload */ { false },
|
/* no_op_offload */ { false },
|
||||||
/* no_host */ { false },
|
/* no_host */ { false },
|
||||||
|
|
@ -443,8 +461,9 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||||
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||||
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
|
printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
|
||||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
printf(" -mmp, --mmap <0|1> (DEPRECATED)\n");
|
||||||
printf(" -dio, --direct-io <0|1> (default: %s)\n", join(cmd_params_defaults.use_direct_io, ",").c_str());
|
printf(" -dio, --direct-io <0|1> (DEPRECATED)\n");
|
||||||
|
printf(" -lm, --load-mode <none|mlock|mmap|dio> (default: %s)\n", join(transform_to_str(cmd_params_defaults.load_mode, load_mode_str), ",").c_str());
|
||||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||||
printf(" -ot --override-tensor <tensor name pattern>=<buffer type>;...\n");
|
printf(" -ot --override-tensor <tensor name pattern>=<buffer type>;...\n");
|
||||||
|
|
@ -747,6 +766,34 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
|
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
|
||||||
|
} else if (arg == "-lm" || arg == "--load-mode") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = string_split<std::string>(argv[i], split_delim);
|
||||||
|
|
||||||
|
std::vector<llama_load_mode> modes;
|
||||||
|
for (const auto & m : p) {
|
||||||
|
llama_load_mode mode;
|
||||||
|
if (m == "none") {
|
||||||
|
mode = LLAMA_LOAD_MODE_NONE;
|
||||||
|
} else if (m == "mlock") {
|
||||||
|
mode = LLAMA_LOAD_MODE_MLOCK;
|
||||||
|
} else if (m == "mmap") {
|
||||||
|
mode = LLAMA_LOAD_MODE_MMAP;
|
||||||
|
} else if (m == "dio") {
|
||||||
|
mode = LLAMA_LOAD_MODE_DIRECT_IO;
|
||||||
|
} else {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
modes.push_back(mode);
|
||||||
|
}
|
||||||
|
if (invalid_param) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.load_mode.insert(params.load_mode.end(), modes.begin(), modes.end());
|
||||||
} else if (arg == "-mg" || arg == "--main-gpu") {
|
} else if (arg == "-mg" || arg == "--main-gpu") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
|
@ -788,15 +835,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
auto p = string_split<bool>(argv[i], split_delim);
|
throw std::invalid_argument("error: -mmp/--mmap option is deprecated; please use -lm/--load-mode mmap instead");
|
||||||
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
|
|
||||||
} else if (arg == "-dio" || arg == "--direct-io") {
|
} else if (arg == "-dio" || arg == "--direct-io") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
auto p = string_split<bool>(argv[i], split_delim);
|
throw std::invalid_argument("error: -dio/--direct-io option is deprecated; please use -lm/--load-mode dio instead");
|
||||||
params.use_direct_io.insert(params.use_direct_io.end(), p.begin(), p.end());
|
|
||||||
} else if (arg == "-embd" || arg == "--embeddings") {
|
} else if (arg == "-embd" || arg == "--embeddings") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
|
@ -1050,6 +1095,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
if (params.split_mode.empty()) {
|
if (params.split_mode.empty()) {
|
||||||
params.split_mode = cmd_params_defaults.split_mode;
|
params.split_mode = cmd_params_defaults.split_mode;
|
||||||
}
|
}
|
||||||
|
if (params.load_mode.empty()) {
|
||||||
|
params.load_mode = cmd_params_defaults.load_mode;
|
||||||
|
}
|
||||||
if (params.main_gpu.empty()) {
|
if (params.main_gpu.empty()) {
|
||||||
params.main_gpu = cmd_params_defaults.main_gpu;
|
params.main_gpu = cmd_params_defaults.main_gpu;
|
||||||
}
|
}
|
||||||
|
|
@ -1068,12 +1116,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
if (params.tensor_buft_overrides.empty()) {
|
if (params.tensor_buft_overrides.empty()) {
|
||||||
params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
|
params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
|
||||||
}
|
}
|
||||||
if (params.use_mmap.empty()) {
|
// if (params.use_mmap.empty()) {
|
||||||
params.use_mmap = cmd_params_defaults.use_mmap;
|
// params.use_mmap = cmd_params_defaults.use_mmap;
|
||||||
}
|
// }
|
||||||
if (params.use_direct_io.empty()) {
|
// if (params.use_direct_io.empty()) {
|
||||||
params.use_direct_io = cmd_params_defaults.use_direct_io;
|
// params.use_direct_io = cmd_params_defaults.use_direct_io;
|
||||||
}
|
// }
|
||||||
if (params.embeddings.empty()) {
|
if (params.embeddings.empty()) {
|
||||||
params.embeddings = cmd_params_defaults.embeddings;
|
params.embeddings = cmd_params_defaults.embeddings;
|
||||||
}
|
}
|
||||||
|
|
@ -1115,14 +1163,15 @@ struct cmd_params_instance {
|
||||||
int n_gpu_layers;
|
int n_gpu_layers;
|
||||||
int n_cpu_moe;
|
int n_cpu_moe;
|
||||||
llama_split_mode split_mode;
|
llama_split_mode split_mode;
|
||||||
|
llama_load_mode load_mode;
|
||||||
int main_gpu;
|
int main_gpu;
|
||||||
bool no_kv_offload;
|
bool no_kv_offload;
|
||||||
bool flash_attn;
|
bool flash_attn;
|
||||||
std::vector<ggml_backend_dev_t> devices;
|
std::vector<ggml_backend_dev_t> devices;
|
||||||
std::vector<float> tensor_split;
|
std::vector<float> tensor_split;
|
||||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||||
bool use_mmap;
|
// bool use_mmap;
|
||||||
bool use_direct_io;
|
// bool use_direct_io;
|
||||||
bool embeddings;
|
bool embeddings;
|
||||||
bool no_op_offload;
|
bool no_op_offload;
|
||||||
bool no_host;
|
bool no_host;
|
||||||
|
|
@ -1135,10 +1184,11 @@ struct cmd_params_instance {
|
||||||
mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
|
mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
|
||||||
}
|
}
|
||||||
mparams.split_mode = split_mode;
|
mparams.split_mode = split_mode;
|
||||||
|
mparams.load_mode = load_mode;
|
||||||
mparams.main_gpu = main_gpu;
|
mparams.main_gpu = main_gpu;
|
||||||
mparams.tensor_split = tensor_split.data();
|
mparams.tensor_split = tensor_split.data();
|
||||||
mparams.use_mmap = use_mmap;
|
// mparams.use_mmap = use_mmap;
|
||||||
mparams.use_direct_io = use_direct_io;
|
// mparams.use_direct_io = use_direct_io;
|
||||||
mparams.no_host = no_host;
|
mparams.no_host = no_host;
|
||||||
|
|
||||||
if (n_cpu_moe <= 0) {
|
if (n_cpu_moe <= 0) {
|
||||||
|
|
@ -1184,7 +1234,8 @@ struct cmd_params_instance {
|
||||||
return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
|
return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
|
||||||
split_mode == other.split_mode &&
|
split_mode == other.split_mode &&
|
||||||
main_gpu == other.main_gpu && tensor_split == other.tensor_split &&
|
main_gpu == other.main_gpu && tensor_split == other.tensor_split &&
|
||||||
use_mmap == other.use_mmap && use_direct_io == other.use_direct_io &&
|
load_mode == other.load_mode &&
|
||||||
|
// use_mmap == other.use_mmap && use_direct_io == other.use_direct_io &&
|
||||||
devices == other.devices &&
|
devices == other.devices &&
|
||||||
no_host == other.no_host &&
|
no_host == other.no_host &&
|
||||||
vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
|
vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
|
||||||
|
|
@ -1217,12 +1268,13 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
for (const auto & nl : params.n_gpu_layers)
|
for (const auto & nl : params.n_gpu_layers)
|
||||||
for (const auto & ncmoe : params.n_cpu_moe)
|
for (const auto & ncmoe : params.n_cpu_moe)
|
||||||
for (const auto & sm : params.split_mode)
|
for (const auto & sm : params.split_mode)
|
||||||
|
for (const auto & lm : params.load_mode)
|
||||||
for (const auto & mg : params.main_gpu)
|
for (const auto & mg : params.main_gpu)
|
||||||
for (const auto & devs : params.devices)
|
for (const auto & devs : params.devices)
|
||||||
for (const auto & ts : params.tensor_split)
|
for (const auto & ts : params.tensor_split)
|
||||||
for (const auto & ot : params.tensor_buft_overrides)
|
for (const auto & ot : params.tensor_buft_overrides)
|
||||||
for (const auto & mmp : params.use_mmap)
|
// for (const auto & mmp : params.use_mmap)
|
||||||
for (const auto & dio : params.use_direct_io)
|
// for (const auto & dio : params.use_direct_io)
|
||||||
for (const auto & noh : params.no_host)
|
for (const auto & noh : params.no_host)
|
||||||
for (const auto & embd : params.embeddings)
|
for (const auto & embd : params.embeddings)
|
||||||
for (const auto & nopo : params.no_op_offload)
|
for (const auto & nopo : params.no_op_offload)
|
||||||
|
|
@ -1257,14 +1309,15 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .n_cpu_moe = */ ncmoe,
|
/* .n_cpu_moe = */ ncmoe,
|
||||||
/* .split_mode = */ sm,
|
/* .split_mode = */ sm,
|
||||||
|
/* .load_mode = */ lm,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
/* .no_kv_offload= */ nkvo,
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .flash_attn = */ fa,
|
/* .flash_attn = */ fa,
|
||||||
/* .devices = */ devs,
|
/* .devices = */ devs,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
/* .tensor_buft_overrides = */ ot,
|
/* .tensor_buft_overrides = */ ot,
|
||||||
/* .use_mmap = */ mmp,
|
// /* .use_mmap = */ mmp,
|
||||||
/* .use_direct_io= */ dio,
|
// /* .use_direct_io= */ dio,
|
||||||
/* .embeddings = */ embd,
|
/* .embeddings = */ embd,
|
||||||
/* .no_op_offload= */ nopo,
|
/* .no_op_offload= */ nopo,
|
||||||
/* .no_host = */ noh,
|
/* .no_host = */ noh,
|
||||||
|
|
@ -1292,14 +1345,15 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .n_cpu_moe = */ ncmoe,
|
/* .n_cpu_moe = */ ncmoe,
|
||||||
/* .split_mode = */ sm,
|
/* .split_mode = */ sm,
|
||||||
|
/* .load_mode = */ lm,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
/* .no_kv_offload= */ nkvo,
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .flash_attn = */ fa,
|
/* .flash_attn = */ fa,
|
||||||
/* .devices = */ devs,
|
/* .devices = */ devs,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
/* .tensor_buft_overrides = */ ot,
|
/* .tensor_buft_overrides = */ ot,
|
||||||
/* .use_mmap = */ mmp,
|
// /* .use_mmap = */ mmp,
|
||||||
/* .use_direct_io= */ dio,
|
// /* .use_direct_io= */ dio,
|
||||||
/* .embeddings = */ embd,
|
/* .embeddings = */ embd,
|
||||||
/* .no_op_offload= */ nopo,
|
/* .no_op_offload= */ nopo,
|
||||||
/* .no_host = */ noh,
|
/* .no_host = */ noh,
|
||||||
|
|
@ -1327,14 +1381,15 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||||
/* .n_gpu_layers = */ nl,
|
/* .n_gpu_layers = */ nl,
|
||||||
/* .n_cpu_moe = */ ncmoe,
|
/* .n_cpu_moe = */ ncmoe,
|
||||||
/* .split_mode = */ sm,
|
/* .split_mode = */ sm,
|
||||||
|
/* .load_mode = */ lm,
|
||||||
/* .main_gpu = */ mg,
|
/* .main_gpu = */ mg,
|
||||||
/* .no_kv_offload= */ nkvo,
|
/* .no_kv_offload= */ nkvo,
|
||||||
/* .flash_attn = */ fa,
|
/* .flash_attn = */ fa,
|
||||||
/* .devices = */ devs,
|
/* .devices = */ devs,
|
||||||
/* .tensor_split = */ ts,
|
/* .tensor_split = */ ts,
|
||||||
/* .tensor_buft_overrides = */ ot,
|
/* .tensor_buft_overrides = */ ot,
|
||||||
/* .use_mmap = */ mmp,
|
// /* .use_mmap = */ mmp,
|
||||||
/* .use_direct_io= */ dio,
|
// /* .use_direct_io= */ dio,
|
||||||
/* .embeddings = */ embd,
|
/* .embeddings = */ embd,
|
||||||
/* .no_op_offload= */ nopo,
|
/* .no_op_offload= */ nopo,
|
||||||
/* .no_host = */ noh,
|
/* .no_host = */ noh,
|
||||||
|
|
@ -1367,14 +1422,15 @@ struct test {
|
||||||
int n_gpu_layers;
|
int n_gpu_layers;
|
||||||
int n_cpu_moe;
|
int n_cpu_moe;
|
||||||
llama_split_mode split_mode;
|
llama_split_mode split_mode;
|
||||||
|
llama_load_mode load_mode;
|
||||||
int main_gpu;
|
int main_gpu;
|
||||||
bool no_kv_offload;
|
bool no_kv_offload;
|
||||||
bool flash_attn;
|
bool flash_attn;
|
||||||
std::vector<ggml_backend_dev_t> devices;
|
std::vector<ggml_backend_dev_t> devices;
|
||||||
std::vector<float> tensor_split;
|
std::vector<float> tensor_split;
|
||||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||||
bool use_mmap;
|
// bool use_mmap;
|
||||||
bool use_direct_io;
|
// bool use_direct_io;
|
||||||
bool embeddings;
|
bool embeddings;
|
||||||
bool no_op_offload;
|
bool no_op_offload;
|
||||||
bool no_host;
|
bool no_host;
|
||||||
|
|
@ -1405,14 +1461,15 @@ struct test {
|
||||||
n_gpu_layers = inst.n_gpu_layers;
|
n_gpu_layers = inst.n_gpu_layers;
|
||||||
n_cpu_moe = inst.n_cpu_moe;
|
n_cpu_moe = inst.n_cpu_moe;
|
||||||
split_mode = inst.split_mode;
|
split_mode = inst.split_mode;
|
||||||
|
load_mode = inst.load_mode;
|
||||||
main_gpu = inst.main_gpu;
|
main_gpu = inst.main_gpu;
|
||||||
no_kv_offload = inst.no_kv_offload;
|
no_kv_offload = inst.no_kv_offload;
|
||||||
flash_attn = inst.flash_attn;
|
flash_attn = inst.flash_attn;
|
||||||
devices = inst.devices;
|
devices = inst.devices;
|
||||||
tensor_split = inst.tensor_split;
|
tensor_split = inst.tensor_split;
|
||||||
tensor_buft_overrides = inst.tensor_buft_overrides;
|
tensor_buft_overrides = inst.tensor_buft_overrides;
|
||||||
use_mmap = inst.use_mmap;
|
// use_mmap = inst.use_mmap;
|
||||||
use_direct_io = inst.use_direct_io;
|
// use_direct_io = inst.use_direct_io;
|
||||||
embeddings = inst.embeddings;
|
embeddings = inst.embeddings;
|
||||||
no_op_offload = inst.no_op_offload;
|
no_op_offload = inst.no_op_offload;
|
||||||
no_host = inst.no_host;
|
no_host = inst.no_host;
|
||||||
|
|
@ -1472,7 +1529,7 @@ struct test {
|
||||||
"n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
|
"n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
|
||||||
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
|
"type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
|
||||||
"main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
|
"main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
|
||||||
"tensor_buft_overrides", "use_mmap", "use_direct_io", "embeddings",
|
"tensor_buft_overrides", "load_mode", "embeddings",
|
||||||
"no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth",
|
"no_op_offload", "no_host", "n_prompt", "n_gen", "n_depth",
|
||||||
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
|
"test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts"
|
||||||
};
|
};
|
||||||
|
|
@ -1489,9 +1546,12 @@ struct test {
|
||||||
return INT;
|
return INT;
|
||||||
}
|
}
|
||||||
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
||||||
field == "use_mmap" || field == "use_direct_io" || field == "embeddings" || field == "no_host") {
|
field == "embeddings" || field == "no_host") {
|
||||||
return BOOL;
|
return BOOL;
|
||||||
}
|
}
|
||||||
|
if (field == "load_mode") {
|
||||||
|
return STRING;
|
||||||
|
}
|
||||||
if (field == "avg_ts" || field == "stddev_ts") {
|
if (field == "avg_ts" || field == "stddev_ts") {
|
||||||
return FLOAT;
|
return FLOAT;
|
||||||
}
|
}
|
||||||
|
|
@ -1561,8 +1621,9 @@ struct test {
|
||||||
devices_to_string(devices),
|
devices_to_string(devices),
|
||||||
tensor_split_str,
|
tensor_split_str,
|
||||||
tensor_buft_overrides_str,
|
tensor_buft_overrides_str,
|
||||||
std::to_string(use_mmap),
|
// std::to_string(use_mmap),
|
||||||
std::to_string(use_direct_io),
|
// std::to_string(use_direct_io),
|
||||||
|
load_mode_str(load_mode),
|
||||||
std::to_string(embeddings),
|
std::to_string(embeddings),
|
||||||
std::to_string(no_op_offload),
|
std::to_string(no_op_offload),
|
||||||
std::to_string(no_host),
|
std::to_string(no_host),
|
||||||
|
|
@ -1745,12 +1806,15 @@ struct markdown_printer : public printer {
|
||||||
if (field == "devices") {
|
if (field == "devices") {
|
||||||
return -12;
|
return -12;
|
||||||
}
|
}
|
||||||
if (field == "use_mmap") {
|
if (field == "load_mode") {
|
||||||
return 4;
|
return 5;
|
||||||
}
|
|
||||||
if (field == "use_direct_io") {
|
|
||||||
return 3;
|
|
||||||
}
|
}
|
||||||
|
// if (field == "use_mmap") {
|
||||||
|
// return 4;
|
||||||
|
// }
|
||||||
|
// if (field == "use_direct_io") {
|
||||||
|
// return 3;
|
||||||
|
// }
|
||||||
if (field == "test") {
|
if (field == "test") {
|
||||||
return 15;
|
return 15;
|
||||||
}
|
}
|
||||||
|
|
@ -1785,11 +1849,14 @@ struct markdown_printer : public printer {
|
||||||
if (field == "flash_attn") {
|
if (field == "flash_attn") {
|
||||||
return "fa";
|
return "fa";
|
||||||
}
|
}
|
||||||
if (field == "use_mmap") {
|
// if (field == "use_mmap") {
|
||||||
return "mmap";
|
// return "mmap";
|
||||||
}
|
// }
|
||||||
if (field == "use_direct_io") {
|
// if (field == "use_direct_io") {
|
||||||
return "dio";
|
// return "dio";
|
||||||
|
// }
|
||||||
|
if (field == "load_mode") {
|
||||||
|
return "lm";
|
||||||
}
|
}
|
||||||
if (field == "embeddings") {
|
if (field == "embeddings") {
|
||||||
return "embd";
|
return "embd";
|
||||||
|
|
@ -1872,12 +1939,15 @@ struct markdown_printer : public printer {
|
||||||
if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
|
if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
|
||||||
fields.emplace_back("tensor_buft_overrides");
|
fields.emplace_back("tensor_buft_overrides");
|
||||||
}
|
}
|
||||||
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
if (params.load_mode.size() > 1 || params.load_mode != cmd_params_defaults.load_mode) {
|
||||||
fields.emplace_back("use_mmap");
|
fields.emplace_back("load_mode");
|
||||||
}
|
|
||||||
if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
|
|
||||||
fields.emplace_back("use_direct_io");
|
|
||||||
}
|
}
|
||||||
|
// if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
||||||
|
// fields.emplace_back("use_mmap");
|
||||||
|
// }
|
||||||
|
// if (params.use_direct_io.size() > 1 || params.use_direct_io != cmd_params_defaults.use_direct_io) {
|
||||||
|
// fields.emplace_back("use_direct_io");
|
||||||
|
// }
|
||||||
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
|
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
|
||||||
fields.emplace_back("embeddings");
|
fields.emplace_back("embeddings");
|
||||||
}
|
}
|
||||||
|
|
@ -2102,11 +2172,11 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
|
fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
cmd_params params = parse_cmd_params(argc, argv);
|
||||||
|
|
||||||
// initialize backends
|
// initialize backends
|
||||||
ggml_backend_load_all();
|
ggml_backend_load_all();
|
||||||
|
|
||||||
cmd_params params = parse_cmd_params(argc, argv);
|
|
||||||
|
|
||||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||||
if (!cpu_dev) {
|
if (!cpu_dev) {
|
||||||
fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
|
fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue