use no_alloc to get memory requirements for model load
This commit is contained in:
parent
c2df1ac64a
commit
24f461b66d
|
|
@ -614,12 +614,6 @@ extern "C" {
|
|||
// Returns the total size of all the tensors in the model in bytes
|
||||
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
||||
|
||||
// Returns the total size of all the tensors in the model in bytes from a model path
|
||||
// without fully loading the model. Uses llama_model_loader with no_alloc=true.
|
||||
// Returns 0 if the model cannot be loaded or the path is invalid.
|
||||
// This function can be used to estimate memory requirements before loading a model.
|
||||
LLAMA_API uint64_t llama_model_size_from_path(const char * path);
|
||||
|
||||
// Get the default chat template. Returns nullptr if not available
|
||||
// If name is NULL, returns the default chat template
|
||||
LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
|
||||
|
|
|
|||
|
|
@ -9253,35 +9253,6 @@ uint64_t llama_model_size(const llama_model * model) {
|
|||
return model->size();
|
||||
}
|
||||
|
||||
uint64_t llama_model_size_from_path(const char * path) {
|
||||
if (!path) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
try {
|
||||
std::vector<std::string> splits;
|
||||
|
||||
llama_model_loader loader(
|
||||
/* metadata */ nullptr,
|
||||
/* set_tensor_data */ nullptr,
|
||||
/* set_tensor_data_ud */ nullptr,
|
||||
/* fname */ path,
|
||||
/* splits */ splits,
|
||||
/* file */ nullptr,
|
||||
/* use_mmap */ false,
|
||||
/* use_direct_io */ false,
|
||||
/* check_tensors */ false,
|
||||
/* no_alloc */ true,
|
||||
/* param_overrides_p */ nullptr,
|
||||
/* param_tensor_buft_overrides_p */ nullptr
|
||||
);
|
||||
|
||||
return loader.n_bytes;
|
||||
} catch (...) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
const char * llama_model_chat_template(const llama_model * model, const char * name) {
|
||||
const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
|
||||
: LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
|
||||
|
|
|
|||
|
|
@ -3495,7 +3495,6 @@ void server_routes::init_routes() {
|
|||
{ "total_slots", params.n_parallel },
|
||||
{ "model_alias", meta->model_name },
|
||||
{ "model_path", meta->model_path },
|
||||
{ "memory_mb", meta->model_size / (1024 * 1024) },
|
||||
{ "modalities", json {
|
||||
{"vision", meta->has_inp_image},
|
||||
{"audio", meta->has_inp_audio},
|
||||
|
|
|
|||
|
|
@ -538,6 +538,49 @@ void server_models::unload_lru(uint64_t new_model_memory_mb) {
|
|||
}
|
||||
}
|
||||
|
||||
static uint64_t get_model_memory_mb(const common_preset& preset) {
|
||||
common_params params;
|
||||
preset.apply_to_params(params);
|
||||
|
||||
if(params.model.path.empty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct log_ud_t {
|
||||
struct {
|
||||
ggml_log_callback callback;
|
||||
void * user_data;
|
||||
} original;
|
||||
ggml_log_level min_level;
|
||||
} log_ud;
|
||||
llama_log_get(&log_ud.original.callback, &log_ud.original.user_data);
|
||||
log_ud.min_level = GGML_LOG_LEVEL_WARN;
|
||||
|
||||
llama_log_set([](ggml_log_level level, const char * text, void * ud) {
|
||||
log_ud_t * d = (log_ud_t *) ud;
|
||||
const ggml_log_level eff = level >= d->min_level ? level : GGML_LOG_LEVEL_DEBUG;
|
||||
d->original.callback(eff, text, d->original.user_data);
|
||||
}, &log_ud);
|
||||
|
||||
llama_model_params mparams = common_model_params_to_llama(params);
|
||||
mparams.no_alloc = true;
|
||||
mparams.use_mmap = false;
|
||||
mparams.use_mlock = false;
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
||||
|
||||
llama_log_set(log_ud.original.callback, log_ud.original.user_data);
|
||||
|
||||
if (!model) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t size_bytes = llama_model_size(model);
|
||||
llama_model_free(model);
|
||||
|
||||
return size_bytes / (1024 * 1024);
|
||||
}
|
||||
|
||||
void server_models::load(const std::string & name) {
|
||||
if (!has_model(name)) {
|
||||
throw std::runtime_error("model name=" + name + " is not found");
|
||||
|
|
@ -545,19 +588,13 @@ void server_models::load(const std::string & name) {
|
|||
|
||||
uint64_t new_model_memory_mb = 0;
|
||||
if (base_params.models_memory_max > 0) {
|
||||
std::string model_path;
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(mutex);
|
||||
auto & meta = mapping[name].meta;
|
||||
if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) {
|
||||
uint64_t size_bytes = llama_model_size_from_path(model_path.c_str());
|
||||
new_model_memory_mb = size_bytes / (1024 * 1024);
|
||||
meta.memory_mb = new_model_memory_mb;
|
||||
if (new_model_memory_mb > 0) {
|
||||
SRV_INF("model %s estimated size: %lu MB\n", name.c_str(),
|
||||
(unsigned long)new_model_memory_mb);
|
||||
}
|
||||
}
|
||||
std::lock_guard<std::mutex> lk(mutex);
|
||||
auto & meta = mapping[name].meta;
|
||||
new_model_memory_mb = get_model_memory_mb(meta.preset);
|
||||
meta.memory_mb = new_model_memory_mb;
|
||||
if (new_model_memory_mb > 0) {
|
||||
SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(),
|
||||
(unsigned long)new_model_memory_mb);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -643,33 +680,10 @@ void server_models::load(const std::string & name) {
|
|||
// also handle status report from child process
|
||||
if (stdout_file) {
|
||||
char buffer[4096];
|
||||
bool ready_received = false;
|
||||
while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) {
|
||||
LOG("[%5d] %s", port, buffer);
|
||||
std::string str(buffer);
|
||||
if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
|
||||
if (!ready_received) {
|
||||
ready_received = true;
|
||||
try {
|
||||
httplib::Client cli("http://CHILD_ADDR");
|
||||
cli.set_connection_timeout(5, 0);
|
||||
if (auto res = cli.Get("/props")) {
|
||||
if (res->status == 200) {
|
||||
json props = json::parse(res->body);
|
||||
if (props.contains("memory_mb")) {
|
||||
uint64_t memory_mb = props["memory_mb"].get<uint64_t>();
|
||||
SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
|
||||
std::lock_guard<std::mutex> lk(this->mutex);
|
||||
if (mapping.find(name) != mapping.end()) {
|
||||
mapping[name].meta.memory_mb = memory_mb;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
SRV_WRN("failed to query memory for model %s: %s\n", name.c_str(), e.what());
|
||||
}
|
||||
}
|
||||
this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0);
|
||||
} else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) {
|
||||
this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0);
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ struct server_model_meta {
|
|||
int port = 0;
|
||||
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
||||
int64_t last_used = 0; // for LRU unloading
|
||||
uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load)
|
||||
uint64_t memory_mb = 0; // size in MB
|
||||
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
|
||||
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
|
||||
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
|
||||
|
|
|
|||
Loading…
Reference in New Issue