From 24f461b66da411d0e6210f79dbeae61732c3fef2 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 31 Mar 2026 16:18:03 +0200 Subject: [PATCH] use no_alloc to get memory requirements for model load --- include/llama.h | 6 --- src/llama-model.cpp | 29 ----------- tools/server/server-context.cpp | 1 - tools/server/server-models.cpp | 86 +++++++++++++++++++-------------- tools/server/server-models.h | 2 +- 5 files changed, 51 insertions(+), 73 deletions(-) diff --git a/include/llama.h b/include/llama.h index 69d9ff80c1..a940f9d648 100644 --- a/include/llama.h +++ b/include/llama.h @@ -614,12 +614,6 @@ extern "C" { // Returns the total size of all the tensors in the model in bytes LLAMA_API uint64_t llama_model_size(const struct llama_model * model); - // Returns the total size of all the tensors in the model in bytes from a model path - // without fully loading the model. Uses llama_model_loader with no_alloc=true. - // Returns 0 if the model cannot be loaded or the path is invalid. - // This function can be used to estimate memory requirements before loading a model. - LLAMA_API uint64_t llama_model_size_from_path(const char * path); - // Get the default chat template. 
Returns nullptr if not available // If name is NULL, returns the default chat template LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6df9440dc1..ba935340fc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9253,35 +9253,6 @@ uint64_t llama_model_size(const llama_model * model) { return model->size(); } -uint64_t llama_model_size_from_path(const char * path) { - if (!path) { - return 0; - } - - try { - std::vector<std::string> splits; - - llama_model_loader loader( - /* metadata */ nullptr, - /* set_tensor_data */ nullptr, - /* set_tensor_data_ud */ nullptr, - /* fname */ path, - /* splits */ splits, - /* file */ nullptr, - /* use_mmap */ false, - /* use_direct_io */ false, - /* check_tensors */ false, - /* no_alloc */ true, - /* param_overrides_p */ nullptr, - /* param_tensor_buft_overrides_p */ nullptr - ); - - return loader.n_bytes; - } catch (...) { - return 0; - } -} - const char * llama_model_chat_template(const llama_model * model, const char * name) { const auto key = name ? 
LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE) : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index bfa032a814..6f737d94d0 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3495,7 +3495,6 @@ void server_routes::init_routes() { { "total_slots", params.n_parallel }, { "model_alias", meta->model_name }, { "model_path", meta->model_path }, - { "memory_mb", meta->model_size / (1024 * 1024) }, { "modalities", json { {"vision", meta->has_inp_image}, {"audio", meta->has_inp_audio}, diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index be10a88d84..317b091305 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -538,6 +538,49 @@ void server_models::unload_lru(uint64_t new_model_memory_mb) { } } +static uint64_t get_model_memory_mb(const common_preset& preset) { + common_params params; + preset.apply_to_params(params); + + if(params.model.path.empty()) { + return 0; + } + + struct log_ud_t { + struct { + ggml_log_callback callback; + void * user_data; + } original; + ggml_log_level min_level; + } log_ud; + llama_log_get(&log_ud.original.callback, &log_ud.original.user_data); + log_ud.min_level = GGML_LOG_LEVEL_WARN; + + llama_log_set([](ggml_log_level level, const char * text, void * ud) { + log_ud_t * d = (log_ud_t *) ud; + const ggml_log_level eff = level >= d->min_level ? 
level : GGML_LOG_LEVEL_DEBUG; + d->original.callback(eff, text, d->original.user_data); + }, &log_ud); + + llama_model_params mparams = common_model_params_to_llama(params); + mparams.no_alloc = true; + mparams.use_mmap = false; + mparams.use_mlock = false; + + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); + + llama_log_set(log_ud.original.callback, log_ud.original.user_data); + + if (!model) { + return 0; + } + + uint64_t size_bytes = llama_model_size(model); + llama_model_free(model); + + return size_bytes / (1024 * 1024); +} + void server_models::load(const std::string & name) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); @@ -545,19 +588,13 @@ void server_models::load(const std::string & name) { uint64_t new_model_memory_mb = 0; if (base_params.models_memory_max > 0) { - std::string model_path; - { - std::lock_guard lk(mutex); - auto & meta = mapping[name].meta; - if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) { - uint64_t size_bytes = llama_model_size_from_path(model_path.c_str()); - new_model_memory_mb = size_bytes / (1024 * 1024); - meta.memory_mb = new_model_memory_mb; - if (new_model_memory_mb > 0) { - SRV_INF("model %s estimated size: %lu MB\n", name.c_str(), - (unsigned long)new_model_memory_mb); - } - } + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + new_model_memory_mb = get_model_memory_mb(meta.preset); + meta.memory_mb = new_model_memory_mb; + if (new_model_memory_mb > 0) { + SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(), + (unsigned long)new_model_memory_mb); } } @@ -643,33 +680,10 @@ void server_models::load(const std::string & name) { // also handle status report from child process if (stdout_file) { char buffer[4096]; - bool ready_received = false; while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) { LOG("[%5d] %s", port, buffer); std::string str(buffer); if 
(string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { - if (!ready_received) { - ready_received = true; - try { - httplib::Client cli("http://CHILD_ADDR"); - cli.set_connection_timeout(5, 0); - if (auto res = cli.Get("/props")) { - if (res->status == 200) { - json props = json::parse(res->body); - if (props.contains("memory_mb")) { - uint64_t memory_mb = props["memory_mb"].get<uint64_t>(); - SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb); - std::lock_guard lk(this->mutex); - if (mapping.find(name) != mapping.end()) { - mapping[name].meta.memory_mb = memory_mb; - } - } - } - } - } catch (const std::exception & e) { - SRV_WRN("failed to query memory for model %s: %s\n", name.c_str(), e.what()); - } - } this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0); } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) { this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 29c1c7c6f8..2cbdb35b32 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -62,7 +62,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load) + uint64_t memory_mb = 0; // size in MB std::vector<std::string> args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown