From c2df1ac64ab643cf0a659f6b3180c09292a6be6f Mon Sep 17 00:00:00 2001
From: Ruben Ortlam
Date: Sun, 29 Mar 2026 12:18:51 +0200
Subject: [PATCH] estimate with to-be-loaded model size included

---
 include/llama.h                |  6 +++++
 src/llama-model.cpp            | 29 +++++++++++++++++++++++
 tools/server/server-models.cpp | 43 ++++++++++++++++++++++++----------
 tools/server/server-models.h   |  4 ++--
 4 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index a940f9d648..69d9ff80c1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -614,6 +614,12 @@
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
 
+    // Returns the total size of all the tensors in the model in bytes from a model path
+    // without fully loading the model. Uses llama_model_loader with no_alloc=true.
+    // Returns 0 if the model cannot be loaded or the path is invalid.
+    // This function can be used to estimate memory requirements before loading a model.
+    LLAMA_API uint64_t llama_model_size_from_path(const char * path);
+
     // Get the default chat template. Returns nullptr if not available
     // If name is NULL, returns the default chat template
     LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ba935340fc..6df9440dc1 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -9253,6 +9253,35 @@ uint64_t llama_model_size(const llama_model * model) {
     return model->size();
 }
 
+uint64_t llama_model_size_from_path(const char * path) {
+    if (!path) {
+        return 0;
+    }
+
+    try {
+        std::vector<std::string> splits;
+
+        llama_model_loader loader(
+            /* metadata */ nullptr,
+            /* set_tensor_data */ nullptr,
+            /* set_tensor_data_ud */ nullptr,
+            /* fname */ path,
+            /* splits */ splits,
+            /* file */ nullptr,
+            /* use_mmap */ false,
+            /* use_direct_io */ false,
+            /* check_tensors */ false,
+            /* no_alloc */ true,
+            /* param_overrides_p */ nullptr,
+            /* param_tensor_buft_overrides_p */ nullptr
+        );
+
+        return loader.n_bytes;
+    } catch (...) {
+        return 0;
+    }
+}
+
 const char * llama_model_chat_template(const llama_model * model, const char * name) {
     const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index f86e267919..be10a88d84 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -494,11 +494,10 @@ std::vector<server_model_meta> server_models::get_all_meta() {
     return result;
 }
 
-void server_models::unload_lru() {
+void server_models::unload_lru(uint64_t new_model_memory_mb) {
     if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) {
         return; // no limit
     }
-    // Keep unloading LRU models until limits are satisfied
     while (true) {
         std::string lru_model_name = "";
         int64_t lru_last_used = ggml_time_ms();
@@ -517,12 +516,14 @@
                 }
            }
        }
-        // Check if limits exceeded
-        bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max;
-        bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max;
+        bool count_exceeded = base_params.models_max > 0 &&
+            (count_active + 1) >= (size_t)base_params.models_max;
+        uint64_t projected_memory = total_memory_mb + new_model_memory_mb;
+        bool memory_exceeded = base_params.models_memory_max > 0 &&
+            projected_memory >= (uint64_t)base_params.models_memory_max;
         if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) {
-            SRV_INF("limits reached (count=%zu, memory=%lu MB), removing LRU name=%s\n",
-                count_active, (unsigned long)total_memory_mb, lru_model_name.c_str());
+            SRV_INF("limits reached (count=%zu, memory=%lu MB + %lu MB new), removing LRU name=%s\n",
+                count_active, (unsigned long)total_memory_mb, (unsigned long)new_model_memory_mb, lru_model_name.c_str());
             unload(lru_model_name);
             // wait for unload to complete
             {
@@ -531,9 +532,8 @@
                     return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
                 });
             }
-            // Loop continues to check if more unloading is needed
         } else {
-            break; // limits satisfied
+            break;
         }
     }
 }
@@ -542,7 +542,26 @@
 void server_models::load(const std::string & name) {
     if (!has_model(name)) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
-    unload_lru();
+
+    uint64_t new_model_memory_mb = 0;
+    if (base_params.models_memory_max > 0) {
+        std::string model_path;
+        {
+            std::lock_guard<std::mutex> lk(mutex);
+            auto & meta = mapping[name].meta;
+            if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) {
+                uint64_t size_bytes = llama_model_size_from_path(model_path.c_str());
+                new_model_memory_mb = size_bytes / (1024 * 1024);
+                meta.memory_mb = new_model_memory_mb;
+                if (new_model_memory_mb > 0) {
+                    SRV_INF("model %s estimated size: %lu MB\n", name.c_str(),
+                        (unsigned long)new_model_memory_mb);
+                }
+            }
+        }
+    }
+
+    unload_lru(new_model_memory_mb);
 
     std::lock_guard<std::mutex> lk(mutex);
@@ -629,7 +648,6 @@
                 LOG("[%5d] %s", port, buffer);
                 std::string str(buffer);
                 if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
-                    // Query memory usage from the child's /props endpoint
                     if (!ready_received) {
                         ready_received = true;
                         try {
@@ -640,8 +658,7 @@
                             json props = json::parse(res->body);
                             if (props.contains("memory_mb")) {
                                 uint64_t memory_mb = props["memory_mb"].get<uint64_t>();
-                                SRV_INF("model %s loaded, memory usage: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
-                                // Update memory_mb in meta
+                                SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
                                 std::lock_guard<std::mutex> lk(this->mutex);
                                 if (mapping.find(name) != mapping.end()) {
                                     mapping[name].meta.memory_mb = memory_mb;
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index c195dbeb26..29c1c7c6f8 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -62,7 +62,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
-    uint64_t memory_mb = 0; // estimated memory usage in MB
+    uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load)
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
    int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
@@ -111,7 +111,7 @@
     void update_meta(const std::string & name, const server_model_meta & meta);
 
     // unload least recently used models if the limit is reached
-    void unload_lru();
+    void unload_lru(uint64_t new_model_memory_mb = 0);
 
     // not thread-safe, caller must hold mutex
     void add_model(server_model_meta && meta);