From c2df1ac64ab643cf0a659f6b3180c09292a6be6f Mon Sep 17 00:00:00 2001
From: Ruben Ortlam
Date: Sun, 29 Mar 2026 12:18:51 +0200
Subject: [PATCH] estimate with to-be-loaded model size included

---
 include/llama.h                |  6 +++++
 src/llama-model.cpp            | 29 +++++++++++++++++++++++
 tools/server/server-models.cpp | 43 ++++++++++++++++++++++++----------
 tools/server/server-models.h   |  4 ++--
 4 files changed, 67 insertions(+), 15 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index a940f9d648..69d9ff80c1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -614,6 +614,12 @@
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
 
+    // Returns the total size of all the tensors in the model in bytes from a model path
+    // without fully loading the model. Uses llama_model_loader with no_alloc=true.
+    // Returns 0 if the model cannot be loaded or the path is invalid.
+    // This function can be used to estimate memory requirements before loading a model.
+    LLAMA_API uint64_t llama_model_size_from_path(const char * path);
+
     // Get the default chat template. Returns nullptr if not available
     // If name is NULL, returns the default chat template
     LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ba935340fc..6df9440dc1 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -9253,6 +9253,35 @@ uint64_t llama_model_size(const llama_model * model) {
     return model->size();
 }
 
+uint64_t llama_model_size_from_path(const char * path) {
+    if (!path) {
+        return 0;
+    }
+
+    try {
+        std::vector<std::string> splits;
+
+        llama_model_loader loader(
+            /* metadata */ nullptr,
+            /* set_tensor_data */ nullptr,
+            /* set_tensor_data_ud */ nullptr,
+            /* fname */ path,
+            /* splits */ splits,
+            /* file */ nullptr,
+            /* use_mmap */ false,
+            /* use_direct_io */ false,
+            /* check_tensors */ false,
+            /* no_alloc */ true,
+            /* param_overrides_p */ nullptr,
+            /* param_tensor_buft_overrides_p */ nullptr
+        );
+
+        return loader.n_bytes;
+    } catch (...) {
+        return 0;
+    }
+}
+
 const char * llama_model_chat_template(const llama_model * model, const char * name) {
     const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index f86e267919..be10a88d84 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -494,11 +494,10 @@ std::vector<server_model_meta> server_models::get_all_meta() {
     return result;
 }
 
-void server_models::unload_lru() {
+void server_models::unload_lru(uint64_t new_model_memory_mb) {
     if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) {
         return; // no limit
     }
-    // Keep unloading LRU models until limits are satisfied
     while (true) {
         std::string lru_model_name = "";
         int64_t lru_last_used = ggml_time_ms();
@@ -517,12 +516,14 @@
                 }
            }
        }
-        // Check if limits exceeded
-        bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max;
-        bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max;
+        bool count_exceeded = base_params.models_max > 0 &&
+            (count_active + 1) >= (size_t)base_params.models_max;
+        uint64_t projected_memory = total_memory_mb + new_model_memory_mb;
+        bool memory_exceeded = base_params.models_memory_max > 0 &&
+            projected_memory >= (uint64_t)base_params.models_memory_max;
         if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) {
-            SRV_INF("limits reached (count=%zu, memory=%lu MB), removing LRU name=%s\n",
-                count_active, (unsigned long)total_memory_mb, lru_model_name.c_str());
+            SRV_INF("limits reached (count=%zu, memory=%lu MB + %lu MB new), removing LRU name=%s\n",
+                count_active, (unsigned long)total_memory_mb, (unsigned long)new_model_memory_mb, lru_model_name.c_str());
             unload(lru_model_name);
             // wait for unload to complete
             {
@@ -531,9 +532,8 @@
                     return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
                 });
             }
-            // Loop continues to check if more unloading is needed
         } else {
-            break; // limits satisfied
+            break;
         }
     }
 }
@@ -542,7 +542,26 @@
 void server_models::load(const std::string & name) {
     if (!has_model(name)) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
-    unload_lru();
+
+    uint64_t new_model_memory_mb = 0;
+    if (base_params.models_memory_max > 0) {
+        std::string model_path;
+        {
+            std::lock_guard<std::mutex> lk(mutex);
+            auto & meta = mapping[name].meta;
+            if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) {
+                uint64_t size_bytes = llama_model_size_from_path(model_path.c_str());
+                new_model_memory_mb = size_bytes / (1024 * 1024);
+                meta.memory_mb = new_model_memory_mb;
+                if (new_model_memory_mb > 0) {
+                    SRV_INF("model %s estimated size: %lu MB\n", name.c_str(),
+                        (unsigned long)new_model_memory_mb);
+                }
+            }
+        }
+    }
+
+    unload_lru(new_model_memory_mb);
 
     std::lock_guard<std::mutex> lk(mutex);
@@ -629,7 +648,6 @@
                 LOG("[%5d] %s", port, buffer);
                 std::string str(buffer);
                 if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) {
-                    // Query memory usage from the child's /props endpoint
                     if (!ready_received) {
                         ready_received = true;
                         try {
@@ -640,8 +658,7 @@
                             json props = json::parse(res->body);
                             if (props.contains("memory_mb")) {
                                 uint64_t memory_mb = props["memory_mb"].get<uint64_t>();
-                                SRV_INF("model %s loaded, memory usage: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
-                                // Update memory_mb in meta
+                                SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb);
                                 std::lock_guard<std::mutex> lk(this->mutex);
                                 if (mapping.find(name) != mapping.end()) {
                                     mapping[name].meta.memory_mb = memory_mb;
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index c195dbeb26..29c1c7c6f8 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -62,7 +62,7 @@ struct server_model_meta {
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0; // for LRU unloading
-    uint64_t memory_mb = 0; // estimated memory usage in MB
+    uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load)
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
    int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
@@ -111,7 +111,7 @@
     void update_meta(const std::string & name, const server_model_meta & meta);
 
     // unload least recently used models if the limit is reached
-    void unload_lru();
+    void unload_lru(uint64_t new_model_memory_mb = 0);
 
     // not thread-safe, caller must hold mutex
     void add_model(server_model_meta && meta);