From 24f461b66da411d0e6210f79dbeae61732c3fef2 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 31 Mar 2026 16:18:03 +0200 Subject: [PATCH] use no_alloc to get memory requirements for model load --- include/llama.h | 6 --- src/llama-model.cpp | 29 ----------- tools/server/server-context.cpp | 1 - tools/server/server-models.cpp | 86 +++++++++++++++++++-------------- tools/server/server-models.h | 2 +- 5 files changed, 51 insertions(+), 73 deletions(-) diff --git a/include/llama.h b/include/llama.h index 69d9ff80c1..a940f9d648 100644 --- a/include/llama.h +++ b/include/llama.h @@ -614,12 +614,6 @@ extern "C" { // Returns the total size of all the tensors in the model in bytes LLAMA_API uint64_t llama_model_size(const struct llama_model * model); - // Returns the total size of all the tensors in the model in bytes from a model path - // without fully loading the model. Uses llama_model_loader with no_alloc=true. - // Returns 0 if the model cannot be loaded or the path is invalid. - // This function can be used to estimate memory requirements before loading a model. - LLAMA_API uint64_t llama_model_size_from_path(const char * path); - // Get the default chat template. 
Returns nullptr if not available // If name is NULL, returns the default chat template LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6df9440dc1..ba935340fc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9253,35 +9253,6 @@ uint64_t llama_model_size(const llama_model * model) { return model->size(); } -uint64_t llama_model_size_from_path(const char * path) { - if (!path) { - return 0; - } - - try { - std::vector<std::string> splits; - - llama_model_loader loader( - /* metadata */ nullptr, - /* set_tensor_data */ nullptr, - /* set_tensor_data_ud */ nullptr, - /* fname */ path, - /* splits */ splits, - /* file */ nullptr, - /* use_mmap */ false, - /* use_direct_io */ false, - /* check_tensors */ false, - /* no_alloc */ true, - /* param_overrides_p */ nullptr, - /* param_tensor_buft_overrides_p */ nullptr - ); - - return loader.n_bytes; - } catch (...) { - return 0; - } -} - const char * llama_model_chat_template(const llama_model * model, const char * name) { const auto key = name ? 
LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE) : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index bfa032a814..6f737d94d0 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3495,7 +3495,6 @@ void server_routes::init_routes() { { "total_slots", params.n_parallel }, { "model_alias", meta->model_name }, { "model_path", meta->model_path }, - { "memory_mb", meta->model_size / (1024 * 1024) }, { "modalities", json { {"vision", meta->has_inp_image}, {"audio", meta->has_inp_audio}, diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index be10a88d84..317b091305 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -538,6 +538,49 @@ void server_models::unload_lru(uint64_t new_model_memory_mb) { } } +static uint64_t get_model_memory_mb(const common_preset& preset) { + common_params params; + preset.apply_to_params(params); + + if(params.model.path.empty()) { + return 0; + } + + struct log_ud_t { + struct { + ggml_log_callback callback; + void * user_data; + } original; + ggml_log_level min_level; + } log_ud; + llama_log_get(&log_ud.original.callback, &log_ud.original.user_data); + log_ud.min_level = GGML_LOG_LEVEL_WARN; + + llama_log_set([](ggml_log_level level, const char * text, void * ud) { + log_ud_t * d = (log_ud_t *) ud; + const ggml_log_level eff = level >= d->min_level ? 
level : GGML_LOG_LEVEL_DEBUG; + d->original.callback(eff, text, d->original.user_data); + }, &log_ud); + + llama_model_params mparams = common_model_params_to_llama(params); + mparams.no_alloc = true; + mparams.use_mmap = false; + mparams.use_mlock = false; + + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); + + llama_log_set(log_ud.original.callback, log_ud.original.user_data); + + if (!model) { + return 0; + } + + uint64_t size_bytes = llama_model_size(model); + llama_model_free(model); + + return size_bytes / (1024 * 1024); +} + void server_models::load(const std::string & name) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); @@ -545,19 +588,13 @@ void server_models::load(const std::string & name) { uint64_t new_model_memory_mb = 0; if (base_params.models_memory_max > 0) { - std::string model_path; - { - std::lock_guard lk(mutex); - auto & meta = mapping[name].meta; - if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) { - uint64_t size_bytes = llama_model_size_from_path(model_path.c_str()); - new_model_memory_mb = size_bytes / (1024 * 1024); - meta.memory_mb = new_model_memory_mb; - if (new_model_memory_mb > 0) { - SRV_INF("model %s estimated size: %lu MB\n", name.c_str(), - (unsigned long)new_model_memory_mb); - } - } + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + new_model_memory_mb = get_model_memory_mb(meta.preset); + meta.memory_mb = new_model_memory_mb; + if (new_model_memory_mb > 0) { + SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(), + (unsigned long)new_model_memory_mb); } } @@ -643,33 +680,10 @@ void server_models::load(const std::string & name) { // also handle status report from child process if (stdout_file) { char buffer[4096]; - bool ready_received = false; while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) { LOG("[%5d] %s", port, buffer); std::string str(buffer); if 
(string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { - if (!ready_received) { - ready_received = true; - try { - httplib::Client cli("http://CHILD_ADDR"); - cli.set_connection_timeout(5, 0); - if (auto res = cli.Get("/props")) { - if (res->status == 200) { - json props = json::parse(res->body); - if (props.contains("memory_mb")) { - uint64_t memory_mb = props["memory_mb"].get<uint64_t>(); - SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb); - std::lock_guard lk(this->mutex); - if (mapping.find(name) != mapping.end()) { - mapping[name].meta.memory_mb = memory_mb; - } - } - } - } - } catch (const std::exception & e) { - SRV_WRN("failed to query memory for model %s: %s\n", name.c_str(), e.what()); - } - } this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0); } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) { this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 29c1c7c6f8..2cbdb35b32 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -62,7 +62,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load) + uint64_t memory_mb = 0; // size in MB std::vector<std::string> args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown