From 8482ffc3871ab2522362f81f93e5494dfd8c5c7d Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Sun, 29 Mar 2026 10:00:49 +0200 Subject: [PATCH 1/8] server: add --models-memory-max parameter to allow dynamically unloading models when they exceed a memory size threshold --- common/arg.cpp | 7 +++ common/common.h | 1 + tools/server/server-context.cpp | 1 + tools/server/server-models.cpp | 91 ++++++++++++++++++++++++--------- tools/server/server-models.h | 1 + 5 files changed, 76 insertions(+), 25 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 649216b7f0..cac8819956 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3044,6 +3044,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.models_max = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); + add_opt(common_arg( + {"--models-memory-max"}, "N", + string_format("for router server, maximum memory usage in MB (default: %d, 0 = unlimited)", params.models_memory_max), + [](common_params & params, int value) { + params.models_memory_max = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MAX")); add_opt(common_arg( {"--models-autoload"}, {"--no-models-autoload"}, diff --git a/common/common.h b/common/common.h index 31a337daa6..573a9bf4ef 100644 --- a/common/common.h +++ b/common/common.h @@ -621,6 +621,7 @@ struct common_params { std::string models_dir = ""; // directory containing models for the router server std::string models_preset = ""; // directory containing model presets for the router server int models_max = 4; // maximum number of models to load simultaneously + int models_memory_max = 0; // maximum memory usage in MB (0 = unlimited, estimated from model files) bool models_autoload = true; // automatically load models when requested via the router server bool log_json = false; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 6f737d94d0..bfa032a814 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3495,6 +3495,7 @@ void server_routes::init_routes() { { "total_slots", params.n_parallel }, { "model_alias", meta->model_name }, { "model_path", meta->model_path }, + { "memory_mb", meta->model_size / (1024 * 1024) }, { "modalities", json { {"vision", meta->has_inp_image}, {"audio", meta->has_inp_audio}, diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index c83709272f..f86e267919 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -300,6 +300,7 @@ void server_models::load_models() { /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, + /* memory_mb */ 0, /* args */ std::vector(), /* exit_code */ 0, /* stop_timeout */ DEFAULT_STOP_TIMEOUT, @@ -494,34 +495,45 @@ std::vector server_models::get_all_meta() { } void server_models::unload_lru() { - if (base_params.models_max <= 0) { + if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) { return; // no limit } - // remove one of the servers if we passed the models_max (least recently used - LRU) - std::string lru_model_name = ""; - int64_t lru_last_used = ggml_time_ms(); - size_t count_active = 0; - { - std::unique_lock lk(mutex); - for (const auto & m : mapping) { - if (m.second.meta.is_running()) { - count_active++; - if (m.second.meta.last_used < lru_last_used) { - lru_model_name = m.first; - lru_last_used = m.second.meta.last_used; + // Keep unloading LRU models until limits are satisfied + while (true) { + std::string lru_model_name = ""; + int64_t lru_last_used = ggml_time_ms(); + size_t count_active = 0; + uint64_t total_memory_mb = 0; + { + std::unique_lock lk(mutex); + for (const auto & m : mapping) { + if (m.second.meta.is_running()) { + count_active++; + total_memory_mb += m.second.meta.memory_mb; + if (m.second.meta.last_used < lru_last_used) { + lru_model_name = m.first; + lru_last_used = m.second.meta.last_used; + } } } } - } - if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) { - SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str()); - unload(lru_model_name); - // wait for unload to complete - { - std::unique_lock lk(mutex); - cv.wait(lk, [this, &lru_model_name]() { - return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED; - }); + // Check if limits exceeded + bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max; + bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max; + if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) { + SRV_INF("limits reached (count=%zu, memory=%lu MB), removing LRU name=%s\n", + count_active, (unsigned long)total_memory_mb, lru_model_name.c_str()); + unload(lru_model_name); + // wait for unload to complete + { + std::unique_lock lk(mutex); + cv.wait(lk, [this, &lru_model_name]() { + return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED; + }); + } + // Loop continues to check if more unloading is needed + } else { + break; // limits satisfied } } } @@ -544,14 +556,18 @@ void server_models::load(const std::string & name) { // exceeding models_max. Without this, the window between unload_lru() // releasing its lock and this lock_guard acquiring allows multiple // threads to each observe capacity and all proceed to load. - if (base_params.models_max > 0) { + if (base_params.models_max > 0 || base_params.models_memory_max > 0) { size_t count_active = 0; + uint64_t total_memory_mb = 0; for (const auto & m : mapping) { if (m.second.meta.is_running()) { count_active++; + total_memory_mb += m.second.meta.memory_mb; } } - if (count_active >= (size_t)base_params.models_max) { + bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max; + bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max; + if (count_exceeded || memory_exceeded) { throw std::runtime_error("model limit reached, try again later"); } } @@ -608,10 +624,35 @@ void server_models::load(const std::string & name) { // also handle status report from child process if (stdout_file) { char buffer[4096]; + bool ready_received = false; while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) { LOG("[%5d] %s", port, buffer); std::string str(buffer); if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { + // Query memory usage from the child's /props endpoint + if (!ready_received) { + ready_received = true; + try { + httplib::Client cli("http://CHILD_ADDR"); + cli.set_connection_timeout(5, 0); + if (auto res = cli.Get("/props")) { + if (res->status == 200) { + json props = json::parse(res->body); + if (props.contains("memory_mb")) { + uint64_t memory_mb = props["memory_mb"].get(); + SRV_INF("model %s loaded, memory usage: %lu MB\n", name.c_str(), (unsigned long)memory_mb); + // Update memory_mb in meta + std::lock_guard lk(this->mutex); + if (mapping.find(name) != mapping.end()) { + mapping[name].meta.memory_mb = memory_mb; + } + } + } + } + } catch (const std::exception & e) { + SRV_WRN("failed to query memory for model %s: %s\n", name.c_str(), e.what()); + } + } this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0); } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) { this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 1db34b6c4d..c195dbeb26 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -62,6 +62,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading + uint64_t memory_mb = 0; // estimated memory usage in MB std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown From c2df1ac64ab643cf0a659f6b3180c09292a6be6f Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Sun, 29 Mar 2026 12:18:51 +0200 Subject: [PATCH 2/8] estimate with to-be-loaded model size included --- include/llama.h | 6 +++++ src/llama-model.cpp | 29 +++++++++++++++++++++++ tools/server/server-models.cpp | 43 ++++++++++++++++++++++++---------- tools/server/server-models.h | 4 ++-- 4 files changed, 67 insertions(+), 15 deletions(-) diff --git a/include/llama.h b/include/llama.h index a940f9d648..69d9ff80c1 100644 --- a/include/llama.h +++ b/include/llama.h @@ -614,6 +614,12 @@ extern "C" { // Returns the total size of all the tensors in the model in bytes LLAMA_API uint64_t llama_model_size(const struct llama_model * model); + // Returns the total size of all the tensors in the model in bytes from a model path + // without fully loading the model. Uses llama_model_loader with no_alloc=true. + // Returns 0 if the model cannot be loaded or the path is invalid. + // This function can be used to estimate memory requirements before loading a model. + LLAMA_API uint64_t llama_model_size_from_path(const char * path); + // Get the default chat template. Returns nullptr if not available // If name is NULL, returns the default chat template LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ba935340fc..6df9440dc1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9253,6 +9253,35 @@ uint64_t llama_model_size(const llama_model * model) { return model->size(); } +uint64_t llama_model_size_from_path(const char * path) { + if (!path) { + return 0; + } + + try { + std::vector splits; + + llama_model_loader loader( + /* metadata */ nullptr, + /* set_tensor_data */ nullptr, + /* set_tensor_data_ud */ nullptr, + /* fname */ path, + /* splits */ splits, + /* file */ nullptr, + /* use_mmap */ false, + /* use_direct_io */ false, + /* check_tensors */ false, + /* no_alloc */ true, + /* param_overrides_p */ nullptr, + /* param_tensor_buft_overrides_p */ nullptr + ); + + return loader.n_bytes; + } catch (...) { + return 0; + } +} + const char * llama_model_chat_template(const llama_model * model, const char * name) { const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE) : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE); diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index f86e267919..be10a88d84 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -494,11 +494,10 @@ std::vector server_models::get_all_meta() { return result; } -void server_models::unload_lru() { +void server_models::unload_lru(uint64_t new_model_memory_mb) { if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) { return; // no limit } - // Keep unloading LRU models until limits are satisfied while (true) { std::string lru_model_name = ""; int64_t lru_last_used = ggml_time_ms(); @@ -517,12 +516,14 @@ void server_models::unload_lru() { } } } - // Check if limits exceeded - bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max; - bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max; + bool count_exceeded = base_params.models_max > 0 && + (count_active + 1) >= (size_t)base_params.models_max; + uint64_t projected_memory = total_memory_mb + new_model_memory_mb; + bool memory_exceeded = base_params.models_memory_max > 0 && + projected_memory >= (uint64_t)base_params.models_memory_max; if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) { - SRV_INF("limits reached (count=%zu, memory=%lu MB), removing LRU name=%s\n", - count_active, (unsigned long)total_memory_mb, lru_model_name.c_str()); + SRV_INF("limits reached (count=%zu, memory=%lu MB + %lu MB new), removing LRU name=%s\n", + count_active, (unsigned long)total_memory_mb, (unsigned long)new_model_memory_mb, lru_model_name.c_str()); unload(lru_model_name); // wait for unload to complete { @@ -531,9 +532,8 @@ void server_models::unload_lru() { return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED; }); } - // Loop continues to check if more unloading is needed } else { - break; // limits satisfied + break; } } } @@ -542,7 +542,26 @@ void server_models::load(const std::string & name) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } - unload_lru(); + + uint64_t new_model_memory_mb = 0; + if (base_params.models_memory_max > 0) { + std::string model_path; + { + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) { + uint64_t size_bytes = llama_model_size_from_path(model_path.c_str()); + new_model_memory_mb = size_bytes / (1024 * 1024); + meta.memory_mb = new_model_memory_mb; + if (new_model_memory_mb > 0) { + SRV_INF("model %s estimated size: %lu MB\n", name.c_str(), + (unsigned long)new_model_memory_mb); + } + } + } + } + + unload_lru(new_model_memory_mb); std::lock_guard lk(mutex); @@ -629,7 +648,6 @@ void server_models::load(const std::string & name) { LOG("[%5d] %s", port, buffer); std::string str(buffer); if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { - // Query memory usage from the child's /props endpoint if (!ready_received) { ready_received = true; try { @@ -640,8 +658,7 @@ void server_models::load(const std::string & name) { json props = json::parse(res->body); if (props.contains("memory_mb")) { uint64_t memory_mb = props["memory_mb"].get(); - SRV_INF("model %s loaded, memory usage: %lu MB\n", name.c_str(), (unsigned long)memory_mb); - // Update memory_mb in meta + SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb); std::lock_guard lk(this->mutex); if (mapping.find(name) != mapping.end()) { mapping[name].meta.memory_mb = memory_mb; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index c195dbeb26..29c1c7c6f8 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -62,7 +62,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - uint64_t memory_mb = 0; // estimated memory usage in MB + uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load) std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown @@ -111,7 +111,7 @@ private: void update_meta(const std::string & name, const server_model_meta & meta); // unload least recently used models if the limit is reached - void unload_lru(); + void unload_lru(uint64_t new_model_memory_mb = 0); // not thread-safe, caller must hold mutex void add_model(server_model_meta && meta); From 24f461b66da411d0e6210f79dbeae61732c3fef2 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 31 Mar 2026 16:18:03 +0200 Subject: [PATCH 3/8] use no_alloc to get memory requirements for model load --- include/llama.h | 6 --- src/llama-model.cpp | 29 ----------- tools/server/server-context.cpp | 1 - tools/server/server-models.cpp | 86 +++++++++++++++++++-------------- tools/server/server-models.h | 2 +- 5 files changed, 51 insertions(+), 73 deletions(-) diff --git a/include/llama.h b/include/llama.h index 69d9ff80c1..a940f9d648 100644 --- a/include/llama.h +++ b/include/llama.h @@ -614,12 +614,6 @@ extern "C" { // Returns the total size of all the tensors in the model in bytes LLAMA_API uint64_t llama_model_size(const struct llama_model * model); - // Returns the total size of all the tensors in the model in bytes from a model path - // without fully loading the model. Uses llama_model_loader with no_alloc=true. - // Returns 0 if the model cannot be loaded or the path is invalid. - // This function can be used to estimate memory requirements before loading a model. - LLAMA_API uint64_t llama_model_size_from_path(const char * path); - // Get the default chat template. Returns nullptr if not available // If name is NULL, returns the default chat template LLAMA_API const char * llama_model_chat_template(const struct llama_model * model, const char * name); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6df9440dc1..ba935340fc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9253,35 +9253,6 @@ uint64_t llama_model_size(const llama_model * model) { return model->size(); } -uint64_t llama_model_size_from_path(const char * path) { - if (!path) { - return 0; - } - - try { - std::vector splits; - - llama_model_loader loader( - /* metadata */ nullptr, - /* set_tensor_data */ nullptr, - /* set_tensor_data_ud */ nullptr, - /* fname */ path, - /* splits */ splits, - /* file */ nullptr, - /* use_mmap */ false, - /* use_direct_io */ false, - /* check_tensors */ false, - /* no_alloc */ true, - /* param_overrides_p */ nullptr, - /* param_tensor_buft_overrides_p */ nullptr - ); - - return loader.n_bytes; - } catch (...) { - return 0; - } -} - const char * llama_model_chat_template(const llama_model * model, const char * name) { const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE) : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index bfa032a814..6f737d94d0 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3495,7 +3495,6 @@ void server_routes::init_routes() { { "total_slots", params.n_parallel }, { "model_alias", meta->model_name }, { "model_path", meta->model_path }, - { "memory_mb", meta->model_size / (1024 * 1024) }, { "modalities", json { {"vision", meta->has_inp_image}, {"audio", meta->has_inp_audio}, diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index be10a88d84..317b091305 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -538,6 +538,49 @@ void server_models::unload_lru(uint64_t new_model_memory_mb) { } } +static uint64_t get_model_memory_mb(const common_preset& preset) { + common_params params; + preset.apply_to_params(params); + + if(params.model.path.empty()) { + return 0; + } + + struct log_ud_t { + struct { + ggml_log_callback callback; + void * user_data; + } original; + ggml_log_level min_level; + } log_ud; + llama_log_get(&log_ud.original.callback, &log_ud.original.user_data); + log_ud.min_level = GGML_LOG_LEVEL_WARN; + + llama_log_set([](ggml_log_level level, const char * text, void * ud) { + log_ud_t * d = (log_ud_t *) ud; + const ggml_log_level eff = level >= d->min_level ? level : GGML_LOG_LEVEL_DEBUG; + d->original.callback(eff, text, d->original.user_data); + }, &log_ud); + + llama_model_params mparams = common_model_params_to_llama(params); + mparams.no_alloc = true; + mparams.use_mmap = false; + mparams.use_mlock = false; + + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); + + llama_log_set(log_ud.original.callback, log_ud.original.user_data); + + if (!model) { + return 0; + } + + uint64_t size_bytes = llama_model_size(model); + llama_model_free(model); + + return size_bytes / (1024 * 1024); +} + void server_models::load(const std::string & name) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); @@ -545,19 +588,13 @@ void server_models::load(const std::string & name) { uint64_t new_model_memory_mb = 0; if (base_params.models_memory_max > 0) { - std::string model_path; - { - std::lock_guard lk(mutex); - auto & meta = mapping[name].meta; - if (meta.preset.get_option("LLAMA_ARG_MODEL", model_path) && !model_path.empty()) { - uint64_t size_bytes = llama_model_size_from_path(model_path.c_str()); - new_model_memory_mb = size_bytes / (1024 * 1024); - meta.memory_mb = new_model_memory_mb; - if (new_model_memory_mb > 0) { - SRV_INF("model %s estimated size: %lu MB\n", name.c_str(), - (unsigned long)new_model_memory_mb); - } - } + std::lock_guard lk(mutex); + auto & meta = mapping[name].meta; + new_model_memory_mb = get_model_memory_mb(meta.preset); + meta.memory_mb = new_model_memory_mb; + if (new_model_memory_mb > 0) { + SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(), + (unsigned long)new_model_memory_mb); } } @@ -643,33 +680,10 @@ void server_models::load(const std::string & name) { // also handle status report from child process if (stdout_file) { char buffer[4096]; - bool ready_received = false; while (fgets(buffer, sizeof(buffer), stdout_file) != nullptr) { LOG("[%5d] %s", port, buffer); std::string str(buffer); if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { - if (!ready_received) { - ready_received = true; - try { - httplib::Client cli("http://CHILD_ADDR"); - cli.set_connection_timeout(5, 0); - if (auto res = cli.Get("/props")) { - if (res->status == 200) { - json props = json::parse(res->body); - if (props.contains("memory_mb")) { - uint64_t memory_mb = props["memory_mb"].get(); - SRV_INF("model %s loaded, actual memory: %lu MB\n", name.c_str(), (unsigned long)memory_mb); - std::lock_guard lk(this->mutex); - if (mapping.find(name) != mapping.end()) { - mapping[name].meta.memory_mb = memory_mb; - } - } - } - } - } catch (const std::exception & e) { - SRV_WRN("failed to query memory for model %s: %s\n", name.c_str(), e.what()); - } - } this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0); } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) { this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 29c1c7c6f8..2cbdb35b32 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -62,7 +62,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - uint64_t memory_mb = 0; // size in MB (estimate before load, actual after load) + uint64_t memory_mb = 0; // size in MB std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown From d2892543f428b003b214edc9d258c1e8dca5b849 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Tue, 31 Mar 2026 17:37:16 +0200 Subject: [PATCH 4/8] only set model memory_mb if not previously calculated --- tools/server/server-models.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 317b091305..943192a721 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -590,8 +590,12 @@ void server_models::load(const std::string & name) { if (base_params.models_memory_max > 0) { std::lock_guard lk(mutex); auto & meta = mapping[name].meta; - new_model_memory_mb = get_model_memory_mb(meta.preset); - meta.memory_mb = new_model_memory_mb; + if (meta.memory_mb > 0) { + new_model_memory_mb = meta.memory_mb; + } else { + new_model_memory_mb = get_model_memory_mb(meta.preset); + meta.memory_mb = new_model_memory_mb; + } if (new_model_memory_mb > 0) { SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(), (unsigned long)new_model_memory_mb); From 4af1a283a6d6b8f15b7b10990fde0e531c8c7ff1 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Thu, 2 Apr 2026 09:24:53 +0200 Subject: [PATCH 5/8] use memory margin instead of total size limit, apply to each device separately --- common/arg.cpp | 8 +- common/common.h | 10 +-- include/llama.h | 6 ++ src/llama-context.cpp | 13 +++ tools/server/server-models.cpp | 141 ++++++++++++++++++++++----------- tools/server/server-models.h | 12 ++- 6 files changed, 133 insertions(+), 57 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index cac8819956..b053a25a1e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3045,12 +3045,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); add_opt(common_arg( - {"--models-memory-max"}, "N", - string_format("for router server, maximum memory usage in MB (default: %d, 0 = unlimited)", params.models_memory_max), + {"--models-memory-margin"}, "N", + string_format("for router server, MB of memory to leave free, per device (default: %d, 0 = unlimited)", params.models_memory_margin), [](common_params & params, int value) { - params.models_memory_max = value; + params.models_memory_margin = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MAX")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MEMORY_MARGIN")); add_opt(common_arg( {"--models-autoload"}, {"--no-models-autoload"}, diff --git a/common/common.h b/common/common.h index 573a9bf4ef..879032977b 100644 --- a/common/common.h +++ b/common/common.h @@ -618,11 +618,11 @@ struct common_params { std::vector server_tools; // router server configs - std::string models_dir = ""; // directory containing models for the router server - std::string models_preset = ""; // directory containing model presets for the router server - int models_max = 4; // maximum number of models to load simultaneously - int models_memory_max = 0; // maximum memory usage in MB (0 = unlimited, estimated from model files) - bool models_autoload = true; // automatically load models when requested via the router server + std::string models_dir = ""; // directory containing models for the router server + std::string models_preset = ""; // directory containing model presets for the router server + int models_max = 4; // maximum number of models to load simultaneously + int models_memory_margin = 1024; // MB of free memory to preserve per device (0 = disabled) + bool models_autoload = true; // automatically load models when requested via the router server bool log_json = false; diff --git a/include/llama.h b/include/llama.h index a940f9d648..de7c0670f5 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1547,6 +1547,12 @@ extern "C" { // print a breakdown of per-device memory use via LLAMA_LOG: LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx); + // Returns the projected memory use (model + context + compute) in bytes + // for the given device within this context. Returns 0 if the device is not used. + LLAMA_API uint64_t llama_context_device_memory( + const struct llama_context * ctx, + ggml_backend_dev_t device); + // // training // diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a808e3e454..a9b001bcbd 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -3610,6 +3610,19 @@ void llama_memory_breakdown_print(const struct llama_context * ctx) { } } +uint64_t llama_context_device_memory(const llama_context * ctx, ggml_backend_dev_t device) { + const bool is_host = ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU; + uint64_t total = 0; + for (const auto & [buft, mb] : ctx->memory_breakdown()) { + const bool matches = is_host ? ggml_backend_buft_is_host(buft) : + ggml_backend_buft_get_device(buft) == device; + if (matches) { + total += mb.total(); + } + } + return total; +} + // // training // diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 943192a721..81b6d5b3e8 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -178,6 +178,21 @@ server_models::server_models( LOG_WRN("failed to get server executable path: %s\n", e.what()); LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]); } + + const uint64_t memory_margin = base_params.models_memory_margin * 1024 * 1024; + + if (memory_margin > 0) { + const size_t n_devs = ggml_backend_dev_count(); + for (size_t i = 0; i < n_devs; i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + if (total > 0) { + memory_per_device[dev] = (free > memory_margin) ? free - memory_margin : 0; + } + } + } + load_models(); } @@ -293,17 +308,17 @@ void server_models::load_models() { // convert presets to server_model_meta and add to mapping for (const auto & preset : final_presets) { server_model_meta meta{ - /* preset */ preset.second, - /* name */ preset.first, - /* aliases */ {}, - /* tags */ {}, - /* port */ 0, - /* status */ SERVER_MODEL_STATUS_UNLOADED, - /* last_used */ 0, - /* memory_mb */ 0, - /* args */ std::vector(), - /* exit_code */ 0, - /* stop_timeout */ DEFAULT_STOP_TIMEOUT, + /* preset */ preset.second, + /* name */ preset.first, + /* aliases */ {}, + /* tags */ {}, + /* port */ 0, + /* status */ SERVER_MODEL_STATUS_UNLOADED, + /* last_used */ 0, + /* memory_per_device */ {}, + /* args */ std::vector(), + /* exit_code */ 0, + /* stop_timeout */ DEFAULT_STOP_TIMEOUT, }; add_model(std::move(meta)); } @@ -494,36 +509,63 @@ std::vector server_models::get_all_meta() { return result; } -void server_models::unload_lru(uint64_t new_model_memory_mb) { - if (base_params.models_max <= 0 && base_params.models_memory_max <= 0) { +uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const { + model_memory_map total_memory_per_device; + for (const auto & m : mapping) { + if (m.second.meta.is_running()) { + for (const auto& [key, value] : m.second.meta.memory_per_device) { + total_memory_per_device[key] += value; + } + } + } + + auto get = [](const model_memory_map & m, ggml_backend_dev_t k) { + auto it = m.find(k); + return it != m.end() ? it->second : 0; + }; + + uint64_t memory_exceeded = 0; + + for (const auto& [key, limit] : memory_per_device) { + if (get(new_model_memory_per_device, key) + get(total_memory_per_device, key) > limit) { + memory_exceeded++; + } + } + + return memory_exceeded; +} + +void server_models::unload_lru(const model_memory_map& new_model_memory_per_device) { + const bool check_memory = base_params.models_memory_margin > 0 && !memory_per_device.empty(); + + if (base_params.models_max <= 0 && !check_memory) { return; // no limit } + while (true) { std::string lru_model_name = ""; int64_t lru_last_used = ggml_time_ms(); size_t count_active = 0; - uint64_t total_memory_mb = 0; + uint64_t memory_exceeded = 0; { std::unique_lock lk(mutex); for (const auto & m : mapping) { if (m.second.meta.is_running()) { count_active++; - total_memory_mb += m.second.meta.memory_mb; if (m.second.meta.last_used < lru_last_used) { lru_model_name = m.first; lru_last_used = m.second.meta.last_used; } } } + memory_exceeded = get_memory_exceeded(new_model_memory_per_device); } bool count_exceeded = base_params.models_max > 0 && (count_active + 1) >= (size_t)base_params.models_max; - uint64_t projected_memory = total_memory_mb + new_model_memory_mb; - bool memory_exceeded = base_params.models_memory_max > 0 && - projected_memory >= (uint64_t)base_params.models_memory_max; - if (!lru_model_name.empty() && (count_exceeded || memory_exceeded)) { - SRV_INF("limits reached (count=%zu, memory=%lu MB + %lu MB new), removing LRU name=%s\n", - count_active, (unsigned long)total_memory_mb, (unsigned long)new_model_memory_mb, lru_model_name.c_str()); + + if (!lru_model_name.empty() && (count_exceeded || memory_exceeded > 0)) { + SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n", + count_active, memory_exceeded, lru_model_name.c_str()); unload(lru_model_name); // wait for unload to complete { @@ -538,12 +580,12 @@ void server_models::unload_lru(uint64_t new_model_memory_mb) { } } -static uint64_t get_model_memory_mb(const common_preset& preset) { +static model_memory_map get_model_memory_per_device(const common_preset& preset) { common_params params; preset.apply_to_params(params); if(params.model.path.empty()) { - return 0; + return {}; } struct log_ud_t { @@ -567,18 +609,32 @@ static uint64_t get_model_memory_mb(const common_preset& preset) { mparams.use_mmap = false; mparams.use_mlock = false; - llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); - - llama_log_set(log_ud.original.callback, log_ud.original.user_data); + llama_model_ptr model{llama_model_load_from_file(params.model.path.c_str(), mparams)}; if (!model) { - return 0; + llama_log_set(log_ud.original.callback, log_ud.original.user_data); + return {}; } - uint64_t size_bytes = llama_model_size(model); - llama_model_free(model); + llama_context_params cparams = common_context_params_to_llama(params); + llama_context_ptr ctx{llama_init_from_model(model.get(), cparams)}; + llama_log_set(log_ud.original.callback, log_ud.original.user_data); - return size_bytes / (1024 * 1024); + if (!ctx) { + return {}; + } + + model_memory_map result; + const size_t n_devs = ggml_backend_dev_count(); + for (size_t i = 0; i < n_devs; i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + uint64_t bytes = llama_context_device_memory(ctx.get(), dev); + if (bytes > 0) { + result[dev] = bytes; + } + } + + return result; } void server_models::load(const std::string & name) { @@ -586,23 +642,18 @@ void server_models::load(const std::string & name) { throw std::runtime_error("model name=" + name + " is not found"); } - uint64_t new_model_memory_mb = 0; - if (base_params.models_memory_max > 0) { + model_memory_map new_model_memory_per_device; + if (base_params.models_memory_margin > 0) { std::lock_guard lk(mutex); auto & meta = mapping[name].meta; - if (meta.memory_mb > 0) { - new_model_memory_mb = meta.memory_mb; - } else { - new_model_memory_mb = get_model_memory_mb(meta.preset); - meta.memory_mb = new_model_memory_mb; - } - if (new_model_memory_mb > 0) { - SRV_INF("model %s memory requirements: %lu MB\n", name.c_str(), - (unsigned long)new_model_memory_mb); + if (meta.memory_per_device.empty()) { + meta.memory_per_device = get_model_memory_per_device(meta.preset); } + + new_model_memory_per_device = meta.memory_per_device; } - unload_lru(new_model_memory_mb); + unload_lru(new_model_memory_per_device); std::lock_guard lk(mutex); @@ -616,17 +667,15 @@ void server_models::load(const std::string & name) { // exceeding models_max. Without this, the window between unload_lru() // releasing its lock and this lock_guard acquiring allows multiple // threads to each observe capacity and all proceed to load. - if (base_params.models_max > 0 || base_params.models_memory_max > 0) { + if (base_params.models_max > 0 || base_params.models_memory_margin > 0) { size_t count_active = 0; - uint64_t total_memory_mb = 0; for (const auto & m : mapping) { if (m.second.meta.is_running()) { count_active++; - total_memory_mb += m.second.meta.memory_mb; } } bool count_exceeded = base_params.models_max > 0 && count_active >= (size_t)base_params.models_max; - bool memory_exceeded = base_params.models_memory_max > 0 && total_memory_mb >= (uint64_t)base_params.models_memory_max; + bool memory_exceeded = get_memory_exceeded(new_model_memory_per_device) > 0; if (count_exceeded || memory_exceeded) { throw std::runtime_error("model limit reached, try again later"); } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 2cbdb35b32..38d6929a88 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -54,6 +54,8 @@ static std::string server_model_status_to_string(server_model_status status) { } } +using model_memory_map = std::map; + struct server_model_meta { common_preset preset; std::string name; @@ -62,7 +64,7 @@ struct server_model_meta { int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading - uint64_t memory_mb = 0; // size in MB + model_memory_map memory_per_device; // projected bytes per device std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown @@ -108,14 +110,20 @@ private: std::vector base_env; common_preset base_preset; // base preset from llama-server CLI args + // available memory per device + std::map memory_per_device; + void update_meta(const std::string & name, const server_model_meta & meta); // unload least recently used models if the limit is reached - void unload_lru(uint64_t new_model_memory_mb = 0); + void unload_lru(const model_memory_map& new_model_memory_per_device); // not thread-safe, caller must hold mutex void add_model(server_model_meta && meta); + // not thread-safe, caller must hold mutex + uint64_t get_memory_exceeded(const model_memory_map& new_model_memory_per_device) const; + public: server_models(const common_params & params, int argc, char ** argv); From 7e10ec8ff20ebae85072b28aa8c0f0e582191182 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Thu, 2 Apr 2026 10:07:04 +0200 Subject: [PATCH 6/8] add server memory debug logging --- tools/server/server-models.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 81b6d5b3e8..bb41f205a9 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -179,7 +179,7 @@ server_models::server_models( LOG_WRN("using original argv[0] as fallback: %s\n", argv[0]); } - const uint64_t memory_margin = base_params.models_memory_margin * 1024 * 1024; + const uint64_t memory_margin = (uint64_t)base_params.models_memory_margin * 1024 * 1024; if (memory_margin > 0) { const size_t n_devs = ggml_backend_dev_count(); @@ -188,7 +188,11 @@ server_models::server_models( size_t free, total; ggml_backend_dev_memory(dev, &free, &total); if (total > 0) { - memory_per_device[dev] = (free > memory_margin) ? free - memory_margin : 0; + const uint64_t available = (free > memory_margin) ? free - memory_margin : 0; + memory_per_device[dev] = available; + SRV_DBG("device %s: available memory after margin=%lu MB\n", + ggml_backend_dev_name(dev), + (unsigned long)(available / (1024 * 1024))); } } } @@ -527,7 +531,15 @@ uint64_t server_models::get_memory_exceeded(const model_memory_map& new_model_me uint64_t memory_exceeded = 0; for (const auto& [key, limit] : memory_per_device) { - if (get(new_model_memory_per_device, key) + get(total_memory_per_device, key) > limit) { + const uint64_t total_memory = get(total_memory_per_device, key); + const uint64_t new_memory = get(new_model_memory_per_device, key); + SRV_DBG("device %s: total=%lu MB, new=%lu MB, limit=%lu MB\n", + ggml_backend_dev_name(key), + (unsigned long)(total_memory / (1024 * 1024)), + (unsigned long)(new_memory / (1024 * 1024)), + (unsigned long)(limit / (1024 * 1024))); + + if (total_memory + new_memory > limit) { memory_exceeded++; } } From 7666cacf28591c0179c499d1525121e7406b58e5 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Thu, 2 Apr 2026 11:39:07 +0200 Subject: [PATCH 7/8] move llama_context_device_memory function to llama-ext.h --- include/llama.h | 6 ------ src/llama-ext.h | 6 ++++++ tools/server/server-models.cpp | 2 ++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/llama.h b/include/llama.h index de7c0670f5..a940f9d648 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1547,12 +1547,6 @@ extern "C" { // print a breakdown of per-device memory use via LLAMA_LOG: LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx); - // Returns the projected memory use (model + context + compute) in bytes - // for the given device within this context. Returns 0 if the device is not used. - LLAMA_API uint64_t llama_context_device_memory( - const struct llama_context * ctx, - ggml_backend_dev_t device); - // // training // diff --git a/src/llama-ext.h b/src/llama-ext.h index 2ffb77934e..ee6ff27be3 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -54,3 +54,9 @@ LLAMA_API void llama_quant_compute_types( ggml_tensor ** tensors, ggml_type * result_types, size_t n_tensors); + +// Returns the projected memory use (model + context + compute) in bytes +// for the given device within this context. Returns 0 if the device is not used. +LLAMA_API uint64_t llama_context_device_memory( + const struct llama_context * ctx, + ggml_backend_dev_t device); diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index bb41f205a9..fe039b03ee 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -7,6 +7,8 @@ #include // TODO: remove this once we use HTTP client from download.h #include +#include "../../src/llama-ext.h" + #include #include #include From 1d4a5f93802ebb90f36228c6de0ef160ec7c7e53 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Thu, 2 Apr 2026 11:39:36 +0200 Subject: [PATCH 8/8] fix model count exceeded check --- tools/server/server-models.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index fe039b03ee..fca6beb122 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -575,7 +575,7 @@ void server_models::unload_lru(const model_memory_map& new_model_memory_per_devi memory_exceeded = get_memory_exceeded(new_model_memory_per_device); } bool count_exceeded = base_params.models_max > 0 && - (count_active + 1) >= (size_t)base_params.models_max; + (count_active + 1) > (size_t)base_params.models_max; if (!lru_model_name.empty() && (count_exceeded || memory_exceeded > 0)) { SRV_INF("limits reached (count=%zu, memory margin exceeded on %zu device(s)), removing LRU name=%s\n",