From 8d594383a1860717a8dfe66008395a2a17e77a4b Mon Sep 17 00:00:00 2001 From: Michel Belleau Date: Thu, 25 Dec 2025 14:23:05 -0500 Subject: [PATCH 1/3] Add pin option to protect models from LRU eviction in router mode - Add COMMON_ARG_PRESET_PIN define - Add pin preset option for model protection - Add pinned field to server_model_meta - Modify load_models to set pinned flag - Update unload_lru to skip pinned models --- common/arg.cpp | 11 ++++++----- common/arg.h | 1 + tools/server/server-models.cpp | 10 +++++++++- tools/server/server-models.h | 1 + 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 774f8731a9..3b301a560b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3524,9 +3524,10 @@ void common_params_add_preset_options(std::vector & args) { [](common_params &, int) { /* unused */ } ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only()); - // args.push_back(common_arg( - // {"pin"}, - // "in server router mode, do not unload this model if models_max is exceeded", - // [](common_params &) { /* unused */ } - // ).set_preset_only()); + args.push_back(common_arg( + {"pin"}, + nullptr, + "in server router mode, do not unload this model if models_max is exceeded", + [](common_params &, const std::string &) { /* unused */ } + ).set_env(COMMON_ARG_PRESET_PIN).set_preset_only()); } diff --git a/common/arg.h b/common/arg.h index a1b6a14e67..bbde9733a0 100644 --- a/common/arg.h +++ b/common/arg.h @@ -11,6 +11,7 @@ // pseudo-env variable to identify preset-only arguments #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP" #define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT" +#define COMMON_ARG_PRESET_PIN "__PRESET_PIN" // // CLI argument parsing diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index cb7e70455a..0647ad2e2c 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -244,6 +244,14 @@ void server_models::load_models() { 
} } + // handle custom pin option + for (auto & [name, inst] : mapping) { + std::string val; + if (inst.meta.preset.get_option(COMMON_ARG_PRESET_PIN, val)) { + inst.meta.pinned = true; + } + } + // load any autoload models std::vector models_to_load; for (const auto & [name, inst] : mapping) { @@ -383,7 +391,7 @@ void server_models::unload_lru() { for (const auto & m : mapping) { if (m.second.meta.is_active()) { count_active++; - if (m.second.meta.last_used < lru_last_used) { + if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) { lru_model_name = m.first; lru_last_used = m.second.meta.last_used; } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 7e33537536..1f790bccc8 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -58,6 +58,7 @@ struct server_model_meta { std::vector args; // args passed to the model instance, will be populated by render_args() int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown + bool pinned = false; // if true, this model will not be unloaded by LRU bool is_active() const { return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING; From d9e65c3baf1a079d6762949ea9db958fa25c36b8 Mon Sep 17 00:00:00 2001 From: Michel Belleau Date: Thu, 25 Dec 2025 17:50:57 -0500 Subject: [PATCH 2/3] Add warning when LRU eviction cannot succeed due to all models being pinned When models_max limit is reached but all active models are pinned, log a warning message to clarify that automatic unload cannot succeed. Also document the `pin` preset option in the server README. 
--- tools/server/README.md | 1 + tools/server/server-models.cpp | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/tools/server/README.md b/tools/server/README.md index 7d2f6f798e..43e7b066da 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1487,6 +1487,7 @@ The precedence rule for preset options is as follows: We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments): - `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts - `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10) +- `pin` (boolean): Prevents the router from automatically unloading this model to make room for another when the `models_max` limit is reached; the model remains loaded until explicitly unloaded or the server restarts. If every active model is pinned when the limit is reached, no model can be evicted to make room for a new one. ### Routing requests diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 0647ad2e2c..af526dc73f 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -391,6 +391,8 @@ void server_models::unload_lru() { for (const auto & m : mapping) { if (m.second.meta.is_active()) { count_active++; + // If all active models are pinned, this condition never holds and no LRU eviction will occur. + // The server will keep all pinned models in memory, potentially exceeding models_max. 
if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) { lru_model_name = m.first; lru_last_used = m.second.meta.last_used; @@ -408,6 +410,8 @@ void server_models::unload_lru() { return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED; }); } + } else if (count_active >= (size_t)base_params.models_max) { + SRV_WRN("models_max limit reached, but no unpinned models available for LRU eviction - automatic unload cannot succeed\n"); } } From 95d2017d283c95b83ccf22eeba2c1d2b5fb96565 Mon Sep 17 00:00:00 2001 From: Michel Belleau Date: Fri, 26 Dec 2025 11:41:48 -0500 Subject: [PATCH 3/3] Throw error when models_max limit reached with no unpinned models for LRU eviction --- tools/server/server-models.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index af526dc73f..6b41d5f246 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -392,7 +392,7 @@ void server_models::unload_lru() { if (m.second.meta.is_active()) { count_active++; // If all active models are pinned, this condition never holds and no LRU eviction will occur. - // The server will keep all pinned models in memory, potentially exceeding models_max. + // We throw an error instead of allowing the server to exceed models_max. if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) { lru_model_name = m.first; lru_last_used = m.second.meta.last_used; @@ -411,7 +411,10 @@ void server_models::unload_lru() { }); } } else if (count_active >= (size_t)base_params.models_max) { - SRV_WRN("models_max limit reached, but no unpinned models available for LRU eviction - automatic unload cannot succeed\n"); + throw std::runtime_error(string_format( + "models_max limit (%d) reached, but no unpinned models available for LRU eviction - cannot load more models", + base_params.models_max + )); } }