From 8d594383a1860717a8dfe66008395a2a17e77a4b Mon Sep 17 00:00:00 2001
From: Michel Belleau
Date: Thu, 25 Dec 2025 14:23:05 -0500
Subject: [PATCH] Add pin option to protect models from LRU eviction in router mode

- Add COMMON_ARG_PRESET_PIN define
- Add pin preset option for model protection
- Add pinned field to server_model_meta
- Modify load_models to set pinned flag
- Update unload_lru to skip pinned models
---
 common/arg.cpp                 | 11 ++++++-----
 common/arg.h                   |  1 +
 tools/server/server-models.cpp | 10 +++++++++-
 tools/server/server-models.h   |  1 +
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 774f8731a9..3b301a560b 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3524,9 +3524,10 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
         [](common_params &, int) { /* unused */ }
     ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
 
-    // args.push_back(common_arg(
-    //     {"pin"},
-    //     "in server router mode, do not unload this model if models_max is exceeded",
-    //     [](common_params &) { /* unused */ }
-    // ).set_preset_only());
+    args.push_back(common_arg(
+        {"pin"},
+        nullptr,
+        "in server router mode, do not unload this model if models_max is exceeded",
+        [](common_params &, const std::string &) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_PIN).set_preset_only());
 }
diff --git a/common/arg.h b/common/arg.h
index a1b6a14e67..bbde9733a0 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -11,6 +11,7 @@
 // pseudo-env variable to identify preset-only arguments
 #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
 #define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
+#define COMMON_ARG_PRESET_PIN "__PRESET_PIN"
 
 //
 // CLI argument parsing
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index cb7e70455a..0647ad2e2c 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -244,6 +244,14 @@ void server_models::load_models() {
         }
     }
 
+    // handle custom pin option
+    for (auto & [name, inst] : mapping) {
+        std::string val;
+        if (inst.meta.preset.get_option(COMMON_ARG_PRESET_PIN, val)) {
+            inst.meta.pinned = true;
+        }
+    }
+
     // load any autoload models
     std::vector<std::string> models_to_load;
     for (const auto & [name, inst] : mapping) {
@@ -383,7 +391,7 @@ void server_models::unload_lru() {
         for (const auto & m : mapping) {
             if (m.second.meta.is_active()) {
                 count_active++;
-                if (m.second.meta.last_used < lru_last_used) {
+                if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) {
                     lru_model_name = m.first;
                     lru_last_used = m.second.meta.last_used;
                 }
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 7e33537536..1f790bccc8 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -58,6 +58,7 @@ struct server_model_meta {
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
+    bool pinned = false; // if true, this model will not be unloaded by LRU
 
     bool is_active() const {
         return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;