diff --git a/common/arg.cpp b/common/arg.cpp
index 62d31393c4..98b9881f08 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3539,9 +3539,10 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
         [](common_params &, int) { /* unused */ }
     ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
 
-    // args.push_back(common_arg(
-    //     {"pin"},
-    //     "in server router mode, do not unload this model if models_max is exceeded",
-    //     [](common_params &) { /* unused */ }
-    // ).set_preset_only());
+    args.push_back(common_arg(
+        {"pin"},
+        nullptr,
+        "in server router mode, do not unload this model if models_max is exceeded",
+        [](common_params &, const std::string &) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_PIN).set_preset_only());
 }
diff --git a/common/arg.h b/common/arg.h
index a1b6a14e67..bbde9733a0 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -11,6 +11,7 @@
 // pseudo-env variable to identify preset-only arguments
 #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
 #define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
+#define COMMON_ARG_PRESET_PIN "__PRESET_PIN"
 
 //
 // CLI argument parsing
diff --git a/tools/server/README.md b/tools/server/README.md
index 7d2f6f798e..43e7b066da 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1487,6 +1487,7 @@ The precedence rule for preset options is as follows:
 We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
 - `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
 - `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10)
+- `pin` (boolean): Prevents the router from unloading this model when the `models_max` limit is exceeded; the model remains loaded until explicitly unloaded or the server restarts.
 
 ### Routing requests
 
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 56e1dc46b8..810af9f8c2 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -244,6 +244,14 @@ void server_models::load_models() {
         }
     }
 
+    // handle custom pin option; an explicit falsey value (e.g. `pin = false`) leaves the model unpinned
+    for (auto & [name, inst] : mapping) {
+        std::string val;
+        if (inst.meta.preset.get_option(COMMON_ARG_PRESET_PIN, val)) {
+            inst.meta.pinned = !(val == "false" || val == "0" || val == "off" || val == "disabled");
+        }
+    }
+
     // load any autoload models
     std::vector<std::string> models_to_load;
     for (const auto & [name, inst] : mapping) {
@@ -383,7 +391,9 @@ void server_models::unload_lru() {
         for (const auto & m : mapping) {
             if (m.second.meta.is_active()) {
                 count_active++;
-                if (m.second.meta.last_used < lru_last_used) {
+                // If all active models are pinned, this condition never holds and no LRU eviction will occur.
+                // We throw an error instead of allowing the server to exceed models_max.
+                if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) {
                     lru_model_name = m.first;
                     lru_last_used = m.second.meta.last_used;
                 }
@@ -400,6 +410,11 @@ void server_models::unload_lru() {
                 return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
             });
         }
+    } else if (count_active >= (size_t)base_params.models_max) {
+        throw std::runtime_error(string_format(
+            "models_max limit (%d) reached, but no unpinned models available for LRU eviction - cannot load more models",
+            base_params.models_max
+        ));
     }
 }
 
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 24ddc65662..f6a0832915 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -58,6 +58,7 @@ struct server_model_meta {
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
+    bool pinned = false; // if true, this model will not be unloaded by LRU
 
     bool is_active() const {
         return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;