Add pin option to protect models from LRU eviction in router mode

- Add COMMON_ARG_PRESET_PIN define
- Add pin preset option for model protection
- Add pinned field to server_model_meta
- Modify load_models to set pinned flag
- Update unload_lru to skip pinned models
2025-12-25 14:23:05 -05:00 · 2025-12-25 14:23:05 -05:00 · 8d594383a1
parent b07cda687c
commit 8d594383a1
4 changed files with 17 additions and 6 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3524,9 +3524,10 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
        [](common_params &, int) { /* unused */ }
    ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());

-    // args.push_back(common_arg(
-    //     {"pin"},
-    //     "in server router mode, do not unload this model if models_max is exceeded",
-    //     [](common_params &) { /* unused */ }
-    // ).set_preset_only());
+    args.push_back(common_arg(
+        {"pin"},
+        nullptr,
+        "in server router mode, do not unload this model if models_max is exceeded",
+        [](common_params &, const std::string &) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_PIN).set_preset_only());
 }
--- a/common/arg.h
+++ b/common/arg.h
@@ -11,6 +11,7 @@
 // pseudo-env variable to identify preset-only arguments
 #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
 #define COMMON_ARG_PRESET_STOP_TIMEOUT    "__PRESET_STOP_TIMEOUT"
+#define COMMON_ARG_PRESET_PIN             "__PRESET_PIN"

 //
 // CLI argument parsing
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -244,6 +244,14 @@ void server_models::load_models() {
        }
    }

+    // handle custom pin option
+    for (auto & [name, inst] : mapping) {
+        std::string val;
+        if (inst.meta.preset.get_option(COMMON_ARG_PRESET_PIN, val)) {
+            inst.meta.pinned = true;
+        }
+    }
+
    // load any autoload models
    std::vector<std::string> models_to_load;
    for (const auto & [name, inst] : mapping) {
@@ -383,7 +391,7 @@ void server_models::unload_lru() {
        for (const auto & m : mapping) {
            if (m.second.meta.is_active()) {
                count_active++;
-                if (m.second.meta.last_used < lru_last_used) {
+                if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) {
                    lru_model_name = m.first;
                    lru_last_used = m.second.meta.last_used;
                }
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -58,6 +58,7 @@ struct server_model_meta {
    std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
    int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
    int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
+    bool pinned = false; // if true, this model will not be unloaded by LRU

    bool is_active() const {
        return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;