From 8d594383a1860717a8dfe66008395a2a17e77a4b Mon Sep 17 00:00:00 2001
From: Michel Belleau <mbelleau@Michels-MacBook-Pro.local>
Date: Thu, 25 Dec 2025 14:23:05 -0500
Subject: [PATCH 1/3] Add pin option to protect models from LRU eviction in
 router mode

- Add COMMON_ARG_PRESET_PIN define
- Add pin preset option for model protection
- Add pinned field to server_model_meta
- Modify load_models to set pinned flag
- Update unload_lru to skip pinned models
---
 common/arg.cpp                 | 11 ++++++-----
 common/arg.h                   |  1 +
 tools/server/server-models.cpp | 10 +++++++++-
 tools/server/server-models.h   |  1 +
 4 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 774f8731a9..3b301a560b 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3524,9 +3524,10 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
         [](common_params &, int) { /* unused */ }
     ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
 
-    // args.push_back(common_arg(
-    //     {"pin"},
-    //     "in server router mode, do not unload this model if models_max is exceeded",
-    //     [](common_params &) { /* unused */ }
-    // ).set_preset_only());
+    args.push_back(common_arg(
+        {"pin"},
+        nullptr,
+        "in server router mode, do not unload this model if models_max is exceeded",
+        [](common_params &, const std::string &) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_PIN).set_preset_only());
 }
diff --git a/common/arg.h b/common/arg.h
index a1b6a14e67..bbde9733a0 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -11,6 +11,7 @@
 // pseudo-env variable to identify preset-only arguments
 #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
 #define COMMON_ARG_PRESET_STOP_TIMEOUT    "__PRESET_STOP_TIMEOUT"
+#define COMMON_ARG_PRESET_PIN             "__PRESET_PIN"
 
 //
 // CLI argument parsing
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index cb7e70455a..0647ad2e2c 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -244,6 +244,14 @@ void server_models::load_models() {
         }
     }
 
+    // handle custom pin option
+    for (auto & [name, inst] : mapping) {
+        std::string val;
+        if (inst.meta.preset.get_option(COMMON_ARG_PRESET_PIN, val)) {
+            inst.meta.pinned = true;
+        }
+    }
+
     // load any autoload models
     std::vector<std::string> models_to_load;
     for (const auto & [name, inst] : mapping) {
@@ -383,7 +391,7 @@ void server_models::unload_lru() {
         for (const auto & m : mapping) {
             if (m.second.meta.is_active()) {
                 count_active++;
-                if (m.second.meta.last_used < lru_last_used) {
+                if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) {
                     lru_model_name = m.first;
                     lru_last_used = m.second.meta.last_used;
                 }
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 7e33537536..1f790bccc8 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -58,6 +58,7 @@ struct server_model_meta {
     std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
     int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
     int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
+    bool pinned = false; // if true, this model will not be unloaded by LRU
 
     bool is_active() const {
         return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;

From d9e65c3baf1a079d6762949ea9db958fa25c36b8 Mon Sep 17 00:00:00 2001
From: Michel Belleau <mbelleau@Michels-MacBook-Pro.local>
Date: Thu, 25 Dec 2025 17:50:57 -0500
Subject: [PATCH 2/3] Add warning when LRU eviction cannot succeed due to all
 models being pinned

When models_max limit is reached but all active models are pinned, log a
warning message to clarify that automatic unload cannot succeed.

Also document the `pin` preset-only option in tools/server/README.md.
---
 tools/server/README.md         | 1 +
 tools/server/server-models.cpp | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/tools/server/README.md b/tools/server/README.md
index 7d2f6f798e..43e7b066da 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1487,6 +1487,7 @@ The precedence rule for preset options is as follows:
 We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
 - `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
 - `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10)
+- `pin` (boolean): Prevents the router from unloading this model when the `models_max` limit is exceeded; the model remains loaded until explicitly unloaded or the server restarts.
 
 ### Routing requests
 
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 0647ad2e2c..af526dc73f 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -391,6 +391,8 @@ void server_models::unload_lru() {
         for (const auto & m : mapping) {
             if (m.second.meta.is_active()) {
                 count_active++;
+                // If all active models are pinned, this condition never holds and no LRU eviction will occur.
+                // The server will keep all pinned models in memory, potentially exceeding models_max.
                 if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) {
                     lru_model_name = m.first;
                     lru_last_used = m.second.meta.last_used;
@@ -408,6 +410,8 @@ void server_models::unload_lru() {
                 return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
             });
         }
+    } else if (count_active >= (size_t)base_params.models_max) {
+        SRV_WRN("models_max limit reached, but no unpinned models available for LRU eviction - automatic unload cannot succeed\n");
     }
 }
 

From 95d2017d283c95b83ccf22eeba2c1d2b5fb96565 Mon Sep 17 00:00:00 2001
From: Michel Belleau <mbelleau@Michels-MacBook-Pro.local>
Date: Fri, 26 Dec 2025 11:41:48 -0500
Subject: [PATCH 3/3] Throw error when models_max limit reached with no
 unpinned models for LRU eviction

---
 tools/server/server-models.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index af526dc73f..6b41d5f246 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -392,7 +392,7 @@ void server_models::unload_lru() {
             if (m.second.meta.is_active()) {
                 count_active++;
                 // If all active models are pinned, this condition never holds and no LRU eviction will occur.
-                // The server will keep all pinned models in memory, potentially exceeding models_max.
+                // We throw an error instead of allowing the server to exceed models_max.
                 if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) {
                     lru_model_name = m.first;
                     lru_last_used = m.second.meta.last_used;
@@ -411,7 +411,10 @@ void server_models::unload_lru() {
             });
         }
     } else if (count_active >= (size_t)base_params.models_max) {
-        SRV_WRN("models_max limit reached, but no unpinned models available for LRU eviction - automatic unload cannot succeed\n");
+        throw std::runtime_error(string_format(
+            "models_max limit (%d) reached, but no unpinned models available for LRU eviction - cannot load more models",
+            base_params.models_max
+        ));
     }
 }