This commit is contained in:
Michel Belleau 2026-01-02 13:20:56 -03:00 committed by GitHub
commit bb4f331e98
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 25 additions and 6 deletions

View File

@ -3539,9 +3539,10 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
[](common_params &, int) { /* unused */ }
).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
// args.push_back(common_arg(
// {"pin"},
// "in server router mode, do not unload this model if models_max is exceeded",
// [](common_params &) { /* unused */ }
// ).set_preset_only());
args.push_back(common_arg(
{"pin"},
nullptr,
"in server router mode, do not unload this model if models_max is exceeded",
[](common_params &, const std::string &) { /* unused */ }
).set_env(COMMON_ARG_PRESET_PIN).set_preset_only());
}

View File

@ -11,6 +11,7 @@
// pseudo-env variable to identify preset-only arguments
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
#define COMMON_ARG_PRESET_PIN "__PRESET_PIN"
//
// CLI argument parsing

View File

@ -1487,6 +1487,7 @@ The precedence rule for preset options is as follows:
We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
- `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10)
- `pin` (boolean): Prevents the router from unloading this model when the `models_max` limit is exceeded; the model stays loaded until it is explicitly unloaded or the server restarts
### Routing requests

View File

@ -244,6 +244,14 @@ void server_models::load_models() {
}
}
// handle custom pin option
for (auto & [name, inst] : mapping) {
std::string val;
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_PIN, val)) {
inst.meta.pinned = true;
}
}
// load any autoload models
std::vector<std::string> models_to_load;
for (const auto & [name, inst] : mapping) {
@ -383,7 +391,9 @@ void server_models::unload_lru() {
for (const auto & m : mapping) {
if (m.second.meta.is_active()) {
count_active++;
if (m.second.meta.last_used < lru_last_used) {
// If all active models are pinned, this condition never holds and no LRU eviction will occur.
// We throw an error instead of allowing the server to exceed models_max.
if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) {
lru_model_name = m.first;
lru_last_used = m.second.meta.last_used;
}
@ -400,6 +410,11 @@ void server_models::unload_lru() {
return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
});
}
} else if (count_active >= (size_t)base_params.models_max) {
throw std::runtime_error(string_format(
"models_max limit (%d) reached, but no unpinned models available for LRU eviction - cannot load more models",
base_params.models_max
));
}
}

View File

@ -58,6 +58,7 @@ struct server_model_meta {
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
bool pinned = false; // if true, this model will not be unloaded by LRU
bool is_active() const {
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;