Add warning when LRU eviction cannot succeed due to all models being pinned
When the models_max limit is reached but all active models are pinned, log a warning message to clarify that automatic unload cannot succeed. ... Also add documentation for the `pin` preset option.
This commit is contained in:
parent
8d594383a1
commit
d9e65c3baf
|
|
@ -1487,6 +1487,7 @@ The precedence rule for preset options is as follows:
|
|||
We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
|
||||
- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
|
||||
- `stop-timeout` (int, seconds): After an unload is requested, wait this many seconds before forcing termination (default: 10)
|
||||
- `pin` (boolean): Prevents the router from automatically unloading this model when the `models_max` limit is reached; the model remains loaded until it is explicitly unloaded or the server restarts.
|
||||
|
||||
### Routing requests
|
||||
|
||||
|
|
|
|||
|
|
@ -391,6 +391,8 @@ void server_models::unload_lru() {
|
|||
for (const auto & m : mapping) {
|
||||
if (m.second.meta.is_active()) {
|
||||
count_active++;
|
||||
// If all active models are pinned, this condition never holds and no LRU eviction will occur.
|
||||
// The server will keep all pinned models in memory, potentially exceeding models_max.
|
||||
if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) {
|
||||
lru_model_name = m.first;
|
||||
lru_last_used = m.second.meta.last_used;
|
||||
|
|
@ -408,6 +410,8 @@ void server_models::unload_lru() {
|
|||
return mapping[lru_model_name].meta.status == SERVER_MODEL_STATUS_UNLOADED;
|
||||
});
|
||||
}
|
||||
} else if (count_active >= (size_t)base_params.models_max) {
|
||||
SRV_WRN("models_max limit reached, but no unpinned models available for LRU eviction - automatic unload cannot succeed\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue