Add pin option to protect models from LRU eviction in router mode
- Add COMMON_ARG_PRESET_PIN define - Add pin preset option for model protection - Add pinned field to server_model_meta - Modify load_models to set pinned flag - Update unload_lru to skip pinned models
This commit is contained in:
parent
b07cda687c
commit
8d594383a1
|
|
@ -3524,9 +3524,10 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
|
|||
[](common_params &, int) { /* unused */ }
|
||||
).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
|
||||
|
||||
// args.push_back(common_arg(
|
||||
// {"pin"},
|
||||
// "in server router mode, do not unload this model if models_max is exceeded",
|
||||
// [](common_params &) { /* unused */ }
|
||||
// ).set_preset_only());
|
||||
args.push_back(common_arg(
|
||||
{"pin"},
|
||||
nullptr,
|
||||
"in server router mode, do not unload this model if models_max is exceeded",
|
||||
[](common_params &, const std::string &) { /* unused */ }
|
||||
).set_env(COMMON_ARG_PRESET_PIN).set_preset_only());
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@
|
|||
// pseudo-env variable to identify preset-only arguments
|
||||
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
|
||||
#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
|
||||
#define COMMON_ARG_PRESET_PIN "__PRESET_PIN"
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
|
|
|
|||
|
|
@ -244,6 +244,14 @@ void server_models::load_models() {
|
|||
}
|
||||
}
|
||||
|
||||
// handle custom pin option
|
||||
for (auto & [name, inst] : mapping) {
|
||||
std::string val;
|
||||
if (inst.meta.preset.get_option(COMMON_ARG_PRESET_PIN, val)) {
|
||||
inst.meta.pinned = true;
|
||||
}
|
||||
}
|
||||
|
||||
// load any autoload models
|
||||
std::vector<std::string> models_to_load;
|
||||
for (const auto & [name, inst] : mapping) {
|
||||
|
|
@ -383,7 +391,7 @@ void server_models::unload_lru() {
|
|||
for (const auto & m : mapping) {
|
||||
if (m.second.meta.is_active()) {
|
||||
count_active++;
|
||||
if (m.second.meta.last_used < lru_last_used) {
|
||||
if (!m.second.meta.pinned && m.second.meta.last_used < lru_last_used) {
|
||||
lru_model_name = m.first;
|
||||
lru_last_used = m.second.meta.last_used;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,6 +58,7 @@ struct server_model_meta {
|
|||
std::vector<std::string> args; // args passed to the model instance, will be populated by render_args()
|
||||
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
|
||||
int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown
|
||||
bool pinned = false; // if true, this model will not be unloaded by LRU
|
||||
|
||||
bool is_active() const {
|
||||
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
|
||||
|
|
|
|||
Loading…
Reference in New Issue