server : add default-model preset and fallback logic

This commit is contained in:
Mikhail Shevtsov 2026-02-24 17:14:08 +01:00
parent b541241104
commit be9837b66e
5 changed files with 48 additions and 0 deletions

View File

@ -3853,6 +3853,12 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
[](common_params &, const std::string &) { /* unused */ }
).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
args.push_back(common_arg(
{"default-model"}, "NAME",
"in server router mode, this model will be used if model not found",
[](common_params &, const std::string &) { /* unused */ }
).set_env(COMMON_ARG_PRESET_DEFAULT_MODEL).set_preset_only());
args.push_back(common_arg(
{"stop-timeout"}, "SECONDS",
"in server router mode, force-kill model instance after this many seconds of graceful shutdown",

View File

@ -11,6 +11,7 @@
// pseudo-env variable to identify preset-only arguments
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
#define COMMON_ARG_PRESET_DEFAULT_MODEL "__PRESET_DEFAULT_MODEL"
//
// CLI argument parsing

View File

@ -1552,6 +1552,8 @@ The precedence rule for preset options is as follows:
We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
- `default-model` (string): The name of the model to use when no model is specified in a request or the requested model is not found.
When multiple `default-model` options are found, only the first one is used.
- `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10)
### Routing requests

View File

@ -306,6 +306,30 @@ void server_models::load_models() {
add_model(std::move(meta));
}
// determine default model if any
bool first_found = false;
std::string first_default;
for (const auto & [name, inst] : mapping) {
std::string val;
if (!inst.meta.preset.get_option(COMMON_ARG_PRESET_DEFAULT_MODEL, val)) {
continue;
}
if (!first_found) {
default_model_name = name;
SRV_INF("Default preset model: %s\n", name.c_str());
first_default = name;
first_found = true;
} else {
SRV_WRN(
"Multiple default models detected: '%s' and '%s'; "
"using '%s' as default\n",
name.c_str(),
first_default.c_str(),
first_default.c_str()
);
}
}
// log available models
{
std::unordered_set<std::string> custom_names;
@ -374,6 +398,15 @@ void server_models::load_models() {
}
}
// Resolve the effective model name for an incoming request.
// Keeps the requested name when it is non-empty and refers to a known model;
// otherwise falls back to the configured default model (if any). When no
// default is configured, the original requested name is returned unchanged so
// downstream validation can report it as not found.
std::string server_models::resolve_model_name(const std::string & requested) {
    const bool known = !requested.empty() && has_model(requested);
    if (known) {
        return requested;
    }
    if (default_model_name.empty()) {
        // no default configured — let the caller's validation handle it
        return requested;
    }
    return default_model_name;
}
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
@ -904,6 +937,7 @@ void server_models_routes::init_routes() {
this->proxy_get = [this](const server_http_req & req) {
std::string method = "GET";
std::string name = req.get_param("model");
name = models.resolve_model_name(name);
bool autoload = is_autoload(params, req);
auto error_res = std::make_unique<server_http_res>();
if (!router_validate_model(name, models, autoload, error_res)) {
@ -916,6 +950,7 @@ void server_models_routes::init_routes() {
std::string method = "POST";
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
name = models.resolve_model_name(name);
bool autoload = is_autoload(params, req);
auto error_res = std::make_unique<server_http_res>();
if (!router_validate_model(name, models, autoload, error_res)) {

View File

@ -97,6 +97,7 @@ private:
std::string bin_path;
std::vector<std::string> base_env;
common_preset base_preset; // base preset from llama-server CLI args
std::string default_model_name;
void update_meta(const std::string & name, const server_model_meta & meta);
@ -143,6 +144,9 @@ public:
// notify the router server that a model instance is ready
// return the monitoring thread (to be joined by the caller)
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
// Resolve model name: fallback to default if requested name is empty or not found
std::string resolve_model_name(const std::string & requested);
};
struct server_models_routes {