From be9837b66e9db1eb1f1d3868fbc3eca266a2ad9b Mon Sep 17 00:00:00 2001 From: Mikhail Shevtsov Date: Tue, 24 Feb 2026 17:14:08 +0100 Subject: [PATCH] server : add default-model preset and fallback logic --- common/arg.cpp | 6 ++++++ common/arg.h | 1 + tools/server/README.md | 2 ++ tools/server/server-models.cpp | 35 ++++++++++++++++++++++++++++++++++ tools/server/server-models.h | 4 ++++ 5 files changed, 48 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 41da8563d6..00e8d69816 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3853,6 +3853,12 @@ void common_params_add_preset_options(std::vector & args) { [](common_params &, const std::string &) { /* unused */ } ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only()); + args.push_back(common_arg( + {"default-model"}, "NAME", + "in server router mode, this model will be used if model not found", + [](common_params &, const std::string &) { /* unused */ } + ).set_env(COMMON_ARG_PRESET_DEFAULT_MODEL).set_preset_only()); + args.push_back(common_arg( {"stop-timeout"}, "SECONDS", "in server router mode, force-kill model instance after this many seconds of graceful shutdown", diff --git a/common/arg.h b/common/arg.h index 55782a158d..a6b627e609 100644 --- a/common/arg.h +++ b/common/arg.h @@ -11,6 +11,7 @@ // pseudo-env variable to identify preset-only arguments #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP" #define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT" +#define COMMON_ARG_PRESET_DEFAULT_MODEL "__PRESET_DEFAULT_MODEL" // // CLI argument parsing diff --git a/tools/server/README.md b/tools/server/README.md index da16ddc756..68f56bd3fb 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1552,6 +1552,8 @@ The precedence rule for preset options is as follows: We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments): - `load-on-startup` (boolean): Controls whether the model loads automatically 
when the server starts +- `default-model` (boolean): The model to use when no model is specified in a request or the model is not found. + When multiple `default-model` options are found only the first one will be used. - `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10) ### Routing requests diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index c13d48a363..61e962255c 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -306,6 +306,30 @@ void server_models::load_models() { add_model(std::move(meta)); } + // determine the default model, if any: the first entry met while iterating `mapping` whose preset yields the default-model option wins (only get_option() succeeding is checked; `val` itself is not inspected), later matches just log a warning + bool first_found = false; + std::string first_default; + for (const auto & [name, inst] : mapping) { + std::string val; + if (!inst.meta.preset.get_option(COMMON_ARG_PRESET_DEFAULT_MODEL, val)) { + continue; + } + if (!first_found) { + default_model_name = name; + SRV_INF("Default preset model: %s\n", name.c_str()); + first_default = name; + first_found = true; + } else { + SRV_WRN( + "Multiple default models detected: '%s' and '%s'; " + "using '%s' as default\n", + name.c_str(), + first_default.c_str(), + first_default.c_str() + ); + } + } + // log available models { std::unordered_set custom_names; @@ -374,6 +398,15 @@ void server_models::load_models() { } } +std::string server_models::resolve_model_name(const std::string & requested) { + // Pick the model name to route to: a non-empty name accepted by has_model() is returned unchanged. + if (!requested.empty() && has_model(requested)) { + return requested; + } + // Otherwise return default_model_name if it is non-empty; with no default, `requested` is returned as-is (it may be empty or unknown). + return default_model_name.empty() ? 
requested : default_model_name; +} + void server_models::update_meta(const std::string & name, const server_model_meta & meta) { std::lock_guard lk(mutex); auto it = mapping.find(name); @@ -904,6 +937,7 @@ void server_models_routes::init_routes() { this->proxy_get = [this](const server_http_req & req) { std::string method = "GET"; std::string name = req.get_param("model"); + name = models.resolve_model_name(name); bool autoload = is_autoload(params, req); auto error_res = std::make_unique(); if (!router_validate_model(name, models, autoload, error_res)) { @@ -916,6 +950,7 @@ void server_models_routes::init_routes() { std::string method = "POST"; json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); + name = models.resolve_model_name(name); bool autoload = is_autoload(params, req); auto error_res = std::make_unique(); if (!router_validate_model(name, models, autoload, error_res)) { diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 2b392f299a..77e9c1d45f 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -97,6 +97,7 @@ private: std::string bin_path; std::vector base_env; common_preset base_preset; // base preset from llama-server CLI args + std::string default_model_name; // assigned in load_models() from the default-model preset option; resolve_model_name() treats an empty value as "no default" void update_meta(const std::string & name, const server_model_meta & meta); @@ -143,6 +144,9 @@ public: // notify the router server that a model instance is ready // return the monitoring thread (to be joined by the caller) static std::thread setup_child_server(const std::function & shutdown_handler); + + // Resolve model name: returns `requested` if it is non-empty and has_model() accepts it, otherwise default_model_name when set, otherwise `requested` unchanged + std::string resolve_model_name(const std::string & requested); }; struct server_models_routes {