server : add default-model preset and fallback logic

This commit is contained in:
Mikhail Shevtsov 2026-02-24 17:14:08 +01:00
parent b541241104
commit be9837b66e
5 changed files with 48 additions and 0 deletions

View File

@ -3853,6 +3853,12 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
[](common_params &, const std::string &) { /* unused */ }
).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
args.push_back(common_arg(
{"default-model"}, "NAME",
"in server router mode, this model will be used if model not found",
[](common_params &, const std::string &) { /* unused */ }
).set_env(COMMON_ARG_PRESET_DEFAULT_MODEL).set_preset_only());
args.push_back(common_arg(
{"stop-timeout"}, "SECONDS",
"in server router mode, force-kill model instance after this many seconds of graceful shutdown",

View File

@ -11,6 +11,7 @@
// pseudo-env variable to identify preset-only arguments
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
#define COMMON_ARG_PRESET_DEFAULT_MODEL "__PRESET_DEFAULT_MODEL"
//
// CLI argument parsing

View File

@ -1552,6 +1552,8 @@ The precedence rule for preset options is as follows:
We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
- `default-model` (string): The name of the model to use when no model is specified in a request or the requested model is not found.
When multiple `default-model` options are found, only the first one is used.
- `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10)
### Routing requests

View File

@ -306,6 +306,30 @@ void server_models::load_models() {
add_model(std::move(meta));
}
// determine default model if any
bool first_found = false;
std::string first_default;
for (const auto & [name, inst] : mapping) {
std::string val;
if (!inst.meta.preset.get_option(COMMON_ARG_PRESET_DEFAULT_MODEL, val)) {
continue;
}
if (!first_found) {
default_model_name = name;
SRV_INF("Default preset model: %s\n", name.c_str());
first_default = name;
first_found = true;
} else {
SRV_WRN(
"Multiple default models detected: '%s' and '%s'; "
"using '%s' as default\n",
name.c_str(),
first_default.c_str(),
first_default.c_str()
);
}
}
// log available models
{
std::unordered_set<std::string> custom_names;
@ -374,6 +398,15 @@ void server_models::load_models() {
}
}
// Resolve the effective model name for an incoming request.
// Keeps the requested name when it is non-empty and refers to a known model;
// otherwise falls back to the configured default model (if any). When no
// default is configured, the original requested name is returned unchanged so
// downstream validation can report it as not found.
std::string server_models::resolve_model_name(const std::string & requested) {
    const bool known = !requested.empty() && has_model(requested);
    if (known) {
        return requested;
    }
    if (default_model_name.empty()) {
        // no default configured — let the caller's validation handle it
        return requested;
    }
    return default_model_name;
}
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
@ -904,6 +937,7 @@ void server_models_routes::init_routes() {
this->proxy_get = [this](const server_http_req & req) {
std::string method = "GET";
std::string name = req.get_param("model");
name = models.resolve_model_name(name);
bool autoload = is_autoload(params, req);
auto error_res = std::make_unique<server_http_res>();
if (!router_validate_model(name, models, autoload, error_res)) {
@ -916,6 +950,7 @@ void server_models_routes::init_routes() {
std::string method = "POST";
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
name = models.resolve_model_name(name);
bool autoload = is_autoload(params, req);
auto error_res = std::make_unique<server_http_res>();
if (!router_validate_model(name, models, autoload, error_res)) {

View File

@ -97,6 +97,7 @@ private:
std::string bin_path;
std::vector<std::string> base_env;
common_preset base_preset; // base preset from llama-server CLI args
std::string default_model_name;
void update_meta(const std::string & name, const server_model_meta & meta);
@ -143,6 +144,9 @@ public:
// notify the router server that a model instance is ready
// return the monitoring thread (to be joined by the caller)
static std::thread setup_child_server(const std::function<void(int)> & shutdown_handler);
// Resolve model name: fallback to default if requested name is empty or not found
std::string resolve_model_name(const std::string & requested);
};
struct server_models_routes {