From f25bfaba4d73555cbc399e82c4d3b60824ee855d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 23 Nov 2025 14:59:04 +0100 Subject: [PATCH 1/8] expose args and exit_code in API --- tools/server/README.md | 10 +++++++--- tools/server/server-models.cpp | 10 ++++++---- tools/server/server-models.h | 4 +++- tools/server/server.cpp | 13 +++++++++---- 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index 3e311a657c..f22b57fee2 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1455,19 +1455,23 @@ The `status` object can be: ```json "status": { - "value": "loading" + "value": "loading", + "args": ["llama-server", "-ctx", "4096"] } ``` ```json "status": { - "value": "failed" + "value": "failed", + "args": ["llama-server", "-ctx", "4096"], + "exit_code": 1 } ``` ```json "status": { - "value": "loaded" + "value": "loaded", + "args": ["llama-server", "-ctx", "4096"] } ``` diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 071b5522ea..525f2bb347 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -368,11 +368,11 @@ void server_models::load(const std::string & name) { std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); - // TODO: add logging SRV_INF("%s", "spawning server instance with args:\n"); for (const auto & arg : child_args) { SRV_INF(" %s\n", arg.c_str()); } + inst.meta.args = child_args; // save for debugging std::vector argv = to_char_ptr_array(child_args); std::vector envp = to_char_ptr_array(child_env); @@ -405,9 +405,11 @@ void server_models::load(const std::string & name) { std::lock_guard lk(mutex); auto it = mapping.find(name); if (it != mapping.end()) { - it->second.meta.status = exit_code == 0 - ? SERVER_MODEL_STATUS_UNLOADED - : SERVER_MODEL_STATUS_FAILED; + auto & meta = it->second.meta; + meta.exit_code = exit_code; + meta.status = exit_code == 0 + ? 
SERVER_MODEL_STATUS_UNLOADED
+                : SERVER_MODEL_STATUS_FAILED;
         }
         cv.notify_all();
     }
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 3cb3b39fe7..222f31645e 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -58,7 +58,9 @@ struct server_model_meta {
     bool in_cache = false; // if true, use -hf; use -m otherwise
     int port = 0;
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
-    int64_t last_used = 0;
+    int64_t last_used = 0; // for LRU unloading
+    std::vector<std::string> args; // additional args passed to the model instance (used for debugging)
+    int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)
 
     bool is_active() const {
         return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 43d145fb67..c7dbd74e81 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -5160,7 +5160,7 @@ public:
         std::string name = json_value(body, "model", std::string());
         auto model = models->get_meta(name);
         if (!model.has_value()) {
-            res->error(format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
+            res->error(format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
             return res;
         }
         if (model->status == SERVER_MODEL_STATUS_LOADED) {
@@ -5188,15 +5188,20 @@ public:
         json models_json = json::array();
         auto all_models = models->get_all_meta();
         for (const auto & model : all_models) {
+            json status {
+                {"value", server_model_status_to_string(model.status)},
+                {"args", model.args},
+            };
+            if (model.status == SERVER_MODEL_STATUS_FAILED) {
+                status["exit_code"] = model.exit_code;
+            }
             models_json.push_back(json {
                 {"name", model.name},
                 {"id", model.name},
                 {"in_cache", model.in_cache},
                 {"path", model.path},
+                {"status", status},
                 // TODO: other fields...
- {"status", { - {"value", server_model_status_to_string(model.status)} - }}, }); } res->ok({{"data", models_json}}); From 7ef6312f85dca19c104380ca61d8735d26493115 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 23 Nov 2025 15:08:31 +0100 Subject: [PATCH 2/8] add note --- tools/server/server-models.cpp | 8 ++++++-- tools/server/server.cpp | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 525f2bb347..691ce746e6 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -154,7 +154,9 @@ server_models::server_models( /* in_cache */ true, /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, - /* last_used */ 0 + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0 }; mapping[meta.name] = instance_t{ /* subproc */ std::make_shared(), @@ -177,7 +179,9 @@ server_models::server_models( /* in_cache */ false, /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, - /* last_used */ 0 + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0 }; mapping[meta.name] = instance_t{ /* subproc */ std::make_shared(), diff --git a/tools/server/server.cpp b/tools/server/server.cpp index c7dbd74e81..4ec8aa879c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5820,6 +5820,8 @@ int main(int argc, char ** argv, char ** envp) { if (is_router_server) { LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); + LOG_INF("%s: NOTE: router mode is experimental\n", __func__); + LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__); ctx_http.is_ready.store(true); if (ctx_http.thread.joinable()) { ctx_http.thread.join(); // keep the main thread alive From f927e21ffc06491f8fe0906b6e18ba210483feeb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 23 Nov 2025 15:39:03 +0100 Subject: [PATCH 3/8] support extra_args on loading model --- tools/server/README.md | 10 ++++++---- tools/server/server-models.cpp | 20 ++++++++++++++++---- tools/server/server-models.h | 4 ++-- tools/server/server.cpp | 6 ++++-- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index f22b57fee2..13ac89617f 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1436,7 +1436,8 @@ Listing all models in cache. The model metadata will also include a field to ind "in_cache": true, "path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf", "status": { - "value": "loaded" + "value": "loaded", + "args": ["llama-server", "-ctx", "4096"] }, ... 
}] @@ -1477,14 +1478,16 @@ The `status` object can be: ### POST `/models/load`: Load a model - Load a model Payload: +- `model`: name of the model to be loaded +- `extra_args`: (optional) an array of additional arguments to be passed to the model instance ```json { - "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M" + "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", + "extra_args": ["-n", "128", "--top-k", "4"] } ``` @@ -1498,7 +1501,6 @@ Response: ### POST `/models/unload`: Unload a model - Unload a model Payload: diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 691ce746e6..285e1e7f7c 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -322,7 +322,7 @@ void server_models::unload_lru() { } } -void server_models::load(const std::string & name) { +void server_models::load(const std::string & name, const std::vector & extra_args) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } @@ -369,6 +369,11 @@ void server_models::load(const std::string & name) { child_args.push_back("--port"); child_args.push_back(std::to_string(inst.meta.port)); + // append extra args + for (const auto & arg : extra_args) { + child_args.push_back(arg); + } + std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); @@ -465,6 +470,10 @@ void server_models::unload_all() { } void server_models::update_status(const std::string & name, server_model_status status) { + // for now, we only allow updating to LOADED status + if (status != SERVER_MODEL_STATUS_LOADED) { + throw std::runtime_error("invalid status value"); + } auto meta = get_meta(name); if (meta.has_value()) { meta->status = status; @@ -493,7 +502,7 @@ bool server_models::ensure_model_loaded(const std::string & name) { return false; // already loaded } SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); - load(name); + load(name, {}); wait_until_loaded(name); { // check final status @@ -529,15 +538,18 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co return proxy; } -void server_models::setup_child_server(const std::string & host, int router_port, const std::string & name, std::function & shutdown_handler) { +void server_models::setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler) { // send a notification to the router server that a model instance is ready - httplib::Client cli(host, router_port); + httplib::Client cli(base_params.hostname, router_port); cli.set_connection_timeout(0, 200000); // 200 milliseconds httplib::Request req; req.method = "POST"; req.path = "/models/status"; req.set_header("Content-Type", "application/json"); + if (!base_params.api_keys.empty()) { + req.set_header("Authorization", "Bearer " + base_params.api_keys[0]); + } json body; body["model"] = name; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 222f31645e..e192d3dd6e 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -100,7 +100,7 @@ public: // return a copy of all model metadata std::vector get_all_meta(); - void load(const std::string & name); + void load(const std::string & name, const std::vector & extra_args); void unload(const std::string & name); void unload_all(); @@ -119,7 +119,7 @@ public: server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool 
update_last_used); // notify the router server that a model instance is ready - static void setup_child_server(const std::string & host, int router_port, const std::string & name, std::function & shutdown_handler); + static void setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler); }; /** diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 4ec8aa879c..ab825e24ba 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5158,6 +5158,7 @@ public: auto res = std::make_unique(ctx_server); json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); + std::vector extra_args = json_value(body, "extra_args", std::vector()); auto model = models->get_meta(name); if (!model.has_value()) { res->error(format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); @@ -5167,12 +5168,13 @@ public: res->error(format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models->load(name); + models->load(name, extra_args); res->ok({{"success", true}}); return res; }; // used by child process to notify the router about status change + // TODO @ngxson : maybe implement authentication for this endpoint in the future server_http_context::handler_t post_router_models_status = [this](const server_http_req & req) { auto res = std::make_unique(ctx_server); json body = json::parse(req.body); @@ -5836,7 +5838,7 @@ int main(int argc, char ** argv, char ** envp) { // optionally, notify router server that this instance is ready const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); if (router_port != nullptr) { - server_models::setup_child_server(params.hostname, std::atoi(router_port), params.model_alias, shutdown_handler); + server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler); } // this call blocks the main thread until queue_tasks.terminate() is called From 74685f41946a701ee25f8162a0966a96255d097c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 23 Nov 2025 15:42:33 +0100 Subject: [PATCH 4/8] allow reusing args if auto_load --- tools/server/server-models.cpp | 52 +++++++++++++++++++++------------- tools/server/server-models.h | 3 +- tools/server/server.cpp | 2 +- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 285e1e7f7c..cf81540f5a 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -322,7 +322,7 @@ void server_models::unload_lru() { } } -void server_models::load(const std::string & name, const std::vector & extra_args) { +void server_models::load(const std::string & name, const std::vector & extra_args, bool auto_load) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } @@ -352,26 +352,38 @@ void server_models::load(const std::string & name, const std::vector child_args = base_args; // copy - if (inst.meta.in_cache) { - child_args.push_back("-hf"); - child_args.push_back(inst.meta.name); - } else { - child_args.push_back("-m"); - child_args.push_back(inst.meta.path); - if (!inst.meta.path_mmproj.empty()) { - child_args.push_back("--mmproj"); - child_args.push_back(inst.meta.path_mmproj); + std::vector child_args; + if (auto_load && !meta.args.empty()) { + child_args = meta.args; // reuse previous args + // update port arg + for (size_t i = 0; i < child_args.size(); i++) { + if (child_args[i] == "--port" && 
i + 1 < child_args.size()) { + child_args[i + 1] = std::to_string(inst.meta.port); + break; + } } - } - child_args.push_back("--alias"); - child_args.push_back(inst.meta.name); - child_args.push_back("--port"); - child_args.push_back(std::to_string(inst.meta.port)); + } else { + child_args = base_args; // copy + if (inst.meta.in_cache) { + child_args.push_back("-hf"); + child_args.push_back(inst.meta.name); + } else { + child_args.push_back("-m"); + child_args.push_back(inst.meta.path); + if (!inst.meta.path_mmproj.empty()) { + child_args.push_back("--mmproj"); + child_args.push_back(inst.meta.path_mmproj); + } + } + child_args.push_back("--alias"); + child_args.push_back(inst.meta.name); + child_args.push_back("--port"); + child_args.push_back(std::to_string(inst.meta.port)); - // append extra args - for (const auto & arg : extra_args) { - child_args.push_back(arg); + // append extra args + for (const auto & arg : extra_args) { + child_args.push_back(arg); + } } std::vector child_env = base_env; // copy @@ -502,7 +514,7 @@ bool server_models::ensure_model_loaded(const std::string & name) { return false; // already loaded } SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); - load(name, {}); + load(name, {}, true); wait_until_loaded(name); { // check final status diff --git a/tools/server/server-models.h b/tools/server/server-models.h index e192d3dd6e..ed08c5023e 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -100,7 +100,8 @@ public: // return a copy of all model metadata std::vector get_all_meta(); - void load(const std::string & name, const std::vector & extra_args); + // if auto_load is true, load the model with previous args if any + void load(const std::string & name, const std::vector & extra_args, bool auto_load); void unload(const std::string & name); void unload_all(); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index ab825e24ba..bf06cc5133 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5168,7 +5168,7 @@ public: res->error(format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models->load(name, extra_args); + models->load(name, extra_args, false); res->ok({{"success", true}}); return res; }; From f95f9c51289573d338a2c0db10e0814a831ee0e4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 23 Nov 2025 16:14:02 +0100 Subject: [PATCH 5/8] typo docs --- tools/server/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index 13ac89617f..62ae83a1f0 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1385,14 +1385,14 @@ models_directory │ └─ mmproj-F16.gguf # file name must start with "mmproj" │ │ # multi-shard - ├─ gemma-3-4b-it-Q8_0 + ├─ Kimi-K2-Thinking-UD-IQ1_S │ ├─ Kimi-K2-Thinking-UD-IQ1_S-00001-of-00006.gguf │ ├─ Kimi-K2-Thinking-UD-IQ1_S-00002-of-00006.gguf │ ├─ ... 
│ └─ Kimi-K2-Thinking-UD-IQ1_S-00006-of-00006.gguf ``` -You may also specify default arguments that will be passed to every loaded model instance: +You may also specify default arguments that will be passed to every model instance: ```sh llama-server -ctx 8192 -n 1024 -np 2 From 2e355c7f8e60b9a67f3b0a6f3966467e941d8177 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 23 Nov 2025 17:25:24 +0100 Subject: [PATCH 6/8] oai-compat /models endpoint --- tools/server/server.cpp | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index bf06cc5133..6a499d9577 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5142,6 +5142,11 @@ public: server_http_context::handler_t proxy_get = [this](const server_http_req & req) { std::string method = "GET"; std::string name = req.get_param("model"); + if (name.empty()) { + auto res = std::make_unique(ctx_server); + res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); + return std::unique_ptr(std::move(res)); + } models->ensure_model_loaded(name); return models->proxy_request(req, method, name, false); }; @@ -5150,6 +5155,11 @@ public: std::string method = "POST"; json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); + if (name.empty()) { + auto res = std::make_unique(ctx_server); + res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); + return std::unique_ptr(std::move(res)); + } models->ensure_model_loaded(name); return models->proxy_request(req, method, name, true); // update last usage for POST request only }; @@ -5189,6 +5199,7 @@ public: auto res = std::make_unique(ctx_server); json models_json = json::array(); auto all_models = models->get_all_meta(); + std::time_t t = std::time(0); for (const auto & model : all_models) { json status { {"value", server_model_status_to_string(model.status)}, @@ -5198,15 +5209,21 @@ public: status["exit_code"] = model.exit_code; } models_json.push_back(json { - {"name", model.name}, {"id", model.name}, + {"name", model.name}, + {"object", "model"}, // for OAI-compat + {"owned_by", "llamacpp"}, // for OAI-compat + {"created", t}, // for OAI-compat {"in_cache", model.in_cache}, {"path", model.path}, {"status", status}, - // TODO: other fields... 
+ // TODO: add other fields, may require reading GGUF metadata }); } - res->ok({{"data", models_json}}); + res->ok({ + {"data", models_json}, + {"object", "list"}, + }); return res; }; From 5ad594e6d65610f3dbcd92801cd051d416740294 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 23 Nov 2025 19:02:07 +0100 Subject: [PATCH 7/8] cleaner --- common/arg.cpp | 15 +++++-- common/common.h | 3 +- tools/server/README.md | 3 +- tools/server/server-models.cpp | 72 +++++++++++++++++++++------------- tools/server/server-models.h | 19 ++++----- tools/server/server.cpp | 54 +++++++++++++++++-------- 6 files changed, 107 insertions(+), 59 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index eab26b67f2..062046c0d0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2482,12 +2482,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR")); add_opt(common_arg( - {"--max-models"}, "N", - string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.max_models), + {"--models-max"}, "N", + string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max), [](common_params & params, int value) { - params.max_models = value; + params.models_max = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MAX_MODELS")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); + add_opt(common_arg( + {"--no-models-autoload"}, + "disables automatic loading of models (default: enabled)", + [](common_params & params) { + params.models_autoload = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD")); add_opt(common_arg( {"--jinja"}, "use jinja template for chat (default: disabled)", diff --git a/common/common.h b/common/common.h index 20ba209ce4..4ac9700d7b 100644 --- a/common/common.h +++ b/common/common.h @@ -460,7 +460,8 @@ struct common_params { // router server configs std::string models_dir = ""; // directory containing models for the router server - int max_models = 4; // maximum number of models to load simultaneously + int models_max = 4; // maximum number of models to load simultaneously + bool models_autoload = true; // automatically load models when requested via the router server bool log_json = false; diff --git a/tools/server/README.md b/tools/server/README.md index 62ae83a1f0..bc1a4f8f7a 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1463,8 +1463,9 @@ The `status` object can be: ```json "status": { - "value": "failed", + "value": "unloaded", "args": ["llama-server", "-ctx", "4096"], + "failed": true, "exit_code": 1 } ``` diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index cf81540f5a..67f84a508f 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -29,6 +29,8 @@ #include #endif +#define CMD_EXIT "exit" + static std::filesystem::path get_server_exec_path() { #if defined(_WIN32) wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths @@ -297,10 +299,10 @@ std::vector server_models::get_all_meta() { } void server_models::unload_lru() { - if (base_params.max_models <= 0) { + if (base_params.models_max <= 0) { return; // no limit } - // remove one of the servers if we passed the max_models (least recently used - LRU) + // remove one of the servers if we passed the models_max (least recently used - LRU) 
std::string lru_model_name = ""; int64_t lru_last_used = ggml_time_ms(); size_t count_active = 0; @@ -316,8 +318,8 @@ void server_models::unload_lru() { } } } - if (!lru_model_name.empty() && count_active >= (size_t)base_params.max_models) { - SRV_INF("max_models limit reached, removing LRU name=%s\n", lru_model_name.c_str()); + if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) { + SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str()); unload(lru_model_name); } } @@ -331,7 +333,7 @@ void server_models::load(const std::string & name, const std::vector lk(mutex); auto meta = mapping[name].meta; - if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) { + if (meta.status != SERVER_MODEL_STATUS_UNLOADED) { SRV_INF("model %s is not ready\n", name.c_str()); return; } @@ -428,9 +430,7 @@ void server_models::load(const std::string & name, const std::vectorsecond.meta; meta.exit_code = exit_code; - meta.status = exit_code == 0 - ? SERVER_MODEL_STATUS_UNLOADED - : SERVER_MODEL_STATUS_FAILED; + meta.status = SERVER_MODEL_STATUS_UNLOADED; } cv.notify_all(); } @@ -446,13 +446,23 @@ void server_models::load(const std::string & name, const std::vector lk(mutex); auto it = mapping.find(name); if (it != mapping.end()) { if (it->second.meta.is_active()) { SRV_INF("unloading model instance name=%s\n", name.c_str()); - subprocess_terminate(it->second.subproc.get()); + interrupt_subprocess(it->second.subproc.get()); // status change will be handled by the managing thread } else { SRV_WRN("model instance name=%s is not loaded\n", name.c_str()); @@ -467,7 +477,7 @@ void server_models::unload_all() { for (auto & [name, inst] : mapping) { if (inst.meta.is_active()) { SRV_INF("unloading model instance name=%s\n", name.c_str()); - subprocess_terminate(inst.subproc.get()); + interrupt_subprocess(inst.subproc.get()); // status change will be handled by the managing thread } // moving the thread to join list to avoid deadlock @@ -498,8 +508,7 @@ void server_models::wait_until_loaded(const std::string & name) { cv.wait(lk, [this, &name]() { auto it = mapping.find(name); if (it != mapping.end()) { - return it->second.meta.status == SERVER_MODEL_STATUS_LOADED || - it->second.meta.status == SERVER_MODEL_STATUS_FAILED; + return it->second.meta.status != SERVER_MODEL_STATUS_LOADING; } return false; }); @@ -510,19 +519,23 @@ bool server_models::ensure_model_loaded(const std::string & name) { if (!meta.has_value()) { throw std::runtime_error("model name=" + name + " is not found"); } - if (meta->is_active()) { + if (meta->status == SERVER_MODEL_STATUS_LOADED) { return false; // already loaded } - SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); - load(name, {}, true); - wait_until_loaded(name); - { - // check final status - meta = get_meta(name); - if (!meta.has_value() || meta->status == SERVER_MODEL_STATUS_FAILED) { - throw std::runtime_error("model name=" + name + " failed to load"); - } + if (meta->status == SERVER_MODEL_STATUS_UNLOADED) { + SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); + load(name, {}, true); } + + SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str()); + wait_until_loaded(name); + + // check final status + meta = get_meta(name); + if (!meta.has_value() || meta->is_failed()) { + throw std::runtime_error("model name=" + name + " failed to load"); + } + return true; } @@ -582,13 +595,18 @@ void server_models::setup_child_server(const common_params & 
base_params, int ro // wait for EOF on stdin SRV_INF("%s", "child server monitoring thread started, waiting for EOF on stdin...\n"); while (true) { - int c = getchar(); - if (c == EOF) { - break; + std::string line; + if (!std::getline(std::cin, line)) { + break; // EOF detected + } + if (line.find(CMD_EXIT) != std::string::npos) { + SRV_INF("%s", "exit command received, exiting...\n"); + shutdown_handler(0); } } - SRV_INF("%s", "EOF on stdin detected, invoking shutdown handler...\n"); - shutdown_handler(0); // invoke shutdown handler + // EOF meaning router server is unexpectedly exit or killed + SRV_INF("%s", "EOF on stdin detected, forcing shutdown...\n"); + exit(1); }).detach(); } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index ed08c5023e..c49cb7c62c 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -15,16 +15,16 @@ * state diagram: * * UNLOADED ──► LOADING ──► LOADED - * ▲ │ - * │ │ - * FAILED ◄───────┘ + * ▲ │ │ + * └───failed───┘ │ + * ▲ │ + * └────────unloaded─────────┘ */ enum server_model_status { - // TODO: also add downloading state + // TODO: also add downloading state when the logic is added SERVER_MODEL_STATUS_UNLOADED, SERVER_MODEL_STATUS_LOADING, - SERVER_MODEL_STATUS_LOADED, - SERVER_MODEL_STATUS_FAILED + SERVER_MODEL_STATUS_LOADED }; static server_model_status server_model_status_from_string(const std::string & status_str) { @@ -34,8 +34,6 @@ static server_model_status server_model_status_from_string(const std::string & s return SERVER_MODEL_STATUS_LOADING; } else if (status_str == "loaded") { return SERVER_MODEL_STATUS_LOADED; - } else if (status_str == "failed") { - return SERVER_MODEL_STATUS_FAILED; } else { throw std::runtime_error("invalid server model status"); } @@ -46,7 +44,6 @@ static std::string server_model_status_to_string(server_model_status status) { case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; case SERVER_MODEL_STATUS_LOADING: return "loading"; case SERVER_MODEL_STATUS_LOADED: return "loaded"; - case SERVER_MODEL_STATUS_FAILED: return "failed"; default: return "unknown"; } } @@ -65,6 +62,10 @@ struct server_model_meta { bool is_active() const { return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING; } + + bool is_failed() const { + return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0; + } }; struct server_models { diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 6a499d9577..efd9aee83f 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5142,10 +5142,9 @@ public: server_http_context::handler_t proxy_get = [this](const server_http_req & req) { std::string method = "GET"; std::string name = req.get_param("model"); - if (name.empty()) { - auto res = std::make_unique(ctx_server); - res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); - return std::unique_ptr(std::move(res)); + auto error_res = std::make_unique(ctx_server); + if (!router_validate_model(name, error_res)) { + return std::unique_ptr(std::move(error_res)); } models->ensure_model_loaded(name); return models->proxy_request(req, method, name, false); @@ -5155,10 +5154,9 @@ public: std::string method = "POST"; json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); - if (name.empty()) { - auto res = std::make_unique(ctx_server); - res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); - return 
std::unique_ptr(std::move(res)); + auto error_res = std::make_unique(ctx_server); + if (!router_validate_model(name, error_res)) { + return std::unique_ptr(std::move(error_res)); } models->ensure_model_loaded(name); return models->proxy_request(req, method, name, true); // update last usage for POST request only @@ -5200,22 +5198,23 @@ public: json models_json = json::array(); auto all_models = models->get_all_meta(); std::time_t t = std::time(0); - for (const auto & model : all_models) { + for (const auto & meta : all_models) { json status { - {"value", server_model_status_to_string(model.status)}, - {"args", model.args}, + {"value", server_model_status_to_string(meta.status)}, + {"args", meta.args}, }; - if (model.status == SERVER_MODEL_STATUS_FAILED) { - status["exit_code"] = model.exit_code; + if (meta.is_failed()) { + status["exit_code"] = meta.exit_code; + status["failed"] = true; } models_json.push_back(json { - {"id", model.name}, - {"name", model.name}, + {"id", meta.name}, + {"name", meta.name}, {"object", "model"}, // for OAI-compat {"owned_by", "llamacpp"}, // for OAI-compat {"created", t}, // for OAI-compat - {"in_cache", model.in_cache}, - {"path", model.path}, + {"in_cache", meta.in_cache}, + {"path", meta.path}, {"status", status}, // TODO: add other fields, may require reading GGUF metadata }); @@ -5595,6 +5594,27 @@ private: res->ok(root); return res; } + + bool router_validate_model(const std::string & name, std::unique_ptr & res) { + if (name.empty()) { + res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + auto meta = models->get_meta(name); + if (!meta.has_value()) { + res->error(format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + if (params.models_autoload) { + models->ensure_model_loaded(name); + } else { + if (meta->status != SERVER_MODEL_STATUS_LOADED) { + res->error(format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + } + return true; + } }; std::function shutdown_handler; From d65be9170bbd566f84387036557e1014c45b99d9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sun, 23 Nov 2025 19:31:21 +0100 Subject: [PATCH 8/8] address review comments --- tools/server/README.md | 45 ++++++++++++++++++++++------------ tools/server/server-models.cpp | 31 ++++++++++++++++------- tools/server/server.cpp | 5 ++-- 3 files changed, 54 insertions(+), 27 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index bc1a4f8f7a..24984d8696 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -30,9 +30,10 @@ The project is under active development, and we are [looking for feedback and co | -------- | ----------- | | `-h, --help, --usage` | print usage and exit | | `--version` | show version and build info | +| `-cl, --cache-list` | show list of models in cache | | `--completion-bash` | print source-able bash completion script for llama.cpp | | `--verbose-prompt` | print a verbose prompt before generation (default: false) | -| `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | +| `-t, --threads N` | number of CPU threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | | `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | @@ -51,7 +52,7 @@ The project is under active development, and we are [looking for feedback and co | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `--swa-full` | use full-size SWA cache (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
(env: LLAMA_ARG_SWA_FULL) | | `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)
(env: LLAMA_ARG_KV_SPLIT) | -| `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | +| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
(env: LLAMA_ARG_FLASH_ATTN) | | `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--no-escape` | do not process escape sequences | @@ -61,11 +62,12 @@ The project is under active development, and we are [looking for feedback and co | `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | | `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | | `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | -| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | -| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | -| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | | `-nr, --no-repack` | disable weight repacking
(env: LLAMA_ARG_NO_REPACK) | +| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | @@ -78,7 +80,7 @@ The project is under active development, and we are [looking for feedback and co | `--override-tensor, -ot =,...` | override tensor buffer type | | `--cpu-moe, -cmoe` | keep all Mixture of Experts (MoE) weights in the CPU
(env: LLAMA_ARG_CPU_MOE) | | `--n-cpu-moe, -ncmoe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU
(env: LLAMA_ARG_N_CPU_MOE) | -| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | +| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)
(env: LLAMA_ARG_N_GPU_LAYERS) | | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
(env: LLAMA_ARG_SPLIT_MODE) | | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
(env: LLAMA_ARG_TENSOR_SPLIT) | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | @@ -90,8 +92,9 @@ The project is under active development, and we are [looking for feedback and co | `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | | `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | -| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | +| `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | +| `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | | `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | @@ -100,7 +103,7 @@ The project is under active development, and we are [looking for feedback and co | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | | `--log-disable` | Log disable | | `--log-file FNAME` | Log to file | -| `--log-colors` | Enable colored logging
(env: LLAMA_LOG_COLORS) | +| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) | | `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) | | `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) | | `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.
(env: LLAMA_LOG_VERBOSITY) | @@ -151,7 +154,8 @@ The project is under active development, and we are [looking for feedback and co | Argument | Explanation | | -------- | ----------- | -| `--swa-checkpoints N` | max number of SWA checkpoints per slot to create (default: 3)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_SWA_CHECKPOINTS) | +| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `--no-context-shift` | disables context shift on infinite text generation (default: enabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | | `--context-shift` | enables context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode
| @@ -165,6 +169,8 @@ The project is under active development, and we are [looking for feedback and co | `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | | `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf
(env: LLAMA_ARG_NO_MMPROJ) | | `--no-mmproj-offload` | do not offload multimodal projector to GPU
(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) | +| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | +| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `--override-tensor-draft, -otd =,...` | override tensor buffer type for draft model | | `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_CPU_MOE_DRAFT) | | `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_N_CPU_MOE_DRAFT) | @@ -189,13 +195,16 @@ The project is under active development, and we are [looking for feedback and co | `--slots` | enable slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | +| `--models-dir PATH` | directory containing models for the router server (default: disabled)
(env: LLAMA_ARG_MODELS_DIR) | +| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)
(env: LLAMA_ARG_MODELS_MAX) | +| `--no-models-autoload` | disables automatic loading of models (default: enabled)
(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) | | `--jinja` | use jinja template for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | -| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: deepseek)
(env: LLAMA_ARG_THINK) | +| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | -| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| +| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) | | `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) | @@ -209,15 +218,17 @@ The project is under active development, and we are [looking for feedback and co | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | | `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) | | `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall | -| `--embd-bge-small-en-default` | use default bge-small-en-v1.5 model (note: can download weights from the internet) | -| `--embd-e5-small-en-default` | use default e5-small-v2 model (note: can download weights from the internet) | -| `--embd-gte-small-default` | use default gte-small model (note: can download weights from the internet) | +| `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) | | `--fim-qwen-1.5b-default` | use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet) | | `--fim-qwen-3b-default` | use default Qwen 2.5 Coder 3B (note: can download weights from the internet) | | `--fim-qwen-7b-default` | use default Qwen 2.5 Coder 7B (note: can download weights from the internet) | | `--fim-qwen-7b-spec` | use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet) | | `--fim-qwen-14b-spec` | use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet) | | `--fim-qwen-30b-default` | use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet) | +| `--gpt-oss-20b-default` | use gpt-oss-20b (note: can download weights from the internet) | +| `--gpt-oss-120b-default` | use gpt-oss-120b (note: can download weights from the internet) | +| `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) | +| `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) | Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. @@ -1424,6 +1435,8 @@ For **GET** endpoints (`/props`, `/metrics`, etc.) The router uses the `model` q GET /props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M ``` +By default, the model will be loaded automatically if it's not loaded. To disable this, add `--no-models-autoload` when starting the server. + ### GET `/models`: List available models Listing all models in cache. 
The model metadata will also include a field to indicate the status of the model: diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 67f84a508f..6ab0a9c226 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -408,6 +408,7 @@ void server_models::load(const std::string & name, const std::vector(std::move(error_res)); } - models->ensure_model_loaded(name); return models->proxy_request(req, method, name, false); }; @@ -5158,7 +5157,6 @@ public: if (!router_validate_model(name, error_res)) { return std::unique_ptr(std::move(error_res)); } - models->ensure_model_loaded(name); return models->proxy_request(req, method, name, true); // update last usage for POST request only }; @@ -5713,7 +5711,10 @@ int main(int argc, char ** argv, char ** envp) { routes.models.reset(new server_models(params, argc, argv, envp)); // proxy handlers + // note: routes.get_health stays the same + routes.get_metrics = routes.proxy_get; routes.post_props = routes.proxy_post; + routes.get_api_show = routes.proxy_get; routes.post_completions = routes.proxy_post; routes.post_completions_oai = routes.proxy_post; routes.post_chat_completions = routes.proxy_post;
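The patches above extend the router API: `POST /models/load` now accepts `extra_args`, and `GET /models` reports a richer `status` object (`value`, `args`, and, after PATCH 7/8, `failed`/`exit_code` when an instance exited abnormally). As a rough illustration only — assuming a router listening on `http://localhost:8080` with no API key, the model name taken from the README examples, and the response shapes exactly as documented in these patches — a client-side flow could look like the following Python sketch; none of this code is part of the patch itself.

```python
# Minimal sketch of a client for the router endpoints documented above.
# Assumptions (not from the patch): router at http://localhost:8080, no API key,
# model name as used in the README examples.
import json
import time
import urllib.request

BASE = "http://localhost:8080"
MODEL = "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"  # example name from the README

def post(path: str, payload: dict) -> dict:
    req = urllib.request.Request(
        BASE + path,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read())

def get(path: str) -> dict:
    with urllib.request.urlopen(BASE + path) as resp:
        return json.loads(resp.read())

# Ask the router to spawn an instance, forwarding extra CLI args to the child
# process (same payload as the /models/load example added in PATCH 3).
post("/models/load", {"model": MODEL, "extra_args": ["-n", "128", "--top-k", "4"]})

# Poll GET /models until the instance reports "loaded", or bail out on failure.
for _ in range(60):
    entry = next(m for m in get("/models")["data"] if m["id"] == MODEL)
    status = entry["status"]
    print(status["value"], status.get("args"))
    if status["value"] == "loaded":
        break
    if status.get("failed"):  # after PATCH 7/8: "unloaded" plus failed/exit_code
        raise RuntimeError(f"load failed, exit_code={status.get('exit_code')}")
    time.sleep(1)
```

With `--no-models-autoload` set, the explicit `/models/load` step becomes mandatory; otherwise, per the README note added in PATCH 8, simply proxying a request such as `POST /v1/chat/completions` with a `model` field makes the router load the model automatically.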