From a82dbbfb30c7777569685367a06fe0ea7170b75a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 29 Nov 2025 23:00:35 +0100
Subject: [PATCH] decouple server_models from server_routes

---
 tools/server/server-models.cpp | 195 +++++++++++++++++++++++++++++++++
 tools/server/server-models.h   |  19 ++++
 tools/server/server.cpp        |  53 ++++-----
 3 files changed, 242 insertions(+), 25 deletions(-)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 4ec322a6f4..7f62dc4edb 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -621,6 +621,201 @@ void server_models::setup_child_server(const common_params & base_params, int ro
 }
 
+
+//
+// server_models_routes
+//
+
+static void res_ok(std::unique_ptr<server_http_res> & res, const json & response_data) {
+    res->status = 200;
+    res->data = safe_json_to_str(response_data);
+}
+
+static void res_error(std::unique_ptr<server_http_res> & res, const json & error_data) {
+    res->status = json_value(error_data, "code", 500);
+    res->data = safe_json_to_str({{ "error", error_data }});
+}
+
+static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
+    if (name.empty()) {
+        res_error(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
+        return false;
+    }
+    auto meta = models.get_meta(name);
+    if (!meta.has_value()) {
+        res_error(res, format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
+        return false;
+    }
+    if (models_autoload) {
+        models.ensure_model_loaded(name);
+    } else {
+        if (meta->status != SERVER_MODEL_STATUS_LOADED) {
+            res_error(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
+            return false;
+        }
+    }
+    return true;
+}
+
+void server_models_routes::init_routes() {
+    this->get_router_props = [this](const server_http_req & req) {
+        std::string name = req.get_param("model");
+        if (name.empty()) {
+            // main instance
+            auto res = std::make_unique<server_http_res>();
+            res_ok(res, {
+                // TODO: add support for this on web UI
+                {"role", "router"},
+                {"max_instances", 4}, // dummy value for testing
+                // this is a dummy response to make sure webui doesn't break
+                {"model_alias", "llama-server"},
+                {"model_path", "none"},
+                {"default_generation_settings", {
+                    {"params", json{}},
+                    {"n_ctx", 0},
+                }},
+            });
+            return res;
+        }
+        return proxy_get(req);
+    };
+
+    this->proxy_get = [this](const server_http_req & req) {
+        std::string method = "GET";
+        std::string name = req.get_param("model");
+        auto error_res = std::make_unique<server_http_res>();
+        if (!router_validate_model(name, models, params.models_autoload, error_res)) {
+            return error_res;
+        }
+        return models.proxy_request(req, method, name, false);
+    };
+
+    this->proxy_post = [this](const server_http_req & req) {
+        std::string method = "POST";
+        json body = json::parse(req.body);
+        std::string name = json_value(body, "model", std::string());
+        auto error_res = std::make_unique<server_http_res>();
+        if (!router_validate_model(name, models, params.models_autoload, error_res)) {
+            return error_res;
+        }
+        return models.proxy_request(req, method, name, true); // update last usage for POST request only
+    };
+
+    this->get_router_models = [this](const server_http_req &) {
+        auto res = std::make_unique<server_http_res>();
+        json models_json = json::array();
+        auto all_models = models.get_all_meta();
+        std::time_t t = std::time(0);
+        for (const auto & meta : all_models) {
+            json status {
+                {"value", server_model_status_to_string(meta.status)},
+                {"args", meta.args},
+            };
+            if (meta.is_failed()) {
status["exit_code"] = meta.exit_code; + status["failed"] = true; + } + models_json.push_back(json { + {"id", meta.name}, + {"object", "model"}, // for OAI-compat + {"owned_by", "llamacpp"}, // for OAI-compat + {"created", t}, // for OAI-compat + {"in_cache", meta.in_cache}, + {"path", meta.path}, + {"status", status}, + // TODO: add other fields, may require reading GGUF metadata + }); + } + res_ok(res, { + {"data", models_json}, + {"object", "list"}, + }); + return res; + }; + + this->post_router_models_load = [this](const server_http_req & req) { + auto res = std::make_unique(); + json body = json::parse(req.body); + std::string name = json_value(body, "model", std::string()); + auto model = models.get_meta(name); + if (!model.has_value()) { + res_error(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); + return res; + } + if (model->status == SERVER_MODEL_STATUS_LOADED) { + res_error(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + models.load(name, false); + res_ok(res, {{"success", true}}); + return res; + }; + + // used by child process to notify the router about status change + // TODO @ngxson : maybe implement authentication for this endpoint in the future + this->post_router_models_status = [this](const server_http_req & req) { + auto res = std::make_unique(); + json body = json::parse(req.body); + std::string model = json_value(body, "model", std::string()); + std::string value = json_value(body, "value", std::string()); + models.update_status(model, server_model_status_from_string(value)); + res_ok(res, {{"success", true}}); + return res; + }; + + this->get_router_models = [this](const server_http_req &) { + auto res = std::make_unique(); + json models_json = json::array(); + auto all_models = models.get_all_meta(); + std::time_t t = std::time(0); + for (const auto & meta : all_models) { + json status { + {"value", server_model_status_to_string(meta.status)}, + {"args", meta.args}, + }; + if (meta.is_failed()) { + status["exit_code"] = meta.exit_code; + status["failed"] = true; + } + models_json.push_back(json { + {"id", meta.name}, + {"object", "model"}, // for OAI-compat + {"owned_by", "llamacpp"}, // for OAI-compat + {"created", t}, // for OAI-compat + {"in_cache", meta.in_cache}, + {"path", meta.path}, + {"status", status}, + // TODO: add other fields, may require reading GGUF metadata + }); + } + res_ok(res, { + {"data", models_json}, + {"object", "list"}, + }); + return res; + }; + + this->post_router_models_unload = [this](const server_http_req & req) { + auto res = std::make_unique(); + json body = json::parse(req.body); + std::string name = json_value(body, "model", std::string()); + auto model = models.get_meta(name); + if (!model.has_value()) { + res_error(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + if (model->status != SERVER_MODEL_STATUS_LOADED) { + res_error(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + models.unload(name); + res_ok(res, {{"success", true}}); + return res; + }; +} + + + // // server_http_proxy // diff --git a/tools/server/server-models.h b/tools/server/server-models.h index dd8487ca21..552e750f58 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -125,6 +125,25 @@ public: static void setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler); }; +struct 
+struct server_models_routes {
+    common_params params;
+    server_models models;
+    server_models_routes(const common_params & params, int argc, char ** argv, char ** envp)
+        : params(params), models(params, argc, argv, envp) {
+        init_routes();
+    }
+
+    void init_routes();
+    // handlers using lambda function, so that they can capture `this` without `std::bind`
+    server_http_context::handler_t get_router_props;
+    server_http_context::handler_t proxy_get;
+    server_http_context::handler_t proxy_post;
+    server_http_context::handler_t get_router_models;
+    server_http_context::handler_t post_router_models_load;
+    server_http_context::handler_t post_router_models_status;
+    server_http_context::handler_t post_router_models_unload;
+};
+
 /**
  * A simple HTTP proxy that forwards requests to another server
  * and relays the responses back.
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index b2a04263ff..939e1656bd 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -108,36 +108,37 @@ int main(int argc, char ** argv, char ** envp) {
     server_routes routes(params, ctx_server, [&ctx_http]() { return ctx_http.is_ready.load(); });
 
     bool is_router_server = params.model.path.empty();
+    std::optional<server_models_routes> models_routes{};
     if (is_router_server) {
         // setup server instances manager
-        routes.models.reset(new server_models(params, argc, argv, envp));
+        models_routes.emplace(params, argc, argv, envp);
 
         // proxy handlers
         // note: routes.get_health stays the same
-        routes.get_metrics = routes.proxy_get;
-        routes.post_props = routes.proxy_post;
-        routes.get_api_show = routes.proxy_get;
-        routes.post_completions = routes.proxy_post;
-        routes.post_completions_oai = routes.proxy_post;
-        routes.post_chat_completions = routes.proxy_post;
-        routes.post_infill = routes.proxy_post;
-        routes.post_embeddings = routes.proxy_post;
-        routes.post_embeddings_oai = routes.proxy_post;
-        routes.post_rerank = routes.proxy_post;
-        routes.post_tokenize = routes.proxy_post;
-        routes.post_detokenize = routes.proxy_post;
-        routes.post_apply_template = routes.proxy_post;
-        routes.get_lora_adapters = routes.proxy_get;
-        routes.post_lora_adapters = routes.proxy_post;
-        routes.get_slots = routes.proxy_get;
-        routes.post_slots = routes.proxy_post;
+        routes.get_metrics = models_routes->proxy_get;
+        routes.post_props = models_routes->proxy_post;
+        routes.get_api_show = models_routes->proxy_get;
+        routes.post_completions = models_routes->proxy_post;
+        routes.post_completions_oai = models_routes->proxy_post;
+        routes.post_chat_completions = models_routes->proxy_post;
+        routes.post_infill = models_routes->proxy_post;
+        routes.post_embeddings = models_routes->proxy_post;
+        routes.post_embeddings_oai = models_routes->proxy_post;
+        routes.post_rerank = models_routes->proxy_post;
+        routes.post_tokenize = models_routes->proxy_post;
+        routes.post_detokenize = models_routes->proxy_post;
+        routes.post_apply_template = models_routes->proxy_post;
+        routes.get_lora_adapters = models_routes->proxy_get;
+        routes.post_lora_adapters = models_routes->proxy_post;
+        routes.get_slots = models_routes->proxy_get;
+        routes.post_slots = models_routes->proxy_post;
 
         // custom routes for router
-        routes.get_props = routes.get_router_props;
-        routes.get_models = routes.get_router_models;
-        ctx_http.post("/models/load", ex_wrapper(routes.post_router_models_load));
-        ctx_http.post("/models/unload", ex_wrapper(routes.post_router_models_unload));
-        ctx_http.post("/models/status", ex_wrapper(routes.post_router_models_status));
+        routes.get_props = models_routes->get_router_props;
+        routes.get_models = models_routes->get_router_models;
+        ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load));
+        ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload));
+        ctx_http.post("/models/status", ex_wrapper(models_routes->post_router_models_status));
     }
 
     ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check)
@@ -184,9 +185,11 @@ int main(int argc, char ** argv, char ** envp) {
 
     if (is_router_server) {
         LOG_INF("%s: starting router server, no model will be loaded in this process\n", __func__);
-        clean_up = [&routes]() {
+        clean_up = [&models_routes]() {
             SRV_INF("%s: cleaning up before exit...\n", __func__);
-            routes.models->unload_all();
+            if (models_routes.has_value()) {
+                models_routes->models.unload_all();
+            }
             llama_backend_free();
         };