This commit is contained in:
Xuan Son Nguyen 2025-11-23 19:02:07 +01:00
parent 2e355c7f8e
commit 5ad594e6d6
6 changed files with 107 additions and 59 deletions

View File

@@ -2482,12 +2482,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
add_opt(common_arg(
{"--max-models"}, "N",
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.max_models),
{"--models-max"}, "N",
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
[](common_params & params, int value) {
params.max_models = value;
params.models_max = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MAX_MODELS"));
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg(
{"--no-models-autoload"},
"disables automatic loading of models (default: enabled)",
[](common_params & params) {
params.models_autoload = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
add_opt(common_arg(
{"--jinja"},
"use jinja template for chat (default: disabled)",

View File

@@ -460,7 +460,8 @@ struct common_params {
// router server configs
std::string models_dir = ""; // directory containing models for the router server
int max_models = 4; // maximum number of models to load simultaneously
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
bool log_json = false;

View File

@@ -1463,8 +1463,9 @@ The `status` object can be:
```json
"status": {
"value": "failed",
"value": "unloaded",
"args": ["llama-server", "-ctx", "4096"],
"failed": true,
"exit_code": 1
}
```
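
Since `failed` is no longer a status value, clients should key off the `failed` and `exit_code` fields rather than the status string. A hedged client-side sketch using nlohmann::json, which the server already vendors (the helper name is illustrative):

```cpp
// Hedged client-side sketch: classify the new status object shape.
#include <nlohmann/json.hpp>
using json = nlohmann::json;

static bool status_indicates_failure(const json & status) {
    // a failed load is now reported as "unloaded" with failed=true
    // and a non-zero exit_code, instead of a dedicated "failed" value
    return status.value("value", std::string()) == "unloaded"
        && status.value("failed", false);
}
```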

View File

@@ -29,6 +29,8 @@
#include <limits.h>
#endif
#define CMD_EXIT "exit"
static std::filesystem::path get_server_exec_path() {
#if defined(_WIN32)
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
@@ -297,10 +299,10 @@ std::vector<server_model_meta> server_models::get_all_meta() {
}
void server_models::unload_lru() {
if (base_params.max_models <= 0) {
if (base_params.models_max <= 0) {
return; // no limit
}
// remove one of the servers if we passed the max_models (least recently used - LRU)
// remove one of the servers (least recently used - LRU) once the models_max limit is reached
std::string lru_model_name = "";
int64_t lru_last_used = ggml_time_ms();
size_t count_active = 0;
@@ -316,8 +318,8 @@ void server_models::unload_lru() {
}
}
}
if (!lru_model_name.empty() && count_active >= (size_t)base_params.max_models) {
SRV_INF("max_models limit reached, removing LRU name=%s\n", lru_model_name.c_str());
if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
unload(lru_model_name);
}
}
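
The eviction above scans every active instance for the smallest last-used timestamp and evicts it only once the active count reaches `models_max`. A condensed sketch of the same policy, assuming each instance carries a millisecond `last_used` field (struct and field names are illustrative):

```cpp
// Hedged sketch of the LRU scan: pick the active instance with the
// oldest last_used timestamp once the models_max limit is hit.
#include <cstdint>
#include <map>
#include <string>

struct instance_meta {
    bool    active    = false;
    int64_t last_used = 0; // ms timestamp, bumped on each proxied POST
};

static std::string pick_lru(const std::map<std::string, instance_meta> & m,
                            size_t models_max) {
    if (models_max == 0) {
        return {}; // 0 = unlimited, never evict
    }
    std::string lru_name;
    int64_t     lru_time     = INT64_MAX;
    size_t      count_active = 0;
    for (const auto & [name, meta] : m) {
        if (!meta.active) {
            continue;
        }
        count_active++;
        if (meta.last_used < lru_time) {
            lru_time = meta.last_used;
            lru_name = name;
        }
    }
    // empty result means we are still under the limit
    return count_active >= models_max ? lru_name : std::string();
}
```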
@@ -331,7 +333,7 @@ void server_models::load(const std::string & name, const std::vector<std::string
std::lock_guard<std::mutex> lk(mutex);
auto meta = mapping[name].meta;
if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) {
if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
SRV_INF("model %s is not ready\n", name.c_str());
return;
}
@@ -428,9 +430,7 @@ void server_models::load(const std::string & name, const std::vector<std::string
if (it != mapping.end()) {
auto & meta = it->second.meta;
meta.exit_code = exit_code;
meta.status = exit_code == 0
? SERVER_MODEL_STATUS_UNLOADED
: SERVER_MODEL_STATUS_FAILED;
meta.status = SERVER_MODEL_STATUS_UNLOADED;
}
cv.notify_all();
}
@@ -446,13 +446,23 @@ void server_models::load(const std::string & name, const std::vector<std::string
cv.notify_all();
}
static void interrupt_subprocess(subprocess_s * proc) {
// because subprocess.h does not provide a way to send SIGINT,
// we will send a command to the child process to exit gracefully
FILE * p_stdin = subprocess_stdin(proc);
if (p_stdin) {
fprintf(p_stdin, "%s\n", CMD_EXIT);
fflush(p_stdin);
}
}
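
A parent-side stop would typically pair this with a join, escalating to a hard kill only if the graceful path fails. A hedged sketch built on the real subprocess.h calls `subprocess_join` and `subprocess_terminate` (the helper itself is illustrative, not part of this commit):

```cpp
// Hedged sketch: ask the child to exit via CMD_EXIT, then reap it;
// fall back to a hard terminate if the join fails (e.g. broken pipe).
static int stop_child(subprocess_s * proc) {
    interrupt_subprocess(proc);     // writes CMD_EXIT to the child's stdin
    int exit_code = 0;
    if (subprocess_join(proc, &exit_code) != 0) {
        subprocess_terminate(proc); // graceful path failed, force it
        subprocess_join(proc, &exit_code);
    }
    return exit_code;
}
```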
void server_models::unload(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
if (it->second.meta.is_active()) {
SRV_INF("unloading model instance name=%s\n", name.c_str());
subprocess_terminate(it->second.subproc.get());
interrupt_subprocess(it->second.subproc.get());
// status change will be handled by the managing thread
} else {
SRV_WRN("model instance name=%s is not loaded\n", name.c_str());
@@ -467,7 +477,7 @@ void server_models::unload_all() {
for (auto & [name, inst] : mapping) {
if (inst.meta.is_active()) {
SRV_INF("unloading model instance name=%s\n", name.c_str());
subprocess_terminate(inst.subproc.get());
interrupt_subprocess(inst.subproc.get());
// status change will be handled by the managing thread
}
// moving the thread to join list to avoid deadlock
@@ -498,8 +508,7 @@ void server_models::wait_until_loaded(const std::string & name) {
cv.wait(lk, [this, &name]() {
auto it = mapping.find(name);
if (it != mapping.end()) {
return it->second.meta.status == SERVER_MODEL_STATUS_LOADED ||
it->second.meta.status == SERVER_MODEL_STATUS_FAILED;
return it->second.meta.status != SERVER_MODEL_STATUS_LOADING;
}
return false;
});
@@ -510,19 +519,23 @@ bool server_models::ensure_model_loaded(const std::string & name) {
if (!meta.has_value()) {
throw std::runtime_error("model name=" + name + " is not found");
}
if (meta->is_active()) {
if (meta->status == SERVER_MODEL_STATUS_LOADED) {
return false; // already loaded
}
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
load(name, {}, true);
wait_until_loaded(name);
{
// check final status
meta = get_meta(name);
if (!meta.has_value() || meta->status == SERVER_MODEL_STATUS_FAILED) {
throw std::runtime_error("model name=" + name + " failed to load");
}
if (meta->status == SERVER_MODEL_STATUS_UNLOADED) {
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
load(name, {}, true);
}
SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
wait_until_loaded(name);
// check final status
meta = get_meta(name);
if (!meta.has_value() || meta->is_failed()) {
throw std::runtime_error("model name=" + name + " failed to load");
}
return true;
}
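
The resulting contract: `false` means the model was already loaded, `true` means it was loaded on demand (the call blocks until `LOADING` ends), and a throw covers both unknown names and loads that finished unloaded with a non-zero exit code. A hedged usage sketch (the model name is illustrative):

```cpp
// Hedged usage sketch of the contract above.
try {
    bool loaded_now = models->ensure_model_loaded("my-model");
    if (!loaded_now) {
        // model was already in the LOADED state, nothing happened
    }
    // otherwise the call blocked in wait_until_loaded() until LOADING ended
} catch (const std::runtime_error & e) {
    // unknown model name, or the instance ended up unloaded with exit_code != 0
}
```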
@@ -582,13 +595,18 @@ void server_models::setup_child_server(const common_params & base_params, int ro
// wait for EOF on stdin
SRV_INF("%s", "child server monitoring thread started, waiting for EOF on stdin...\n");
while (true) {
int c = getchar();
if (c == EOF) {
break;
std::string line;
if (!std::getline(std::cin, line)) {
break; // EOF detected
}
if (line.find(CMD_EXIT) != std::string::npos) {
SRV_INF("%s", "exit command received, exiting...\n");
shutdown_handler(0);
}
}
SRV_INF("%s", "EOF on stdin detected, invoking shutdown handler...\n");
shutdown_handler(0); // invoke shutdown handler
// EOF means the router server unexpectedly exited or was killed
SRV_INF("%s", "EOF on stdin detected, forcing shutdown...\n");
exit(1);
}).detach();
}
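
The child's stdin now carries two signals: a `CMD_EXIT` line requests a graceful shutdown, while bare EOF means the router itself died and the child force-exits. A minimal standalone sketch of that loop (`shutdown_handler` here stands in for the server's real handler):

```cpp
// Hedged standalone sketch of the child-side stdin protocol.
#include <cstdlib>
#include <iostream>
#include <string>

#define CMD_EXIT "exit"

static void shutdown_handler(int) { /* stop the HTTP server, free models */ }

static void monitor_stdin() {
    std::string line;
    while (std::getline(std::cin, line)) {
        if (line.find(CMD_EXIT) != std::string::npos) {
            shutdown_handler(0); // graceful: router asked us to exit
            return;
        }
    }
    std::exit(1); // EOF: router exited or was killed, force shutdown
}
```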

View File

@@ -15,16 +15,16 @@
* state diagram:
*
*  UNLOADED ---> LOADING ---> LOADED
 *     ^             |            |
 *     |   failed    |            |
 *     +-------------+            |
 *     |          unloaded        |
 *     +--------------------------+
*/
enum server_model_status {
// TODO: also add downloading state
// TODO: also add downloading state when the logic is added
SERVER_MODEL_STATUS_UNLOADED,
SERVER_MODEL_STATUS_LOADING,
SERVER_MODEL_STATUS_LOADED,
SERVER_MODEL_STATUS_FAILED
SERVER_MODEL_STATUS_LOADED
};
static server_model_status server_model_status_from_string(const std::string & status_str) {
@@ -34,8 +34,6 @@ static server_model_status server_model_status_from_string(const std::string & s
return SERVER_MODEL_STATUS_LOADING;
} else if (status_str == "loaded") {
return SERVER_MODEL_STATUS_LOADED;
} else if (status_str == "failed") {
return SERVER_MODEL_STATUS_FAILED;
} else {
throw std::runtime_error("invalid server model status");
}
@@ -46,7 +44,6 @@ static std::string server_model_status_to_string(server_model_status status) {
case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
case SERVER_MODEL_STATUS_LOADING: return "loading";
case SERVER_MODEL_STATUS_LOADED: return "loaded";
case SERVER_MODEL_STATUS_FAILED: return "failed";
default: return "unknown";
}
}
@@ -65,6 +62,10 @@ struct server_model_meta {
bool is_active() const {
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
}
bool is_failed() const {
return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0;
}
};
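
With `FAILED` gone from the enum, failure becomes derived state: `UNLOADED` plus a non-zero `exit_code`. A hedged sketch of what `is_failed()` implies, assuming `server_model_meta` default-constructs with `exit_code == 0`:

```cpp
// Hedged sketch: failure is derived from (status, exit_code), not stored.
#include <cassert>

void check_failure_semantics() {
    server_model_meta m;
    m.status    = SERVER_MODEL_STATUS_UNLOADED;
    m.exit_code = 1;          // child process exited abnormally
    assert(m.is_failed());    // surfaced as "unloaded" + failed=true in /models
    m.exit_code = 0;
    assert(!m.is_failed());   // clean unload, not a failure
}
```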
struct server_models {

View File

@@ -5142,10 +5142,9 @@ public:
server_http_context::handler_t proxy_get = [this](const server_http_req & req) {
std::string method = "GET";
std::string name = req.get_param("model");
if (name.empty()) {
auto res = std::make_unique<server_res_generator>(ctx_server);
res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
return std::unique_ptr<server_http_res>(std::move(res));
auto error_res = std::make_unique<server_res_generator>(ctx_server);
if (!router_validate_model(name, error_res)) {
return std::unique_ptr<server_http_res>(std::move(error_res));
}
models->ensure_model_loaded(name);
return models->proxy_request(req, method, name, false);
@@ -5155,10 +5154,9 @@ public:
std::string method = "POST";
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
if (name.empty()) {
auto res = std::make_unique<server_res_generator>(ctx_server);
res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
return std::unique_ptr<server_http_res>(std::move(res));
auto error_res = std::make_unique<server_res_generator>(ctx_server);
if (!router_validate_model(name, error_res)) {
return std::unique_ptr<server_http_res>(std::move(error_res));
}
models->ensure_model_loaded(name);
return models->proxy_request(req, method, name, true); // update last usage for POST request only
@@ -5200,22 +5198,23 @@ public:
json models_json = json::array();
auto all_models = models->get_all_meta();
std::time_t t = std::time(0);
for (const auto & model : all_models) {
for (const auto & meta : all_models) {
json status {
{"value", server_model_status_to_string(model.status)},
{"args", model.args},
{"value", server_model_status_to_string(meta.status)},
{"args", meta.args},
};
if (model.status == SERVER_MODEL_STATUS_FAILED) {
status["exit_code"] = model.exit_code;
if (meta.is_failed()) {
status["exit_code"] = meta.exit_code;
status["failed"] = true;
}
models_json.push_back(json {
{"id", model.name},
{"name", model.name},
{"id", meta.name},
{"name", meta.name},
{"object", "model"}, // for OAI-compat
{"owned_by", "llamacpp"}, // for OAI-compat
{"created", t}, // for OAI-compat
{"in_cache", model.in_cache},
{"path", model.path},
{"in_cache", meta.in_cache},
{"path", meta.path},
{"status", status},
// TODO: add other fields, may require reading GGUF metadata
});
@@ -5595,6 +5594,27 @@ private:
res->ok(root);
return res;
}
bool router_validate_model(const std::string & name, std::unique_ptr<server_res_generator> & res) {
if (name.empty()) {
res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
return false;
}
auto meta = models->get_meta(name);
if (!meta.has_value()) {
res->error(format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
return false;
}
if (params.models_autoload) {
models->ensure_model_loaded(name);
} else {
if (meta->status != SERVER_MODEL_STATUS_LOADED) {
res->error(format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
return false;
}
}
return true;
}
};
std::function<void(int)> shutdown_handler;