cleaner
This commit is contained in:
parent
2e355c7f8e
commit
5ad594e6d6
|
|
@ -2482,12 +2482,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--max-models"}, "N",
|
{"--models-max"}, "N",
|
||||||
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.max_models),
|
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
|
||||||
[](common_params & params, int value) {
|
[](common_params & params, int value) {
|
||||||
params.max_models = value;
|
params.models_max = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MAX_MODELS"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--no-models-autoload"},
|
||||||
|
"disables automatic loading of models (default: enabled)",
|
||||||
|
[](common_params & params) {
|
||||||
|
params.models_autoload = false;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--jinja"},
|
{"--jinja"},
|
||||||
"use jinja template for chat (default: disabled)",
|
"use jinja template for chat (default: disabled)",
|
||||||
|
|
|
||||||
|
|
@ -460,7 +460,8 @@ struct common_params {
|
||||||
|
|
||||||
// router server configs
|
// router server configs
|
||||||
std::string models_dir = ""; // directory containing models for the router server
|
std::string models_dir = ""; // directory containing models for the router server
|
||||||
int max_models = 4; // maximum number of models to load simultaneously
|
int models_max = 4; // maximum number of models to load simultaneously
|
||||||
|
bool models_autoload = true; // automatically load models when requested via the router server
|
||||||
|
|
||||||
bool log_json = false;
|
bool log_json = false;
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1463,8 +1463,9 @@ The `status` object can be:
|
||||||
|
|
||||||
```json
|
```json
|
||||||
"status": {
|
"status": {
|
||||||
"value": "failed",
|
"value": "unloaded",
|
||||||
"args": ["llama-server", "-ctx", "4096"],
|
"args": ["llama-server", "-ctx", "4096"],
|
||||||
|
"failed": true,
|
||||||
"exit_code": 1
|
"exit_code": 1
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -29,6 +29,8 @@
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define CMD_EXIT "exit"
|
||||||
|
|
||||||
static std::filesystem::path get_server_exec_path() {
|
static std::filesystem::path get_server_exec_path() {
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
|
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
|
||||||
|
|
@ -297,10 +299,10 @@ std::vector<server_model_meta> server_models::get_all_meta() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void server_models::unload_lru() {
|
void server_models::unload_lru() {
|
||||||
if (base_params.max_models <= 0) {
|
if (base_params.models_max <= 0) {
|
||||||
return; // no limit
|
return; // no limit
|
||||||
}
|
}
|
||||||
// remove one of the servers if we passed the max_models (least recently used - LRU)
|
// remove one of the servers if we passed the models_max (least recently used - LRU)
|
||||||
std::string lru_model_name = "";
|
std::string lru_model_name = "";
|
||||||
int64_t lru_last_used = ggml_time_ms();
|
int64_t lru_last_used = ggml_time_ms();
|
||||||
size_t count_active = 0;
|
size_t count_active = 0;
|
||||||
|
|
@ -316,8 +318,8 @@ void server_models::unload_lru() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!lru_model_name.empty() && count_active >= (size_t)base_params.max_models) {
|
if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
|
||||||
SRV_INF("max_models limit reached, removing LRU name=%s\n", lru_model_name.c_str());
|
SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
|
||||||
unload(lru_model_name);
|
unload(lru_model_name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -331,7 +333,7 @@ void server_models::load(const std::string & name, const std::vector<std::string
|
||||||
std::lock_guard<std::mutex> lk(mutex);
|
std::lock_guard<std::mutex> lk(mutex);
|
||||||
|
|
||||||
auto meta = mapping[name].meta;
|
auto meta = mapping[name].meta;
|
||||||
if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) {
|
if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
|
||||||
SRV_INF("model %s is not ready\n", name.c_str());
|
SRV_INF("model %s is not ready\n", name.c_str());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -428,9 +430,7 @@ void server_models::load(const std::string & name, const std::vector<std::string
|
||||||
if (it != mapping.end()) {
|
if (it != mapping.end()) {
|
||||||
auto & meta = it->second.meta;
|
auto & meta = it->second.meta;
|
||||||
meta.exit_code = exit_code;
|
meta.exit_code = exit_code;
|
||||||
meta.status = exit_code == 0
|
meta.status = SERVER_MODEL_STATUS_UNLOADED;
|
||||||
? SERVER_MODEL_STATUS_UNLOADED
|
|
||||||
: SERVER_MODEL_STATUS_FAILED;
|
|
||||||
}
|
}
|
||||||
cv.notify_all();
|
cv.notify_all();
|
||||||
}
|
}
|
||||||
|
|
@ -446,13 +446,23 @@ void server_models::load(const std::string & name, const std::vector<std::string
|
||||||
cv.notify_all();
|
cv.notify_all();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Ask a child process to shut down gracefully.
//
// because subprocess.h does not provide a way to send SIGINT,
// we write the CMD_EXIT command (terminated by a newline) to the child's
// stdin and flush it, so the child can pick it up and exit on its own.
//
// proc: handle of the child process; a null handle is ignored.
//
// Does nothing if the child has no stdin stream.
// NOTE(review): the results of fprintf/fflush are not checked, so a child
// whose stdin is already closed is silently left alone - confirm that the
// caller copes with a child that never reacts to the command.
static void interrupt_subprocess(subprocess_s * proc) {
    if (proc == nullptr) {
        return; // nothing to interrupt
    }
    FILE * p_stdin = subprocess_stdin(proc);
    if (p_stdin == nullptr) {
        return; // child was started without a stdin pipe
    }
    fprintf(p_stdin, "%s\n", CMD_EXIT);
    fflush(p_stdin); // make sure the command leaves our buffer right away
}
|
||||||
|
|
||||||
void server_models::unload(const std::string & name) {
|
void server_models::unload(const std::string & name) {
|
||||||
std::lock_guard<std::mutex> lk(mutex);
|
std::lock_guard<std::mutex> lk(mutex);
|
||||||
auto it = mapping.find(name);
|
auto it = mapping.find(name);
|
||||||
if (it != mapping.end()) {
|
if (it != mapping.end()) {
|
||||||
if (it->second.meta.is_active()) {
|
if (it->second.meta.is_active()) {
|
||||||
SRV_INF("unloading model instance name=%s\n", name.c_str());
|
SRV_INF("unloading model instance name=%s\n", name.c_str());
|
||||||
subprocess_terminate(it->second.subproc.get());
|
interrupt_subprocess(it->second.subproc.get());
|
||||||
// status change will be handled by the managing thread
|
// status change will be handled by the managing thread
|
||||||
} else {
|
} else {
|
||||||
SRV_WRN("model instance name=%s is not loaded\n", name.c_str());
|
SRV_WRN("model instance name=%s is not loaded\n", name.c_str());
|
||||||
|
|
@ -467,7 +477,7 @@ void server_models::unload_all() {
|
||||||
for (auto & [name, inst] : mapping) {
|
for (auto & [name, inst] : mapping) {
|
||||||
if (inst.meta.is_active()) {
|
if (inst.meta.is_active()) {
|
||||||
SRV_INF("unloading model instance name=%s\n", name.c_str());
|
SRV_INF("unloading model instance name=%s\n", name.c_str());
|
||||||
subprocess_terminate(inst.subproc.get());
|
interrupt_subprocess(inst.subproc.get());
|
||||||
// status change will be handled by the managing thread
|
// status change will be handled by the managing thread
|
||||||
}
|
}
|
||||||
// moving the thread to join list to avoid deadlock
|
// moving the thread to join list to avoid deadlock
|
||||||
|
|
@ -498,8 +508,7 @@ void server_models::wait_until_loaded(const std::string & name) {
|
||||||
cv.wait(lk, [this, &name]() {
|
cv.wait(lk, [this, &name]() {
|
||||||
auto it = mapping.find(name);
|
auto it = mapping.find(name);
|
||||||
if (it != mapping.end()) {
|
if (it != mapping.end()) {
|
||||||
return it->second.meta.status == SERVER_MODEL_STATUS_LOADED ||
|
return it->second.meta.status != SERVER_MODEL_STATUS_LOADING;
|
||||||
it->second.meta.status == SERVER_MODEL_STATUS_FAILED;
|
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
});
|
});
|
||||||
|
|
@ -510,19 +519,23 @@ bool server_models::ensure_model_loaded(const std::string & name) {
|
||||||
if (!meta.has_value()) {
|
if (!meta.has_value()) {
|
||||||
throw std::runtime_error("model name=" + name + " is not found");
|
throw std::runtime_error("model name=" + name + " is not found");
|
||||||
}
|
}
|
||||||
if (meta->is_active()) {
|
if (meta->status == SERVER_MODEL_STATUS_LOADED) {
|
||||||
return false; // already loaded
|
return false; // already loaded
|
||||||
}
|
}
|
||||||
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
|
if (meta->status == SERVER_MODEL_STATUS_UNLOADED) {
|
||||||
load(name, {}, true);
|
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
|
||||||
wait_until_loaded(name);
|
load(name, {}, true);
|
||||||
{
|
|
||||||
// check final status
|
|
||||||
meta = get_meta(name);
|
|
||||||
if (!meta.has_value() || meta->status == SERVER_MODEL_STATUS_FAILED) {
|
|
||||||
throw std::runtime_error("model name=" + name + " failed to load");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
|
||||||
|
wait_until_loaded(name);
|
||||||
|
|
||||||
|
// check final status
|
||||||
|
meta = get_meta(name);
|
||||||
|
if (!meta.has_value() || meta->is_failed()) {
|
||||||
|
throw std::runtime_error("model name=" + name + " failed to load");
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -582,13 +595,18 @@ void server_models::setup_child_server(const common_params & base_params, int ro
|
||||||
// wait for EOF on stdin
|
// wait for EOF on stdin
|
||||||
SRV_INF("%s", "child server monitoring thread started, waiting for EOF on stdin...\n");
|
SRV_INF("%s", "child server monitoring thread started, waiting for EOF on stdin...\n");
|
||||||
while (true) {
|
while (true) {
|
||||||
int c = getchar();
|
std::string line;
|
||||||
if (c == EOF) {
|
if (!std::getline(std::cin, line)) {
|
||||||
break;
|
break; // EOF detected
|
||||||
|
}
|
||||||
|
if (line.find(CMD_EXIT) != std::string::npos) {
|
||||||
|
SRV_INF("%s", "exit command received, exiting...\n");
|
||||||
|
shutdown_handler(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
SRV_INF("%s", "EOF on stdin detected, invoking shutdown handler...\n");
|
// EOF means the router server has unexpectedly exited or been killed
|
||||||
shutdown_handler(0); // invoke shutdown handler
|
SRV_INF("%s", "EOF on stdin detected, forcing shutdown...\n");
|
||||||
|
exit(1);
|
||||||
}).detach();
|
}).detach();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,16 +15,16 @@
|
||||||
* state diagram:
|
* state diagram:
|
||||||
*
|
*
|
||||||
* UNLOADED ──► LOADING ──► LOADED
|
* UNLOADED ──► LOADING ──► LOADED
|
||||||
* ▲ │
|
* ▲ │ │
|
||||||
* │ │
|
* └───failed───┘ │
|
||||||
* FAILED ◄───────┘
|
* ▲ │
|
||||||
|
* └────────unloaded─────────┘
|
||||||
*/
|
*/
|
||||||
enum server_model_status {
|
enum server_model_status {
|
||||||
// TODO: also add downloading state
|
// TODO: also add downloading state when the logic is added
|
||||||
SERVER_MODEL_STATUS_UNLOADED,
|
SERVER_MODEL_STATUS_UNLOADED,
|
||||||
SERVER_MODEL_STATUS_LOADING,
|
SERVER_MODEL_STATUS_LOADING,
|
||||||
SERVER_MODEL_STATUS_LOADED,
|
SERVER_MODEL_STATUS_LOADED
|
||||||
SERVER_MODEL_STATUS_FAILED
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static server_model_status server_model_status_from_string(const std::string & status_str) {
|
static server_model_status server_model_status_from_string(const std::string & status_str) {
|
||||||
|
|
@ -34,8 +34,6 @@ static server_model_status server_model_status_from_string(const std::string & s
|
||||||
return SERVER_MODEL_STATUS_LOADING;
|
return SERVER_MODEL_STATUS_LOADING;
|
||||||
} else if (status_str == "loaded") {
|
} else if (status_str == "loaded") {
|
||||||
return SERVER_MODEL_STATUS_LOADED;
|
return SERVER_MODEL_STATUS_LOADED;
|
||||||
} else if (status_str == "failed") {
|
|
||||||
return SERVER_MODEL_STATUS_FAILED;
|
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error("invalid server model status");
|
throw std::runtime_error("invalid server model status");
|
||||||
}
|
}
|
||||||
|
|
@ -46,7 +44,6 @@ static std::string server_model_status_to_string(server_model_status status) {
|
||||||
case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
|
case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
|
||||||
case SERVER_MODEL_STATUS_LOADING: return "loading";
|
case SERVER_MODEL_STATUS_LOADING: return "loading";
|
||||||
case SERVER_MODEL_STATUS_LOADED: return "loaded";
|
case SERVER_MODEL_STATUS_LOADED: return "loaded";
|
||||||
case SERVER_MODEL_STATUS_FAILED: return "failed";
|
|
||||||
default: return "unknown";
|
default: return "unknown";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -65,6 +62,10 @@ struct server_model_meta {
|
||||||
bool is_active() const {
|
bool is_active() const {
|
||||||
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
|
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool is_failed() const {
|
||||||
|
return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct server_models {
|
struct server_models {
|
||||||
|
|
|
||||||
|
|
@ -5142,10 +5142,9 @@ public:
|
||||||
server_http_context::handler_t proxy_get = [this](const server_http_req & req) {
|
server_http_context::handler_t proxy_get = [this](const server_http_req & req) {
|
||||||
std::string method = "GET";
|
std::string method = "GET";
|
||||||
std::string name = req.get_param("model");
|
std::string name = req.get_param("model");
|
||||||
if (name.empty()) {
|
auto error_res = std::make_unique<server_res_generator>(ctx_server);
|
||||||
auto res = std::make_unique<server_res_generator>(ctx_server);
|
if (!router_validate_model(name, error_res)) {
|
||||||
res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
|
return std::unique_ptr<server_http_res>(std::move(error_res));
|
||||||
return std::unique_ptr<server_http_res>(std::move(res));
|
|
||||||
}
|
}
|
||||||
models->ensure_model_loaded(name);
|
models->ensure_model_loaded(name);
|
||||||
return models->proxy_request(req, method, name, false);
|
return models->proxy_request(req, method, name, false);
|
||||||
|
|
@ -5155,10 +5154,9 @@ public:
|
||||||
std::string method = "POST";
|
std::string method = "POST";
|
||||||
json body = json::parse(req.body);
|
json body = json::parse(req.body);
|
||||||
std::string name = json_value(body, "model", std::string());
|
std::string name = json_value(body, "model", std::string());
|
||||||
if (name.empty()) {
|
auto error_res = std::make_unique<server_res_generator>(ctx_server);
|
||||||
auto res = std::make_unique<server_res_generator>(ctx_server);
|
if (!router_validate_model(name, error_res)) {
|
||||||
res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
|
return std::unique_ptr<server_http_res>(std::move(error_res));
|
||||||
return std::unique_ptr<server_http_res>(std::move(res));
|
|
||||||
}
|
}
|
||||||
models->ensure_model_loaded(name);
|
models->ensure_model_loaded(name);
|
||||||
return models->proxy_request(req, method, name, true); // update last usage for POST request only
|
return models->proxy_request(req, method, name, true); // update last usage for POST request only
|
||||||
|
|
@ -5200,22 +5198,23 @@ public:
|
||||||
json models_json = json::array();
|
json models_json = json::array();
|
||||||
auto all_models = models->get_all_meta();
|
auto all_models = models->get_all_meta();
|
||||||
std::time_t t = std::time(0);
|
std::time_t t = std::time(0);
|
||||||
for (const auto & model : all_models) {
|
for (const auto & meta : all_models) {
|
||||||
json status {
|
json status {
|
||||||
{"value", server_model_status_to_string(model.status)},
|
{"value", server_model_status_to_string(meta.status)},
|
||||||
{"args", model.args},
|
{"args", meta.args},
|
||||||
};
|
};
|
||||||
if (model.status == SERVER_MODEL_STATUS_FAILED) {
|
if (meta.is_failed()) {
|
||||||
status["exit_code"] = model.exit_code;
|
status["exit_code"] = meta.exit_code;
|
||||||
|
status["failed"] = true;
|
||||||
}
|
}
|
||||||
models_json.push_back(json {
|
models_json.push_back(json {
|
||||||
{"id", model.name},
|
{"id", meta.name},
|
||||||
{"name", model.name},
|
{"name", meta.name},
|
||||||
{"object", "model"}, // for OAI-compat
|
{"object", "model"}, // for OAI-compat
|
||||||
{"owned_by", "llamacpp"}, // for OAI-compat
|
{"owned_by", "llamacpp"}, // for OAI-compat
|
||||||
{"created", t}, // for OAI-compat
|
{"created", t}, // for OAI-compat
|
||||||
{"in_cache", model.in_cache},
|
{"in_cache", meta.in_cache},
|
||||||
{"path", model.path},
|
{"path", meta.path},
|
||||||
{"status", status},
|
{"status", status},
|
||||||
// TODO: add other fields, may require reading GGUF metadata
|
// TODO: add other fields, may require reading GGUF metadata
|
||||||
});
|
});
|
||||||
|
|
@ -5595,6 +5594,27 @@ private:
|
||||||
res->ok(root);
|
res->ok(root);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool router_validate_model(const std::string & name, std::unique_ptr<server_res_generator> & res) {
|
||||||
|
if (name.empty()) {
|
||||||
|
res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
auto meta = models->get_meta(name);
|
||||||
|
if (!meta.has_value()) {
|
||||||
|
res->error(format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (params.models_autoload) {
|
||||||
|
models->ensure_model_loaded(name);
|
||||||
|
} else {
|
||||||
|
if (meta->status != SERVER_MODEL_STATUS_LOADED) {
|
||||||
|
res->error(format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
std::function<void(int)> shutdown_handler;
|
std::function<void(int)> shutdown_handler;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue