Merge remote-tracking branch 'ngxson/xsn/server_model_management_v1_2' into allozaur/server_model_management_v1_2
Commit: 45bf2a4983

@@ -347,10 +347,10 @@ const char * llama_grammar_parser::parse_sequence(
     size_t last_sym_start = rule.size();
     const char * pos = src;
 
-    // use UINT64_MAX as the empty value because we aligned to the proper unsigned long type so -1 can't be used
+    // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
     // (though it's technically the same as -1 now)
-    auto handle_repetitions = [&](unsigned long min_times, unsigned long max_times) {
-
+    auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) {
+        bool no_max = max_times == UINT64_MAX;
         if (last_sym_start == rule.size()) {
             throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
         }

@@ -377,20 +377,20 @@ const char * llama_grammar_parser::parse_sequence(
             rule.resize(last_sym_start);
         } else {
             // Repeat the previous elements (min_times - 1) times
-            for (unsigned long i = 1; i < min_times; i++) {
+            for (uint64_t i = 1; i < min_times; i++) {
                 rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
             }
         }
 
         uint32_t last_rec_rule_id = 0;
-        auto n_opt = max_times == UINT64_MAX ? 1 : max_times - min_times;
+        auto n_opt = no_max ? 1 : max_times - min_times;
 
         llama_grammar_rule rec_rule(prev_rule);
-        for (unsigned long i = 0; i < n_opt; i++) {
+        for (uint64_t i = 0; i < n_opt; i++) {
             rec_rule.resize(prev_rule.size());
             uint32_t rec_rule_id = generate_symbol_id( rule_name);
-            if (i > 0 || max_times == UINT64_MAX) {
-                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times == UINT64_MAX ? rec_rule_id : last_rec_rule_id});
+            if (i > 0 || no_max) {
+                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id});
             }
             rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
             rec_rule.push_back({LLAMA_GRETYPE_END, 0});

@@ -482,10 +482,10 @@ const char * llama_grammar_parser::parse_sequence(
                 throw std::runtime_error(std::string("expecting an int at ") + pos);
             }
             const char * int_end = parse_int(pos);
-            unsigned long min_times = std::stoul(std::string(pos, int_end - pos));
+            uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
             pos = parse_space(int_end, is_nested);
 
-            unsigned long max_times = UINT64_MAX;
+            uint64_t max_times = UINT64_MAX; // default: no max limit
 
             if (*pos == '}') {
                 max_times = min_times;

@@ -506,7 +506,8 @@ const char * llama_grammar_parser::parse_sequence(
                 } else {
                     throw std::runtime_error(std::string("expecting ',' at ") + pos);
                 }
-                if (min_times > MAX_REPETITION_THRESHOLD || (max_times != UINT64_MAX && max_times > MAX_REPETITION_THRESHOLD)) {
+                bool has_max = max_times != UINT64_MAX;
+                if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
                     throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
                 }
                 handle_repetitions(min_times, max_times);

@@ -1343,6 +1343,78 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r
}'
```

## Using multiple models

`llama-server` can be launched in a **router mode** that exposes an API for dynamically loading and unloading models. The main process (the "router") automatically forwards each request to the appropriate model instance.

To start in router mode, launch `llama-server` **without specifying any model**:

```sh
llama-server
```

### Model sources

By default, the router looks for models in the cache. You can add Hugging Face models to the cache with:

```sh
llama-server -hf <user>/<model>:<tag>
```

*The server must be restarted after adding a new model.*

Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Files prefixed with `mmproj-` will automatically be treated as multimodal projection files **for the model with the matching base name**:

```sh
llama-3.2-1b-Q4_K_M.gguf
gemma-3-4b-it-Q8_0.gguf
mmproj-gemma-3-4b-it-Q8_0.gguf # must be "mmproj-" + text model filename
```

Example:

```sh
llama-server --models-dir ./path/to/models
```

You may also specify default arguments that will be passed to every loaded model instance:

```sh
llama-server -c 8192 -n 1024 -np 2
```

### Routing requests

Requests are routed according to the requested model name.

For **POST** endpoints (`/v1/chat/completions`, `/v1/completions`, `/infill`, etc.), the router uses the `"model"` field in the JSON body:

```json
{
  "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
  ...
}
```
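
For example, a chat completion pinned to a specific model could be sent like this (an illustrative request, assuming the router listens on the default port 8080):

```sh
# illustrative request; adjust host/port to your setup
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
        "messages": [{"role": "user", "content": "Hello"}]
    }'
```
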
For **GET** endpoints (`/props`, `/metrics`, etc.), the router uses the `model` query parameter (URL-encoded):

```
GET /props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M
```
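
The same request can be issued with `curl`, letting it URL-encode the model name (again an illustrative call, assuming the default port 8080):

```sh
# -G appends the url-encoded data as a query string and sends a GET request
curl -G http://localhost:8080/props \
    --data-urlencode "model=ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
```
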
### GET `/models`: List available models

TODO

### POST `/models/load`: Load a model

TODO

### POST `/models/unload`: Unload a model

TODO

## More examples

### Interactive mode

@@ -10,6 +10,8 @@
 #include <mutex>
 #include <condition_variable>
 #include <cstring>
+#include <atomic>
+#include <chrono>
 
 #ifdef _WIN32
 #include <winsock2.h>

@@ -60,7 +62,10 @@ static std::filesystem::path get_server_exec_path() {
 #else
     char path[FILENAME_MAX];
     ssize_t count = readlink("/proc/self/exe", path, FILENAME_MAX);
-    return std::filesystem::path(std::string(path, (count > 0) ? count: 0));
+    if (count <= 0) {
+        throw std::runtime_error("failed to resolve /proc/self/exe");
+    }
+    return std::filesystem::path(std::string(path, count));
 #endif
 }

@@ -203,22 +208,27 @@ std::vector<server_model_meta> server_models::get_all_meta() {
 }
 
 void server_models::load(const std::string & name) {
-    auto meta = get_meta(name);
-    if (!meta.has_value()) {
+    std::lock_guard<std::mutex> lk(mutex);
+    if (mapping.find(name) == mapping.end()) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
 
-    std::lock_guard<std::mutex> lk(mutex);
-    if (meta->status != SERVER_MODEL_STATUS_FAILED && meta->status != SERVER_MODEL_STATUS_UNLOADED) {
+    auto meta = mapping[name].meta;
+    if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) {
         SRV_INF("model %s is not ready\n", name.c_str());
         return;
     }
 
     // prepare new instance info
     instance_t inst;
-    inst.meta = meta.value();
+    inst.meta = meta;
     inst.meta.port = get_free_port();
     inst.meta.status = SERVER_MODEL_STATUS_LOADING;
 
     if (inst.meta.port <= 0) {
         throw std::runtime_error("failed to get a port number");
     }
 
     inst.subproc = std::make_shared<subprocess_s>();
     {
         std::string exec_path = get_server_exec_path().string();

@@ -263,19 +273,19 @@ void server_models::load(const std::string & name) {
     // start a thread to manage the child process
     inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port]() {
         // read stdout/stderr and forward to main server log
-        {
-            FILE * p_stdout_stderr = subprocess_stdout(child_proc.get());
-            if (!p_stdout_stderr) {
-                return;
-            }
+        FILE * p_stdout_stderr = subprocess_stdout(child_proc.get());
+        if (p_stdout_stderr) {
             char buffer[4096];
             while (fgets(buffer, sizeof(buffer), p_stdout_stderr) != nullptr) {
                 LOG("[%5d] %s", port, buffer);
             }
+        } else {
+            SRV_ERR("failed to get stdout/stderr of child process for name=%s\n", name.c_str());
+        }
         // we reach here when the child process exits
         int exit_code = 0;
         subprocess_join(child_proc.get(), &exit_code);
         subprocess_destroy(child_proc.get());
         // update PID and status
         {
             std::lock_guard<std::mutex> lk(mutex);

@@ -305,7 +315,7 @@ void server_models::unload(const std::string & name) {
     if (it != mapping.end()) {
         if (it->second.meta.is_active()) {
             SRV_INF("unloading model instance name=%s\n", name.c_str());
-            subprocess_destroy(it->second.subproc.get());
+            subprocess_terminate(it->second.subproc.get());
             // status change will be handled by the managing thread
         } else {
             SRV_WRN("model instance name=%s is not loaded\n", name.c_str());

@@ -320,7 +330,7 @@ void server_models::unload_all() {
     for (auto & [name, inst] : mapping) {
         if (inst.meta.is_active()) {
             SRV_INF("unloading model instance name=%s\n", name.c_str());
-            subprocess_destroy(inst.subproc.get());
+            subprocess_terminate(inst.subproc.get());
             // status change will be handled by the managing thread
         }
         // moving the thread to join list to avoid deadlock

@@ -354,17 +364,25 @@ void server_models::wait_until_loaded(const std::string & name) {
     });
 }
 
-void server_models::ensure_model_loaded(const std::string & name) {
+bool server_models::ensure_model_loaded(const std::string & name) {
     auto meta = get_meta(name);
     if (!meta.has_value()) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
     if (meta->is_active()) {
-        return; // already loaded
+        return false; // already loaded
     }
     SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
     load(name);
     wait_until_loaded(name);
+    {
+        // check final status
+        meta = get_meta(name);
+        if (!meta.has_value() || meta->status == SERVER_MODEL_STATUS_FAILED) {
+            throw std::runtime_error("model name=" + name + " failed to load");
+        }
+    }
+    return true;
 }
 
 server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name) {

@@ -372,7 +390,9 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name) {
     if (!meta.has_value()) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
-    ensure_model_loaded(name); // TODO: handle failure case
+    if (ensure_model_loaded(name)) {
+        meta = get_meta(name); // refresh meta
+    }
     SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
     auto proxy = std::make_unique<server_http_proxy>(
         method,

@@ -439,11 +459,11 @@ struct pipe_t {
     std::atomic<bool> writer_closed{false};
     std::atomic<bool> reader_closed{false};
     void close_write() {
-        writer_closed.store(true);
+        writer_closed.store(true, std::memory_order_relaxed);
         cv.notify_all();
     }
     void close_read() {
-        reader_closed.store(true);
+        reader_closed.store(true, std::memory_order_relaxed);
         cv.notify_all();
     }
     bool read(T & output, const std::function<bool()> & should_stop) {

@@ -13,7 +13,7 @@
 
 /**
  * state diagram:
- *
+ *
  * UNLOADED ──► LOADING ──► LOADED
  *    ▲            │
  *    │            │

@@ -105,7 +105,8 @@ public:
     void wait_until_loaded(const std::string & name);
 
-    // load the model if not loaded, otherwise do nothing
-    void ensure_model_loaded(const std::string & name);
+    // return false if model is already loaded; return true otherwise (meta may need to be refreshed)
+    bool ensure_model_loaded(const std::string & name);
 
     // proxy an HTTP request to the model instance
     server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name);