diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index bdd337e952..b3c5eb5717 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -347,10 +347,10 @@ const char * llama_grammar_parser::parse_sequence(
     size_t last_sym_start = rule.size();
     const char * pos = src;

-    // use UINT64_MAX as the empty value because we aligned to the proper unsigned long type so -1 can't be used
+    // use UINT64_MAX as the empty value because we aligned to the proper uint64_t type so -1 can't be used
     // (though it's technically the same as -1 now)
-    auto handle_repetitions = [&](unsigned long min_times, unsigned long max_times) {
-
+    auto handle_repetitions = [&](uint64_t min_times, uint64_t max_times) {
+        bool no_max = max_times == UINT64_MAX;
         if (last_sym_start == rule.size()) {
             throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
         }
@@ -377,20 +377,20 @@ const char * llama_grammar_parser::parse_sequence(
             rule.resize(last_sym_start);
         } else {
             // Repeat the previous elements (min_times - 1) times
-            for (unsigned long i = 1; i < min_times; i++) {
+            for (uint64_t i = 1; i < min_times; i++) {
                 rule.insert(rule.end(), prev_rule.begin(), prev_rule.end());
             }
         }

         uint32_t last_rec_rule_id = 0;
-        auto n_opt = max_times == UINT64_MAX ? 1 : max_times - min_times;
+        auto n_opt = no_max ? 1 : max_times - min_times;

         llama_grammar_rule rec_rule(prev_rule);
-        for (unsigned long i = 0; i < n_opt; i++) {
+        for (uint64_t i = 0; i < n_opt; i++) {
             rec_rule.resize(prev_rule.size());
             uint32_t rec_rule_id = generate_symbol_id( rule_name);
-            if (i > 0 || max_times == UINT64_MAX) {
-                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times == UINT64_MAX ? rec_rule_id : last_rec_rule_id});
+            if (i > 0 || no_max) {
+                rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, no_max ? rec_rule_id : last_rec_rule_id});
             }
             rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
             rec_rule.push_back({LLAMA_GRETYPE_END, 0});
@@ -482,10 +482,10 @@ const char * llama_grammar_parser::parse_sequence(
                 throw std::runtime_error(std::string("expecting an int at ") + pos);
             }
             const char * int_end = parse_int(pos);
-            unsigned long min_times = std::stoul(std::string(pos, int_end - pos));
+            uint64_t min_times = std::stoul(std::string(pos, int_end - pos));
             pos = parse_space(int_end, is_nested);

-            unsigned long max_times = UINT64_MAX;
+            uint64_t max_times = UINT64_MAX; // default: no max limit

             if (*pos == '}') {
                 max_times = min_times;
@@ -506,7 +506,8 @@ const char * llama_grammar_parser::parse_sequence(
             } else {
                 throw std::runtime_error(std::string("expecting ',' at ") + pos);
             }
-            if (min_times > MAX_REPETITION_THRESHOLD || (max_times != UINT64_MAX && max_times > MAX_REPETITION_THRESHOLD)) {
+            bool has_max = max_times != UINT64_MAX;
+            if (min_times > MAX_REPETITION_THRESHOLD || (has_max && max_times > MAX_REPETITION_THRESHOLD)) {
                 throw std::runtime_error(std::string("number of repetitions exceeds sane defaults, please reduce the number of repetitions"));
             }
             handle_repetitions(min_times, max_times);
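For context, the `{min,max}` bounds handled by `handle_repetitions()` above come from GBNF's bounded-repetition syntax; bounds above `MAX_REPETITION_THRESHOLD` are rejected by the check in the last hunk. A minimal way to exercise this code path, assuming a local GGUF model at an illustrative path, is:

```sh
# illustrative only: the {2,4} bound below is parsed by handle_repetitions()
llama-cli -m ./path/to/model.gguf \
    -p "Reply with one short lowercase word: " \
    --grammar 'root ::= [a-z]{2,4}'
```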
diff --git a/tools/server/README.md b/tools/server/README.md
index 8fd478eb32..54c1062c9b 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1343,6 +1343,78 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r
 }'
 ```
+
+## Using multiple models
+
+`llama-server` can be launched in **router mode**, which exposes an API for dynamically loading and unloading models. The main process (the "router") automatically forwards each request to the appropriate model instance.
+
+To start in router mode, launch `llama-server` **without specifying any model**:
+
+```sh
+llama-server
+```
+
+### Model sources
+
+By default, the router looks for models in the cache. You can add Hugging Face models to the cache with:
+
+```sh
+llama-server -hf <user>/<model>:<quant>
+```
+
+*The server must be restarted after adding a new model.*
+
+Alternatively, you can point the router to a local directory containing your GGUF files using `--models-dir`. Files prefixed with `mmproj-` will automatically be treated as multimodal projection files **for the model with the matching base name**:
+
+```sh
+llama-3.2-1b-Q4_K_M.gguf
+gemma-3-4b-it-Q8_0.gguf
+mmproj-gemma-3-4b-it-Q8_0.gguf   # must be "mmproj-" + text model filename
+```
+
+Example:
+
+```sh
+llama-server --models-dir ./path/to/models
+```
+
+You may also specify default arguments that will be passed to every loaded model instance:
+
+```sh
+llama-server -c 8192 -n 1024 -np 2
+```
+
+### Routing requests
+
+Requests are routed according to the requested model name.
+
+For **POST** endpoints (`/v1/chat/completions`, `/v1/completions`, `/infill`, etc.), the router uses the `"model"` field in the JSON body:
+
+```json
+{
+  "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
+  ...
+}
+```
+
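+For example, assuming the router listens on the default `http://localhost:8080`, a chat completion can be routed like this (the model name below is illustrative):
+
+```sh
+curl http://localhost:8080/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
+        "messages": [{"role": "user", "content": "Hello"}]
+    }'
+```
+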
+For **GET** endpoints (`/props`, `/metrics`, etc.), the router uses the `model` query parameter (URL-encoded):
+
+```
+GET /props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M
+```
+
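+For example (assuming the same default listen address):
+
+```sh
+curl "http://localhost:8080/props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M"
+```
+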
+### GET `/models`: List available models
+
+TODO
+
+### POST `/models/load`: Load a model
+
+TODO
+
+### POST `/models/unload`: Unload a model
+
+TODO
+

 ## More examples

 ### Interactive mode
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index be3226ada3..92b02fbf49 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -10,6 +10,8 @@
 #include
 #include
 #include
+#include
+#include

 #ifdef _WIN32
 #include
@@ -60,7 +62,10 @@ static std::filesystem::path get_server_exec_path() {
 #else
     char path[FILENAME_MAX];
     ssize_t count = readlink("/proc/self/exe", path, FILENAME_MAX);
-    return std::filesystem::path(std::string(path, (count > 0) ? count : 0));
+    if (count <= 0) {
+        throw std::runtime_error("failed to resolve /proc/self/exe");
+    }
+    return std::filesystem::path(std::string(path, count));
 #endif
 }

@@ -203,22 +208,27 @@ std::vector server_models::get_all_meta() {
 }

 void server_models::load(const std::string & name) {
-    auto meta = get_meta(name);
-    if (!meta.has_value()) {
+    std::lock_guard lk(mutex);
+    if (mapping.find(name) == mapping.end()) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
-    std::lock_guard lk(mutex);
-    if (meta->status != SERVER_MODEL_STATUS_FAILED && meta->status != SERVER_MODEL_STATUS_UNLOADED) {
+    auto meta = mapping[name].meta;
+    if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) {
         SRV_INF("model %s is not ready\n", name.c_str());
         return;
     }

+    // prepare new instance info
     instance_t inst;
-    inst.meta = meta.value();
+    inst.meta = meta;
     inst.meta.port = get_free_port();
     inst.meta.status = SERVER_MODEL_STATUS_LOADING;
+    if (inst.meta.port <= 0) {
+        throw std::runtime_error("failed to get a port number");
+    }
+    inst.subproc = std::make_shared();

     {
         std::string exec_path = get_server_exec_path().string();
@@ -263,19 +273,19 @@ void server_models::load(const std::string & name) {
     // start a thread to manage the child process
     inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port]() {
         // read stdout/stderr and forward to main server log
-        {
-            FILE * p_stdout_stderr = subprocess_stdout(child_proc.get());
-            if (!p_stdout_stderr) {
-                return;
-            }
+        FILE * p_stdout_stderr = subprocess_stdout(child_proc.get());
+        if (p_stdout_stderr) {
             char buffer[4096];
             while (fgets(buffer, sizeof(buffer), p_stdout_stderr) != nullptr) {
                 LOG("[%5d] %s", port, buffer);
             }
+        } else {
+            SRV_ERR("failed to get stdout/stderr of child process for name=%s\n", name.c_str());
         }
         // we reach here when the child process exits
         int exit_code = 0;
         subprocess_join(child_proc.get(), &exit_code);
+        subprocess_destroy(child_proc.get());
         // update PID and status
         {
             std::lock_guard lk(mutex);
@@ -305,7 +315,7 @@ void server_models::unload(const std::string & name) {
     if (it != mapping.end()) {
         if (it->second.meta.is_active()) {
             SRV_INF("unloading model instance name=%s\n", name.c_str());
-            subprocess_destroy(it->second.subproc.get());
+            subprocess_terminate(it->second.subproc.get());
             // status change will be handled by the managing thread
         } else {
             SRV_WRN("model instance name=%s is not loaded\n", name.c_str());
@@ -320,7 +330,7 @@ void server_models::unload_all() {
     for (auto & [name, inst] : mapping) {
         if (inst.meta.is_active()) {
             SRV_INF("unloading model instance name=%s\n", name.c_str());
-            subprocess_destroy(inst.subproc.get());
+            subprocess_terminate(inst.subproc.get());
             // status change will be handled by the managing thread
         }
         // moving the thread to join list to avoid deadlock
@@ -354,17 +364,25 @@ void server_models::wait_until_loaded(const std::string & name) {
     });
 }

-void server_models::ensure_model_loaded(const std::string & name) {
+bool server_models::ensure_model_loaded(const std::string & name) {
     auto meta = get_meta(name);
     if (!meta.has_value()) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
     if (meta->is_active()) {
-        return; // already loaded
+        return false; // already loaded
     }
     SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
     load(name);
     wait_until_loaded(name);
+    {
+        // check final status
+        meta = get_meta(name);
+        if (!meta.has_value() || meta->status == SERVER_MODEL_STATUS_FAILED) {
+            throw std::runtime_error("model name=" + name + " failed to load");
+        }
+    }
+    return true;
 }

 server_http_res_ptr server_models::proxy_request(const server_http_req & req, const std::string & method, const std::string & name) {
@@ -372,7 +390,9 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
     if (!meta.has_value()) {
         throw std::runtime_error("model name=" + name + " is not found");
     }
-    ensure_model_loaded(name); // TODO: handle failure case
+    if (ensure_model_loaded(name)) {
+        meta = get_meta(name); // refresh meta
+    }
     SRV_INF("proxying request to model %s on port %d\n", name.c_str(), meta->port);
     auto proxy = std::make_unique(
         method,
@@ -439,11 +459,11 @@ struct pipe_t {
     std::atomic writer_closed{false};
     std::atomic reader_closed{false};
     void close_write() {
-        writer_closed.store(true);
+        writer_closed.store(true, std::memory_order_relaxed);
         cv.notify_all();
     }
     void close_read() {
-        reader_closed.store(true);
+        reader_closed.store(true, std::memory_order_relaxed);
         cv.notify_all();
     }
     bool read(T & output, const std::function & should_stop) {
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
index 3cd070f89a..f8ae757fa4 100644
--- a/tools/server/server-models.h
+++ b/tools/server/server-models.h
@@ -13,7 +13,7 @@
 /**
  * state diagram:
- * 
+ *
  *   UNLOADED ──► LOADING ──► LOADED
  *      ▲                        │
  *      │                        │
@@ -105,7 +105,8 @@ public:
     void wait_until_loaded(const std::string & name);

     // load the model if not loaded, otherwise do nothing
-    void ensure_model_loaded(const std::string & name);
+    // return false if model is already loaded; return true otherwise (meta may need to be refreshed)
+    bool ensure_model_loaded(const std::string & name);

     // proxy an HTTP request to the model instance
     server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name);