Merge remote-tracking branch 'ngxson/xsn/server_model_management_v1_2' into allozaur/server_model_management_v1_2
Commit: 76557cd5d3
@@ -2482,12 +2482,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR"));
add_opt(common_arg(
{"--max-models"}, "N",
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.max_models),
{"--models-max"}, "N",
string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max),
[](common_params & params, int value) {
params.max_models = value;
params.models_max = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MAX_MODELS"));
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg(
{"--no-models-autoload"},
"disables automatic loading of models (default: enabled)",
[](common_params & params) {
params.models_autoload = false;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
add_opt(common_arg(
{"--jinja"},
"use jinja template for chat (default: disabled)",
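Illustration (not part of the diff): the renamed option can be set either on the command line or through the environment variable registered above. A minimal sketch, assuming a router server started from a local models directory (`./models` is a placeholder):

```sh
# cap the router at two concurrently loaded models
llama-server --models-dir ./models --models-max 2

# equivalent, using the environment variable from the diff
LLAMA_ARG_MODELS_MAX=2 llama-server --models-dir ./models
```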
@@ -460,7 +460,8 @@ struct common_params {
// router server configs
std::string models_dir = ""; // directory containing models for the router server
int max_models = 4; // maximum number of models to load simultaneously
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server

bool log_json = false;
@@ -30,9 +30,10 @@ The project is under active development, and we are [looking for feedback and co
| -------- | ----------- |
| `-h, --help, --usage` | print usage and exit |
| `--version` | show version and build info |
| `-cl, --cache-list` | show list of models in cache |
| `--completion-bash` | print source-able bash completion script for llama.cpp |
| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
| `-t, --threads N` | number of CPU threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
@@ -51,7 +52,7 @@ The project is under active development, and we are [looking for feedback and co
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_SPLIT) |
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--no-escape` | do not process escape sequences |
@@ -61,11 +62,12 @@ The project is under active development, and we are [looking for feedback and co
| `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N<br/>(env: LLAMA_ARG_ROPE_FREQ_SCALE) |
| `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)<br/>(env: LLAMA_ARG_YARN_ORIG_CTX) |
| `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)<br/>(env: LLAMA_ARG_YARN_EXT_FACTOR) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
| `-nr, --no-repack` | disable weight repacking<br/>(env: LLAMA_ARG_NO_REPACK) |
| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
@@ -78,7 +80,7 @@ The project is under active development, and we are [looking for feedback and co
| `--override-tensor, -ot <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
| `--cpu-moe, -cmoe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
| `--n-cpu-moe, -ncmoe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
@@ -90,8 +92,9 @@ The project is under active development, and we are [looking for feedback and co
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
| `-m, --model FNAME` | model path to load<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-dr, --docker-repo [<repo>/]<model>[:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.<br/>example: gemma3<br/>(default: unused)<br/>(env: LLAMA_ARG_DOCKER_REPO) |
| `-hf, -hfr, --hf-repo <user>/<model>[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.<br/>mmproj is also downloaded automatically if available. to disable, add --no-mmproj<br/>example: unsloth/phi-4-GGUF:q4_k_m<br/>(default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
| `-hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]` | Same as --hf-repo, but for the draft model (default: unused)<br/>(env: LLAMA_ARG_HFD_REPO) |
| `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
@@ -100,7 +103,7 @@ The project is under active development, and we are [looking for feedback and co
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
| `--log-disable` | Log disable |
| `--log-file FNAME` | Log to file |
| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')<br/>'auto' enables colors when output is to a terminal<br/>(env: LLAMA_LOG_COLORS) |
| `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) |
| `--offline` | Offline mode: forces use of cache, prevents network access<br/>(env: LLAMA_OFFLINE) |
| `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.<br/>(env: LLAMA_LOG_VERBOSITY) |
@@ -151,7 +154,8 @@ The project is under active development, and we are [looking for feedback and co
| Argument | Explanation |
| -------- | ----------- |
| `--swa-checkpoints N` | max number of SWA checkpoints per slot to create (default: 3)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_SWA_CHECKPOINTS) |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
| `--context-shift` | enables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
@@ -165,6 +169,8 @@ The project is under active development, and we are [looking for feedback and co
| `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) |
| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
| `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
| `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
@@ -189,13 +195,16 @@ The project is under active development, and we are [looking for feedback and co
| `--slots` | enable slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) |
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
| `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
@@ -209,15 +218,17 @@ The project is under active development, and we are [looking for feedback and co
| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
| `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
| `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall |
| `--embd-bge-small-en-default` | use default bge-small-en-v1.5 model (note: can download weights from the internet) |
| `--embd-e5-small-en-default` | use default e5-small-v2 model (note: can download weights from the internet) |
| `--embd-gte-small-default` | use default gte-small model (note: can download weights from the internet) |
| `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) |
| `--fim-qwen-1.5b-default` | use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet) |
| `--fim-qwen-3b-default` | use default Qwen 2.5 Coder 3B (note: can download weights from the internet) |
| `--fim-qwen-7b-default` | use default Qwen 2.5 Coder 7B (note: can download weights from the internet) |
| `--fim-qwen-7b-spec` | use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet) |
| `--fim-qwen-14b-spec` | use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet) |
| `--fim-qwen-30b-default` | use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet) |
| `--gpt-oss-20b-default` | use gpt-oss-20b (note: can download weights from the internet) |
| `--gpt-oss-120b-default` | use gpt-oss-120b (note: can download weights from the internet) |
| `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) |
| `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) |

Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var.
@@ -1385,14 +1396,14 @@ models_directory
│   └─ mmproj-F16.gguf # file name must start with "mmproj"
│
│  # multi-shard
├─ gemma-3-4b-it-Q8_0
├─ Kimi-K2-Thinking-UD-IQ1_S
│   ├─ Kimi-K2-Thinking-UD-IQ1_S-00001-of-00006.gguf
│   ├─ Kimi-K2-Thinking-UD-IQ1_S-00002-of-00006.gguf
│   ├─ ...
│   └─ Kimi-K2-Thinking-UD-IQ1_S-00006-of-00006.gguf
```

You may also specify default arguments that will be passed to every loaded model instance:
You may also specify default arguments that will be passed to every model instance:

```sh
llama-server -ctx 8192 -n 1024 -np 2
@@ -1424,6 +1435,8 @@ For **GET** endpoints (`/props`, `/metrics`, etc.) The router uses the `model` q
GET /props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M
```

By default, the model will be loaded automatically if it's not loaded. To disable this, add `--no-models-autoload` when starting the server.
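For illustration (not part of the diff), the same request could be issued with `curl`; the host and port are assumptions (llama-server listens on `127.0.0.1:8080` unless `--host`/`--port` are changed):

```sh
# hypothetical invocation; URL-encodes the model name exactly as in the example above
curl "http://127.0.0.1:8080/props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M"
```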
### GET `/models`: List available models

Listing all models in cache. The model metadata will also include a field to indicate the status of the model:
@@ -1436,7 +1449,8 @@ Listing all models in cache. The model metadata will also include a field to ind
"in_cache": true,
"path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf",
"status": {
"value": "loaded"
"value": "loaded",
"args": ["llama-server", "-ctx", "4096"]
},
...
}]
@@ -1455,32 +1469,39 @@ The `status` object can be:

```json
"status": {
"value": "loading"
"value": "loading",
"args": ["llama-server", "-ctx", "4096"]
}
```

```json
"status": {
"value": "failed"
"value": "unloaded",
"args": ["llama-server", "-ctx", "4096"],
"failed": true,
"exit_code": 1
}
```

```json
"status": {
"value": "loaded"
"value": "loaded",
"args": ["llama-server", "-ctx", "4096"]
}
```

### POST `/models/load`: Load a model

Load a model

Payload:
- `model`: name of the model to be loaded
- `extra_args`: (optional) an array of additional arguments to be passed to the model instance

```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
"extra_args": ["-n", "128", "--top-k", "4"]
}
```
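As a usage sketch (not part of the diff), the payload documented above could be sent with `curl`, assuming the router is reachable on the default `127.0.0.1:8080`:

```sh
# hypothetical invocation: load a cached model and forward extra args to the spawned instance
curl -X POST http://127.0.0.1:8080/models/load \
  -H "Content-Type: application/json" \
  -d '{"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", "extra_args": ["-n", "128", "--top-k", "4"]}'
```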
@@ -1494,7 +1515,6 @@ Response:

### POST `/models/unload`: Unload a model

Unload a model

Payload:
@@ -29,6 +29,8 @@
#include <limits.h>
#endif

#define CMD_EXIT "exit"

static std::filesystem::path get_server_exec_path() {
#if defined(_WIN32)
wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths
@@ -154,7 +156,9 @@ server_models::server_models(
/* in_cache */ true,
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0
};
mapping[meta.name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
@@ -177,7 +181,9 @@ server_models::server_models(
/* in_cache */ false,
/* port */ 0,
/* status */ SERVER_MODEL_STATUS_UNLOADED,
/* last_used */ 0
/* last_used */ 0,
/* args */ std::vector<std::string>(),
/* exit_code */ 0
};
mapping[meta.name] = instance_t{
/* subproc */ std::make_shared<subprocess_s>(),
@@ -293,10 +299,10 @@ std::vector<server_model_meta> server_models::get_all_meta() {
}

void server_models::unload_lru() {
if (base_params.max_models <= 0) {
if (base_params.models_max <= 0) {
return; // no limit
}
// remove one of the servers if we passed the max_models (least recently used - LRU)
// remove one of the servers if we passed the models_max (least recently used - LRU)
std::string lru_model_name = "";
int64_t lru_last_used = ggml_time_ms();
size_t count_active = 0;
@@ -312,13 +318,13 @@ void server_models::unload_lru() {
}
}
}
if (!lru_model_name.empty() && count_active >= (size_t)base_params.max_models) {
SRV_INF("max_models limit reached, removing LRU name=%s\n", lru_model_name.c_str());
if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) {
SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str());
unload(lru_model_name);
}
}

void server_models::load(const std::string & name) {
void server_models::load(const std::string & name, const std::vector<std::string> & extra_args, bool auto_load) {
if (!has_model(name)) {
throw std::runtime_error("model name=" + name + " is not found");
}
@@ -327,7 +333,7 @@ void server_models::load(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);

auto meta = mapping[name].meta;
if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) {
if (meta.status != SERVER_MODEL_STATUS_UNLOADED) {
SRV_INF("model %s is not ready\n", name.c_str());
return;
}
@@ -348,31 +354,48 @@ void server_models::load(const std::string & name) {
std::string exec_path = get_server_exec_path().string();
SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port);

std::vector<std::string> child_args = base_args; // copy
if (inst.meta.in_cache) {
child_args.push_back("-hf");
child_args.push_back(inst.meta.name);
std::vector<std::string> child_args;
if (auto_load && !meta.args.empty()) {
child_args = meta.args; // reuse previous args
// update port arg
for (size_t i = 0; i < child_args.size(); i++) {
if (child_args[i] == "--port" && i + 1 < child_args.size()) {
child_args[i + 1] = std::to_string(inst.meta.port);
break;
}
}
} else {
child_args.push_back("-m");
child_args.push_back(inst.meta.path);
if (!inst.meta.path_mmproj.empty()) {
child_args.push_back("--mmproj");
child_args.push_back(inst.meta.path_mmproj);
child_args = base_args; // copy
if (inst.meta.in_cache) {
child_args.push_back("-hf");
child_args.push_back(inst.meta.name);
} else {
child_args.push_back("-m");
child_args.push_back(inst.meta.path);
if (!inst.meta.path_mmproj.empty()) {
child_args.push_back("--mmproj");
child_args.push_back(inst.meta.path_mmproj);
}
}
child_args.push_back("--alias");
child_args.push_back(inst.meta.name);
child_args.push_back("--port");
child_args.push_back(std::to_string(inst.meta.port));

// append extra args
for (const auto & arg : extra_args) {
child_args.push_back(arg);
}
}
child_args.push_back("--alias");
child_args.push_back(inst.meta.name);
child_args.push_back("--port");
child_args.push_back(std::to_string(inst.meta.port));

std::vector<std::string> child_env = base_env; // copy
child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port));

// TODO: add logging
SRV_INF("%s", "spawning server instance with args:\n");
for (const auto & arg : child_args) {
SRV_INF(" %s\n", arg.c_str());
}
inst.meta.args = child_args; // save for debugging

std::vector<char *> argv = to_char_ptr_array(child_args);
std::vector<char *> envp = to_char_ptr_array(child_env);
@@ -385,6 +408,7 @@ void server_models::load(const std::string & name) {
}

// start a thread to manage the child process
// captured variables are guaranteed to be destroyed only after the thread is joined
inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port]() {
// read stdout/stderr and forward to main server log
FILE * p_stdout_stderr = subprocess_stdout(child_proc.get());
@@ -405,31 +429,49 @@ void server_models::load(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
it->second.meta.status = exit_code == 0
? SERVER_MODEL_STATUS_UNLOADED
: SERVER_MODEL_STATUS_FAILED;
auto & meta = it->second.meta;
meta.exit_code = exit_code;
meta.status = SERVER_MODEL_STATUS_UNLOADED;
}
cv.notify_all();
}
SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code);
});

// clean up old thread if exists
if (mapping[name].th.joinable()) {
mapping[name].th.join();
// clean up old process/thread if exists
{
auto & old_instance = mapping[name];
// old process should have exited already, but just in case, we clean it up here
if (subprocess_alive(old_instance.subproc.get())) {
SRV_WRN("old process for model name=%s is still alive, this is unexpected\n", name.c_str());
subprocess_terminate(old_instance.subproc.get()); // force kill
}
if (old_instance.th.joinable()) {
old_instance.th.join();
}
}

mapping[name] = std::move(inst);
cv.notify_all();
}

static void interrupt_subprocess(subprocess_s * proc) {
// because subprocess.h does not provide a way to send SIGINT,
// we will send a command to the child process to exit gracefully
FILE * p_stdin = subprocess_stdin(proc);
if (p_stdin) {
fprintf(p_stdin, "%s\n", CMD_EXIT);
fflush(p_stdin);
}
}

void server_models::unload(const std::string & name) {
std::lock_guard<std::mutex> lk(mutex);
auto it = mapping.find(name);
if (it != mapping.end()) {
if (it->second.meta.is_active()) {
SRV_INF("unloading model instance name=%s\n", name.c_str());
subprocess_terminate(it->second.subproc.get());
interrupt_subprocess(it->second.subproc.get());
// status change will be handled by the managing thread
} else {
SRV_WRN("model instance name=%s is not loaded\n", name.c_str());
@@ -444,7 +486,7 @@ void server_models::unload_all() {
for (auto & [name, inst] : mapping) {
if (inst.meta.is_active()) {
SRV_INF("unloading model instance name=%s\n", name.c_str());
subprocess_terminate(inst.subproc.get());
interrupt_subprocess(inst.subproc.get());
// status change will be handled by the managing thread
}
// moving the thread to join list to avoid deadlock
@@ -459,6 +501,10 @@ void server_models::unload_all() {
}

void server_models::update_status(const std::string & name, server_model_status status) {
// for now, we only allow updating to LOADED status
if (status != SERVER_MODEL_STATUS_LOADED) {
throw std::runtime_error("invalid status value");
}
auto meta = get_meta(name);
if (meta.has_value()) {
meta->status = status;
@@ -471,8 +517,7 @@ void server_models::wait_until_loaded(const std::string & name) {
cv.wait(lk, [this, &name]() {
auto it = mapping.find(name);
if (it != mapping.end()) {
return it->second.meta.status == SERVER_MODEL_STATUS_LOADED ||
it->second.meta.status == SERVER_MODEL_STATUS_FAILED;
return it->second.meta.status != SERVER_MODEL_STATUS_LOADING;
}
return false;
});
@@ -483,19 +528,23 @@ bool server_models::ensure_model_loaded(const std::string & name) {
if (!meta.has_value()) {
throw std::runtime_error("model name=" + name + " is not found");
}
if (meta->is_active()) {
if (meta->status == SERVER_MODEL_STATUS_LOADED) {
return false; // already loaded
}
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
load(name);
wait_until_loaded(name);
{
// check final status
meta = get_meta(name);
if (!meta.has_value() || meta->status == SERVER_MODEL_STATUS_FAILED) {
throw std::runtime_error("model name=" + name + " failed to load");
}
if (meta->status == SERVER_MODEL_STATUS_UNLOADED) {
SRV_INF("model name=%s is not loaded, loading...\n", name.c_str());
load(name, {}, true);
}

SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str());
wait_until_loaded(name);

// check final status
meta = get_meta(name);
if (!meta.has_value() || meta->is_failed()) {
throw std::runtime_error("model name=" + name + " failed to load");
}

return true;
}
@@ -523,15 +572,18 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co
return proxy;
}

void server_models::setup_child_server(const std::string & host, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler) {
void server_models::setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler) {
// send a notification to the router server that a model instance is ready
httplib::Client cli(host, router_port);
httplib::Client cli(base_params.hostname, router_port);
cli.set_connection_timeout(0, 200000); // 200 milliseconds

httplib::Request req;
req.method = "POST";
req.path = "/models/status";
req.set_header("Content-Type", "application/json");
if (!base_params.api_keys.empty()) {
req.set_header("Authorization", "Bearer " + base_params.api_keys[0]);
}

json body;
body["model"] = name;
@@ -543,22 +595,31 @@ void server_models::setup_child_server(const std::string & host, int router_port
if (result.error() != httplib::Error::Success) {
auto err_str = httplib::to_string(result.error());
SRV_ERR("failed to notify router server: %s\n", err_str.c_str());
// TODO: maybe force shutdown here?
exit(1); // force exit
}

// setup thread for monitoring stdin
// when EOF is detected, that means the router server requested shutdown, or the parent process died
std::thread([shutdown_handler]() {
// wait for EOF on stdin
SRV_INF("%s", "child server monitoring thread started, waiting for EOF on stdin...\n");
bool eof = false;
while (true) {
int c = getchar();
if (c == EOF) {
std::string line;
if (!std::getline(std::cin, line)) {
// EOF detected, that means the router server is unexpectedly exit or killed
eof = true;
break;
}
if (line.find(CMD_EXIT) != std::string::npos) {
SRV_INF("%s", "exit command received, exiting...\n");
shutdown_handler(0);
break;
}
}
SRV_INF("%s", "EOF on stdin detected, invoking shutdown handler...\n");
shutdown_handler(0); // invoke shutdown handler
if (eof) {
SRV_INF("%s", "EOF on stdin detected, forcing shutdown...\n");
exit(1);
}
}).detach();
}
@@ -15,16 +15,16 @@
* state diagram:
*
* UNLOADED ──► LOADING ──► LOADED
* ▲ │
* │ │
* FAILED ◄───────┘
* ▲ │ │
* └───failed───┘ │
* ▲ │
* └────────unloaded─────────┘
*/
enum server_model_status {
// TODO: also add downloading state
// TODO: also add downloading state when the logic is added
SERVER_MODEL_STATUS_UNLOADED,
SERVER_MODEL_STATUS_LOADING,
SERVER_MODEL_STATUS_LOADED,
SERVER_MODEL_STATUS_FAILED
SERVER_MODEL_STATUS_LOADED
};

static server_model_status server_model_status_from_string(const std::string & status_str) {
@@ -34,8 +34,6 @@ static server_model_status server_model_status_from_string(const std::string & s
return SERVER_MODEL_STATUS_LOADING;
} else if (status_str == "loaded") {
return SERVER_MODEL_STATUS_LOADED;
} else if (status_str == "failed") {
return SERVER_MODEL_STATUS_FAILED;
} else {
throw std::runtime_error("invalid server model status");
}
@@ -46,7 +44,6 @@ static std::string server_model_status_to_string(server_model_status status) {
case SERVER_MODEL_STATUS_UNLOADED: return "unloaded";
case SERVER_MODEL_STATUS_LOADING: return "loading";
case SERVER_MODEL_STATUS_LOADED: return "loaded";
case SERVER_MODEL_STATUS_FAILED: return "failed";
default: return "unknown";
}
}
@@ -58,11 +55,17 @@ struct server_model_meta {
bool in_cache = false; // if true, use -hf; use -m otherwise
int port = 0;
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
int64_t last_used = 0;
int64_t last_used = 0; // for LRU unloading
std::vector<std::string> args; // additional args passed to the model instance (used for debugging)
int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED)

bool is_active() const {
return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING;
}

bool is_failed() const {
return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0;
}
};

struct server_models {
@@ -98,7 +101,8 @@ public:
// return a copy of all model metadata
std::vector<server_model_meta> get_all_meta();

void load(const std::string & name);
// if auto_load is true, load the model with previous args if any
void load(const std::string & name, const std::vector<std::string> & extra_args, bool auto_load);
void unload(const std::string & name);
void unload_all();
@@ -117,7 +121,7 @@ public:
server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used);

// notify the router server that a model instance is ready
static void setup_child_server(const std::string & host, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler);
static void setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function<void(int)> & shutdown_handler);
};

/**
@@ -5142,7 +5142,10 @@ public:
server_http_context::handler_t proxy_get = [this](const server_http_req & req) {
std::string method = "GET";
std::string name = req.get_param("model");
models->ensure_model_loaded(name);
auto error_res = std::make_unique<server_res_generator>(ctx_server);
if (!router_validate_model(name, error_res)) {
return std::unique_ptr<server_http_res>(std::move(error_res));
}
return models->proxy_request(req, method, name, false);
};
@@ -5150,7 +5153,10 @@ public:
std::string method = "POST";
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
models->ensure_model_loaded(name);
auto error_res = std::make_unique<server_res_generator>(ctx_server);
if (!router_validate_model(name, error_res)) {
return std::unique_ptr<server_http_res>(std::move(error_res));
}
return models->proxy_request(req, method, name, true); // update last usage for POST request only
};
@@ -5158,21 +5164,23 @@ public:
auto res = std::make_unique<server_res_generator>(ctx_server);
json body = json::parse(req.body);
std::string name = json_value(body, "model", std::string());
std::vector<std::string> extra_args = json_value(body, "extra_args", std::vector<std::string>());
auto model = models->get_meta(name);
if (!model.has_value()) {
res->error(format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST));
res->error(format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
return res;
}
if (model->status == SERVER_MODEL_STATUS_LOADED) {
res->error(format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
return res;
}
models->load(name);
models->load(name, extra_args, false);
res->ok({{"success", true}});
return res;
};

// used by child process to notify the router about status change
// TODO @ngxson : maybe implement authentication for this endpoint in the future
server_http_context::handler_t post_router_models_status = [this](const server_http_req & req) {
auto res = std::make_unique<server_res_generator>(ctx_server);
json body = json::parse(req.body);
@@ -5187,19 +5195,32 @@ public:
auto res = std::make_unique<server_res_generator>(ctx_server);
json models_json = json::array();
auto all_models = models->get_all_meta();
for (const auto & model : all_models) {
std::time_t t = std::time(0);
for (const auto & meta : all_models) {
json status {
{"value", server_model_status_to_string(meta.status)},
{"args", meta.args},
};
if (meta.is_failed()) {
status["exit_code"] = meta.exit_code;
status["failed"] = true;
}
models_json.push_back(json {
{"name", model.name},
{"id", model.name},
{"in_cache", model.in_cache},
{"path", model.path},
// TODO: other fields...
{"status", {
{"value", server_model_status_to_string(model.status)}
}},
{"id", meta.name},
{"name", meta.name},
{"object", "model"}, // for OAI-compat
{"owned_by", "llamacpp"}, // for OAI-compat
{"created", t}, // for OAI-compat
{"in_cache", meta.in_cache},
{"path", meta.path},
{"status", status},
// TODO: add other fields, may require reading GGUF metadata
});
}
res->ok({{"data", models_json}});
res->ok({
{"data", models_json},
{"object", "list"},
});
return res;
};
@@ -5571,6 +5592,27 @@ private:
res->ok(root);
return res;
}

bool router_validate_model(const std::string & name, std::unique_ptr<server_res_generator> & res) {
if (name.empty()) {
res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
return false;
}
auto meta = models->get_meta(name);
if (!meta.has_value()) {
res->error(format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST));
return false;
}
if (params.models_autoload) {
models->ensure_model_loaded(name);
} else {
if (meta->status != SERVER_MODEL_STATUS_LOADED) {
res->error(format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
return false;
}
}
return true;
}
};

std::function<void(int)> shutdown_handler;
@@ -5669,7 +5711,10 @@ int main(int argc, char ** argv, char ** envp) {
routes.models.reset(new server_models(params, argc, argv, envp));

// proxy handlers
// note: routes.get_health stays the same
routes.get_metrics = routes.proxy_get;
routes.post_props = routes.proxy_post;
routes.get_api_show = routes.proxy_get;
routes.post_completions = routes.proxy_post;
routes.post_completions_oai = routes.proxy_post;
routes.post_chat_completions = routes.proxy_post;
@@ -5815,6 +5860,8 @@ int main(int argc, char ** argv, char ** envp) {

if (is_router_server) {
LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
LOG_INF("%s: NOTE: router mode is experimental\n", __func__);
LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__);
ctx_http.is_ready.store(true);
if (ctx_http.thread.joinable()) {
ctx_http.thread.join(); // keep the main thread alive
@@ -5829,7 +5876,7 @@ int main(int argc, char ** argv, char ** envp) {
// optionally, notify router server that this instance is ready
const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT");
if (router_port != nullptr) {
server_models::setup_child_server(params.hostname, std::atoi(router_port), params.model_alias, shutdown_handler);
server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler);
}

// this call blocks the main thread until queue_tasks.terminate() is called