diff --git a/common/arg.cpp b/common/arg.cpp index eab26b67f2..062046c0d0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2482,12 +2482,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_DIR")); add_opt(common_arg( - {"--max-models"}, "N", - string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.max_models), + {"--models-max"}, "N", + string_format("for router server, maximum number of models to load simultaneously (default: %d, 0 = unlimited)", params.models_max), [](common_params & params, int value) { - params.max_models = value; + params.models_max = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MAX_MODELS")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); + add_opt(common_arg( + {"--no-models-autoload"}, + "disables automatic loading of models (default: enabled)", + [](common_params & params) { + params.models_autoload = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD")); add_opt(common_arg( {"--jinja"}, "use jinja template for chat (default: disabled)", diff --git a/common/common.h b/common/common.h index 20ba209ce4..4ac9700d7b 100644 --- a/common/common.h +++ b/common/common.h @@ -460,7 +460,8 @@ struct common_params { // router server configs std::string models_dir = ""; // directory containing models for the router server - int max_models = 4; // maximum number of models to load simultaneously + int models_max = 4; // maximum number of models to load simultaneously + bool models_autoload = true; // automatically load models when requested via the router server bool log_json = false; diff --git a/tools/server/README.md b/tools/server/README.md index 3e311a657c..24984d8696 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -30,9 +30,10 @@ The project is under active development, and we are [looking for feedback and co | -------- | ----------- | | `-h, --help, --usage` | print usage and exit | | `--version` | show version and build info | +| `-cl, --cache-list` | show list of models in cache | | `--completion-bash` | print source-able bash completion script for llama.cpp | | `--verbose-prompt` | print a verbose prompt before generation (default: false) | -| `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | +| `-t, --threads N` | number of CPU threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | | `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | @@ -51,7 +52,7 @@ The project is under active development, and we are [looking for feedback and co | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | | `--swa-full` | use full-size SWA cache (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
(env: LLAMA_ARG_SWA_FULL) | | `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)
(env: LLAMA_ARG_KV_SPLIT) | -| `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | +| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
(env: LLAMA_ARG_FLASH_ATTN) | | `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--no-escape` | do not process escape sequences | @@ -61,11 +62,12 @@ The project is under active development, and we are [looking for feedback and co | `--rope-freq-scale N` | RoPE frequency scaling factor, expands context by a factor of 1/N
(env: LLAMA_ARG_ROPE_FREQ_SCALE) | | `--yarn-orig-ctx N` | YaRN: original context size of model (default: 0 = model training context size)
(env: LLAMA_ARG_YARN_ORIG_CTX) | | `--yarn-ext-factor N` | YaRN: extrapolation mix factor (default: -1.0, 0.0 = full interpolation)
(env: LLAMA_ARG_YARN_EXT_FACTOR) | -| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | -| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | -| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | +| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | +| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | +| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | | `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | | `-nr, --no-repack` | disable weight repacking
(env: LLAMA_ARG_NO_REPACK) | +| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | @@ -78,7 +80,7 @@ The project is under active development, and we are [looking for feedback and co | `--override-tensor, -ot =,...` | override tensor buffer type | | `--cpu-moe, -cmoe` | keep all Mixture of Experts (MoE) weights in the CPU
(env: LLAMA_ARG_CPU_MOE) | | `--n-cpu-moe, -ncmoe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU
(env: LLAMA_ARG_N_CPU_MOE) | -| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | +| `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)
(env: LLAMA_ARG_N_GPU_LAYERS) | | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
(env: LLAMA_ARG_SPLIT_MODE) | | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
(env: LLAMA_ARG_TENSOR_SPLIT) | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | @@ -90,8 +92,9 @@ The project is under active development, and we are [looking for feedback and co | `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | | `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | -| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | +| `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | +| `-dr, --docker-repo [/][:quant]` | Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.
example: gemma3
(default: unused)
(env: LLAMA_ARG_DOCKER_REPO) | | `-hf, -hfr, --hf-repo /[:quant]` | Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.
mmproj is also downloaded automatically if available. to disable, add --no-mmproj
example: unsloth/phi-4-GGUF:q4_k_m
(default: unused)
(env: LLAMA_ARG_HF_REPO) | | `-hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_HFD_REPO) | | `-hff, --hf-file FILE` | Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)
(env: LLAMA_ARG_HF_FILE) | @@ -100,7 +103,7 @@ The project is under active development, and we are [looking for feedback and co | `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)
(env: HF_TOKEN) | | `--log-disable` | Log disable | | `--log-file FNAME` | Log to file | -| `--log-colors` | Enable colored logging
(env: LLAMA_LOG_COLORS) | +| `--log-colors [on\|off\|auto]` | Set colored logging ('on', 'off', or 'auto', default: 'auto')
'auto' enables colors when output is to a terminal
(env: LLAMA_LOG_COLORS) | | `-v, --verbose, --log-verbose` | Set verbosity level to infinity (i.e. log all messages, useful for debugging) | | `--offline` | Offline mode: forces use of cache, prevents network access
(env: LLAMA_OFFLINE) | | `-lv, --verbosity, --log-verbosity N` | Set the verbosity threshold. Messages with a higher verbosity will be ignored.
(env: LLAMA_LOG_VERBOSITY) | @@ -151,7 +154,8 @@ The project is under active development, and we are [looking for feedback and co | Argument | Explanation | | -------- | ----------- | -| `--swa-checkpoints N` | max number of SWA checkpoints per slot to create (default: 3)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_SWA_CHECKPOINTS) | +| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | +| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | | `--no-context-shift` | disables context shift on infinite text generation (default: enabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | | `--context-shift` | enables context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode
| @@ -165,6 +169,8 @@ The project is under active development, and we are [looking for feedback and co | `--mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | | `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf
(env: LLAMA_ARG_NO_MMPROJ) | | `--no-mmproj-offload` | do not offload multimodal projector to GPU
(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) | +| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | +| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `--override-tensor-draft, -otd =,...` | override tensor buffer type for draft model | | `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_CPU_MOE_DRAFT) | | `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_N_CPU_MOE_DRAFT) | @@ -189,13 +195,16 @@ The project is under active development, and we are [looking for feedback and co | `--slots` | enable slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | +| `--models-dir PATH` | directory containing models for the router server (default: disabled)
(env: LLAMA_ARG_MODELS_DIR) | +| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)
(env: LLAMA_ARG_MODELS_MAX) | +| `--no-models-autoload` | disables automatic loading of models (default: enabled)
(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) | | `--jinja` | use jinja template for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | -| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: deepseek)
(env: LLAMA_ARG_THINK) | +| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | -| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | -| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | +| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | +| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | -| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
| +| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) | | `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) | @@ -209,15 +218,17 @@ The project is under active development, and we are [looking for feedback and co | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | | `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) | | `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall | -| `--embd-bge-small-en-default` | use default bge-small-en-v1.5 model (note: can download weights from the internet) | -| `--embd-e5-small-en-default` | use default e5-small-v2 model (note: can download weights from the internet) | -| `--embd-gte-small-default` | use default gte-small model (note: can download weights from the internet) | +| `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) | | `--fim-qwen-1.5b-default` | use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet) | | `--fim-qwen-3b-default` | use default Qwen 2.5 Coder 3B (note: can download weights from the internet) | | `--fim-qwen-7b-default` | use default Qwen 2.5 Coder 7B (note: can download weights from the internet) | | `--fim-qwen-7b-spec` | use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet) | | `--fim-qwen-14b-spec` | use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet) | | `--fim-qwen-30b-default` | use default Qwen 3 Coder 30B A3B Instruct (note: can download weights from the internet) | +| `--gpt-oss-20b-default` | use gpt-oss-20b (note: can download weights from the internet) | +| `--gpt-oss-120b-default` | use gpt-oss-120b (note: can download weights from the internet) | +| `--vision-gemma-4b-default` | use Gemma 3 4B QAT (note: can download weights from the internet) | +| `--vision-gemma-12b-default` | use Gemma 3 12B QAT (note: can download weights from the internet) | Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. @@ -1385,14 +1396,14 @@ models_directory │ └─ mmproj-F16.gguf # file name must start with "mmproj" │ │ # multi-shard - ├─ gemma-3-4b-it-Q8_0 + ├─ Kimi-K2-Thinking-UD-IQ1_S │ ├─ Kimi-K2-Thinking-UD-IQ1_S-00001-of-00006.gguf │ ├─ Kimi-K2-Thinking-UD-IQ1_S-00002-of-00006.gguf │ ├─ ... │ └─ Kimi-K2-Thinking-UD-IQ1_S-00006-of-00006.gguf ``` -You may also specify default arguments that will be passed to every loaded model instance: +You may also specify default arguments that will be passed to every model instance: ```sh llama-server -ctx 8192 -n 1024 -np 2 @@ -1424,6 +1435,8 @@ For **GET** endpoints (`/props`, `/metrics`, etc.) The router uses the `model` q GET /props?model=ggml-org%2Fgemma-3-4b-it-GGUF%3AQ4_K_M ``` +By default, the model will be loaded automatically if it's not loaded. To disable this, add `--no-models-autoload` when starting the server. + ### GET `/models`: List available models Listing all models in cache. The model metadata will also include a field to indicate the status of the model: @@ -1436,7 +1449,8 @@ Listing all models in cache. 
The model metadata will also include a field to ind "in_cache": true, "path": "/Users/REDACTED/Library/Caches/llama.cpp/ggml-org_gemma-3-4b-it-GGUF_gemma-3-4b-it-Q4_K_M.gguf", "status": { - "value": "loaded" + "value": "loaded", + "args": ["llama-server", "-ctx", "4096"] }, ... }] @@ -1455,32 +1469,39 @@ The `status` object can be: ```json "status": { - "value": "loading" + "value": "loading", + "args": ["llama-server", "-ctx", "4096"] } ``` ```json "status": { - "value": "failed" + "value": "unloaded", + "args": ["llama-server", "-ctx", "4096"], + "failed": true, + "exit_code": 1 } ``` ```json "status": { - "value": "loaded" + "value": "loaded", + "args": ["llama-server", "-ctx", "4096"] } ``` ### POST `/models/load`: Load a model - Load a model Payload: +- `model`: name of the model to be loaded +- `extra_args`: (optional) an array of additional arguments to be passed to the model instance ```json { - "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M" + "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", + "extra_args": ["-n", "128", "--top-k", "4"] } ``` @@ -1494,7 +1515,6 @@ Response: ### POST `/models/unload`: Unload a model - Unload a model Payload: diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 071b5522ea..6ab0a9c226 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -29,6 +29,8 @@ #include #endif +#define CMD_EXIT "exit" + static std::filesystem::path get_server_exec_path() { #if defined(_WIN32) wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths @@ -154,7 +156,9 @@ server_models::server_models( /* in_cache */ true, /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, - /* last_used */ 0 + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0 }; mapping[meta.name] = instance_t{ /* subproc */ std::make_shared(), @@ -177,7 +181,9 @@ server_models::server_models( /* in_cache */ false, /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, - /* last_used */ 0 + /* last_used */ 0, + /* args */ std::vector(), + /* exit_code */ 0 }; mapping[meta.name] = instance_t{ /* subproc */ std::make_shared(), @@ -293,10 +299,10 @@ std::vector server_models::get_all_meta() { } void server_models::unload_lru() { - if (base_params.max_models <= 0) { + if (base_params.models_max <= 0) { return; // no limit } - // remove one of the servers if we passed the max_models (least recently used - LRU) + // remove one of the servers if we passed the models_max (least recently used - LRU) std::string lru_model_name = ""; int64_t lru_last_used = ggml_time_ms(); size_t count_active = 0; @@ -312,13 +318,13 @@ void server_models::unload_lru() { } } } - if (!lru_model_name.empty() && count_active >= (size_t)base_params.max_models) { - SRV_INF("max_models limit reached, removing LRU name=%s\n", lru_model_name.c_str()); + if (!lru_model_name.empty() && count_active >= (size_t)base_params.models_max) { + SRV_INF("models_max limit reached, removing LRU name=%s\n", lru_model_name.c_str()); unload(lru_model_name); } } -void server_models::load(const std::string & name) { +void server_models::load(const std::string & name, const std::vector & extra_args, bool auto_load) { if (!has_model(name)) { throw std::runtime_error("model name=" + name + " is not found"); } @@ -327,7 +333,7 @@ void server_models::load(const std::string & name) { std::lock_guard lk(mutex); auto meta = mapping[name].meta; - if (meta.status != SERVER_MODEL_STATUS_FAILED && meta.status != SERVER_MODEL_STATUS_UNLOADED) { + if (meta.status != 
SERVER_MODEL_STATUS_UNLOADED) { SRV_INF("model %s is not ready\n", name.c_str()); return; } @@ -348,31 +354,48 @@ void server_models::load(const std::string & name) { std::string exec_path = get_server_exec_path().string(); SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); - std::vector child_args = base_args; // copy - if (inst.meta.in_cache) { - child_args.push_back("-hf"); - child_args.push_back(inst.meta.name); + std::vector child_args; + if (auto_load && !meta.args.empty()) { + child_args = meta.args; // reuse previous args + // update port arg + for (size_t i = 0; i < child_args.size(); i++) { + if (child_args[i] == "--port" && i + 1 < child_args.size()) { + child_args[i + 1] = std::to_string(inst.meta.port); + break; + } + } } else { - child_args.push_back("-m"); - child_args.push_back(inst.meta.path); - if (!inst.meta.path_mmproj.empty()) { - child_args.push_back("--mmproj"); - child_args.push_back(inst.meta.path_mmproj); + child_args = base_args; // copy + if (inst.meta.in_cache) { + child_args.push_back("-hf"); + child_args.push_back(inst.meta.name); + } else { + child_args.push_back("-m"); + child_args.push_back(inst.meta.path); + if (!inst.meta.path_mmproj.empty()) { + child_args.push_back("--mmproj"); + child_args.push_back(inst.meta.path_mmproj); + } + } + child_args.push_back("--alias"); + child_args.push_back(inst.meta.name); + child_args.push_back("--port"); + child_args.push_back(std::to_string(inst.meta.port)); + + // append extra args + for (const auto & arg : extra_args) { + child_args.push_back(arg); } } - child_args.push_back("--alias"); - child_args.push_back(inst.meta.name); - child_args.push_back("--port"); - child_args.push_back(std::to_string(inst.meta.port)); std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); - // TODO: add logging SRV_INF("%s", "spawning server instance with args:\n"); for (const auto & arg : child_args) { SRV_INF(" %s\n", arg.c_str()); } + inst.meta.args = child_args; // save for debugging std::vector argv = to_char_ptr_array(child_args); std::vector envp = to_char_ptr_array(child_env); @@ -385,6 +408,7 @@ void server_models::load(const std::string & name) { } // start a thread to manage the child process + // captured variables are guaranteed to be destroyed only after the thread is joined inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port]() { // read stdout/stderr and forward to main server log FILE * p_stdout_stderr = subprocess_stdout(child_proc.get()); @@ -405,31 +429,49 @@ void server_models::load(const std::string & name) { std::lock_guard lk(mutex); auto it = mapping.find(name); if (it != mapping.end()) { - it->second.meta.status = exit_code == 0 - ? 
SERVER_MODEL_STATUS_UNLOADED - : SERVER_MODEL_STATUS_FAILED; + auto & meta = it->second.meta; + meta.exit_code = exit_code; + meta.status = SERVER_MODEL_STATUS_UNLOADED; } cv.notify_all(); } SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code); }); - // clean up old thread if exists - if (mapping[name].th.joinable()) { - mapping[name].th.join(); + // clean up old process/thread if exists + { + auto & old_instance = mapping[name]; + // old process should have exited already, but just in case, we clean it up here + if (subprocess_alive(old_instance.subproc.get())) { + SRV_WRN("old process for model name=%s is still alive, this is unexpected\n", name.c_str()); + subprocess_terminate(old_instance.subproc.get()); // force kill + } + if (old_instance.th.joinable()) { + old_instance.th.join(); + } } mapping[name] = std::move(inst); cv.notify_all(); } +static void interrupt_subprocess(subprocess_s * proc) { + // because subprocess.h does not provide a way to send SIGINT, + // we will send a command to the child process to exit gracefully + FILE * p_stdin = subprocess_stdin(proc); + if (p_stdin) { + fprintf(p_stdin, "%s\n", CMD_EXIT); + fflush(p_stdin); + } +} + void server_models::unload(const std::string & name) { std::lock_guard lk(mutex); auto it = mapping.find(name); if (it != mapping.end()) { if (it->second.meta.is_active()) { SRV_INF("unloading model instance name=%s\n", name.c_str()); - subprocess_terminate(it->second.subproc.get()); + interrupt_subprocess(it->second.subproc.get()); // status change will be handled by the managing thread } else { SRV_WRN("model instance name=%s is not loaded\n", name.c_str()); @@ -444,7 +486,7 @@ void server_models::unload_all() { for (auto & [name, inst] : mapping) { if (inst.meta.is_active()) { SRV_INF("unloading model instance name=%s\n", name.c_str()); - subprocess_terminate(inst.subproc.get()); + interrupt_subprocess(inst.subproc.get()); // status change will be handled by the managing thread } // moving the thread to join list to avoid deadlock @@ -459,6 +501,10 @@ void server_models::unload_all() { } void server_models::update_status(const std::string & name, server_model_status status) { + // for now, we only allow updating to LOADED status + if (status != SERVER_MODEL_STATUS_LOADED) { + throw std::runtime_error("invalid status value"); + } auto meta = get_meta(name); if (meta.has_value()) { meta->status = status; @@ -471,8 +517,7 @@ void server_models::wait_until_loaded(const std::string & name) { cv.wait(lk, [this, &name]() { auto it = mapping.find(name); if (it != mapping.end()) { - return it->second.meta.status == SERVER_MODEL_STATUS_LOADED || - it->second.meta.status == SERVER_MODEL_STATUS_FAILED; + return it->second.meta.status != SERVER_MODEL_STATUS_LOADING; } return false; }); @@ -483,19 +528,23 @@ bool server_models::ensure_model_loaded(const std::string & name) { if (!meta.has_value()) { throw std::runtime_error("model name=" + name + " is not found"); } - if (meta->is_active()) { + if (meta->status == SERVER_MODEL_STATUS_LOADED) { return false; // already loaded } - SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); - load(name); - wait_until_loaded(name); - { - // check final status - meta = get_meta(name); - if (!meta.has_value() || meta->status == SERVER_MODEL_STATUS_FAILED) { - throw std::runtime_error("model name=" + name + " failed to load"); - } + if (meta->status == SERVER_MODEL_STATUS_UNLOADED) { + SRV_INF("model name=%s is not loaded, loading...\n", name.c_str()); + load(name, {}, true); 
} + + SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str()); + wait_until_loaded(name); + + // check final status + meta = get_meta(name); + if (!meta.has_value() || meta->is_failed()) { + throw std::runtime_error("model name=" + name + " failed to load"); + } + return true; } @@ -523,15 +572,18 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co return proxy; } -void server_models::setup_child_server(const std::string & host, int router_port, const std::string & name, std::function & shutdown_handler) { +void server_models::setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler) { // send a notification to the router server that a model instance is ready - httplib::Client cli(host, router_port); + httplib::Client cli(base_params.hostname, router_port); cli.set_connection_timeout(0, 200000); // 200 milliseconds httplib::Request req; req.method = "POST"; req.path = "/models/status"; req.set_header("Content-Type", "application/json"); + if (!base_params.api_keys.empty()) { + req.set_header("Authorization", "Bearer " + base_params.api_keys[0]); + } json body; body["model"] = name; @@ -543,22 +595,31 @@ void server_models::setup_child_server(const std::string & host, int router_port if (result.error() != httplib::Error::Success) { auto err_str = httplib::to_string(result.error()); SRV_ERR("failed to notify router server: %s\n", err_str.c_str()); - // TODO: maybe force shutdown here? + exit(1); // force exit } // setup thread for monitoring stdin - // when EOF is detected, that means the router server requested shutdown, or the parent process died std::thread([shutdown_handler]() { // wait for EOF on stdin SRV_INF("%s", "child server monitoring thread started, waiting for EOF on stdin...\n"); + bool eof = false; while (true) { - int c = getchar(); - if (c == EOF) { + std::string line; + if (!std::getline(std::cin, line)) { + // EOF detected, that means the router server is unexpectedly exit or killed + eof = true; + break; + } + if (line.find(CMD_EXIT) != std::string::npos) { + SRV_INF("%s", "exit command received, exiting...\n"); + shutdown_handler(0); break; } } - SRV_INF("%s", "EOF on stdin detected, invoking shutdown handler...\n"); - shutdown_handler(0); // invoke shutdown handler + if (eof) { + SRV_INF("%s", "EOF on stdin detected, forcing shutdown...\n"); + exit(1); + } }).detach(); } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 3cb3b39fe7..c49cb7c62c 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -15,16 +15,16 @@ * state diagram: * * UNLOADED ──► LOADING ──► LOADED - * ▲ │ - * │ │ - * FAILED ◄───────┘ + * ▲ │ │ + * └───failed───┘ │ + * ▲ │ + * └────────unloaded─────────┘ */ enum server_model_status { - // TODO: also add downloading state + // TODO: also add downloading state when the logic is added SERVER_MODEL_STATUS_UNLOADED, SERVER_MODEL_STATUS_LOADING, - SERVER_MODEL_STATUS_LOADED, - SERVER_MODEL_STATUS_FAILED + SERVER_MODEL_STATUS_LOADED }; static server_model_status server_model_status_from_string(const std::string & status_str) { @@ -34,8 +34,6 @@ static server_model_status server_model_status_from_string(const std::string & s return SERVER_MODEL_STATUS_LOADING; } else if (status_str == "loaded") { return SERVER_MODEL_STATUS_LOADED; - } else if (status_str == "failed") { - return SERVER_MODEL_STATUS_FAILED; } else { throw std::runtime_error("invalid server model status"); } @@ -46,7 +44,6 
@@ static std::string server_model_status_to_string(server_model_status status) { case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; case SERVER_MODEL_STATUS_LOADING: return "loading"; case SERVER_MODEL_STATUS_LOADED: return "loaded"; - case SERVER_MODEL_STATUS_FAILED: return "failed"; default: return "unknown"; } } @@ -58,11 +55,17 @@ struct server_model_meta { bool in_cache = false; // if true, use -hf; use -m otherwise int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; - int64_t last_used = 0; + int64_t last_used = 0; // for LRU unloading + std::vector args; // additional args passed to the model instance (used for debugging) + int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) bool is_active() const { return status == SERVER_MODEL_STATUS_LOADED || status == SERVER_MODEL_STATUS_LOADING; } + + bool is_failed() const { + return status == SERVER_MODEL_STATUS_UNLOADED && exit_code != 0; + } }; struct server_models { @@ -98,7 +101,8 @@ public: // return a copy of all model metadata std::vector get_all_meta(); - void load(const std::string & name); + // if auto_load is true, load the model with previous args if any + void load(const std::string & name, const std::vector & extra_args, bool auto_load); void unload(const std::string & name); void unload_all(); @@ -117,7 +121,7 @@ public: server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used); // notify the router server that a model instance is ready - static void setup_child_server(const std::string & host, int router_port, const std::string & name, std::function & shutdown_handler); + static void setup_child_server(const common_params & base_params, int router_port, const std::string & name, std::function & shutdown_handler); }; /** diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 43d145fb67..6869c7826d 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5142,7 +5142,10 @@ public: server_http_context::handler_t proxy_get = [this](const server_http_req & req) { std::string method = "GET"; std::string name = req.get_param("model"); - models->ensure_model_loaded(name); + auto error_res = std::make_unique(ctx_server); + if (!router_validate_model(name, error_res)) { + return std::unique_ptr(std::move(error_res)); + } return models->proxy_request(req, method, name, false); }; @@ -5150,7 +5153,10 @@ public: std::string method = "POST"; json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); - models->ensure_model_loaded(name); + auto error_res = std::make_unique(ctx_server); + if (!router_validate_model(name, error_res)) { + return std::unique_ptr(std::move(error_res)); + } return models->proxy_request(req, method, name, true); // update last usage for POST request only }; @@ -5158,21 +5164,23 @@ public: auto res = std::make_unique(ctx_server); json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); + std::vector extra_args = json_value(body, "extra_args", std::vector()); auto model = models->get_meta(name); if (!model.has_value()) { - res->error(format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST)); + res->error(format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); return res; } if (model->status == SERVER_MODEL_STATUS_LOADED) { res->error(format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - 
models->load(name); + models->load(name, extra_args, false); res->ok({{"success", true}}); return res; }; // used by child process to notify the router about status change + // TODO @ngxson : maybe implement authentication for this endpoint in the future server_http_context::handler_t post_router_models_status = [this](const server_http_req & req) { auto res = std::make_unique(ctx_server); json body = json::parse(req.body); @@ -5187,19 +5195,32 @@ public: auto res = std::make_unique(ctx_server); json models_json = json::array(); auto all_models = models->get_all_meta(); - for (const auto & model : all_models) { + std::time_t t = std::time(0); + for (const auto & meta : all_models) { + json status { + {"value", server_model_status_to_string(meta.status)}, + {"args", meta.args}, + }; + if (meta.is_failed()) { + status["exit_code"] = meta.exit_code; + status["failed"] = true; + } models_json.push_back(json { - {"name", model.name}, - {"id", model.name}, - {"in_cache", model.in_cache}, - {"path", model.path}, - // TODO: other fields... - {"status", { - {"value", server_model_status_to_string(model.status)} - }}, + {"id", meta.name}, + {"name", meta.name}, + {"object", "model"}, // for OAI-compat + {"owned_by", "llamacpp"}, // for OAI-compat + {"created", t}, // for OAI-compat + {"in_cache", meta.in_cache}, + {"path", meta.path}, + {"status", status}, + // TODO: add other fields, may require reading GGUF metadata }); } - res->ok({{"data", models_json}}); + res->ok({ + {"data", models_json}, + {"object", "list"}, + }); return res; }; @@ -5571,6 +5592,27 @@ private: res->ok(root); return res; } + + bool router_validate_model(const std::string & name, std::unique_ptr & res) { + if (name.empty()) { + res->error(format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + auto meta = models->get_meta(name); + if (!meta.has_value()) { + res->error(format_error_response("model not found", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + if (params.models_autoload) { + models->ensure_model_loaded(name); + } else { + if (meta->status != SERVER_MODEL_STATUS_LOADED) { + res->error(format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); + return false; + } + } + return true; + } }; std::function shutdown_handler; @@ -5669,7 +5711,10 @@ int main(int argc, char ** argv, char ** envp) { routes.models.reset(new server_models(params, argc, argv, envp)); // proxy handlers + // note: routes.get_health stays the same + routes.get_metrics = routes.proxy_get; routes.post_props = routes.proxy_post; + routes.get_api_show = routes.proxy_get; routes.post_completions = routes.proxy_post; routes.post_completions_oai = routes.proxy_post; routes.post_chat_completions = routes.proxy_post; @@ -5815,6 +5860,8 @@ int main(int argc, char ** argv, char ** envp) { if (is_router_server) { LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); + LOG_INF("%s: NOTE: router mode is experimental\n", __func__); + LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__); ctx_http.is_ready.store(true); if (ctx_http.thread.joinable()) { ctx_http.thread.join(); // keep the main thread alive @@ -5829,7 +5876,7 @@ int main(int argc, char ** argv, char ** envp) { // optionally, notify router server that this instance is ready const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); if (router_port != nullptr) { - server_models::setup_child_server(params.hostname, 
std::atoi(router_port), params.model_alias, shutdown_handler); + server_models::setup_child_server(params, std::atoi(router_port), params.model_alias, shutdown_handler); } // this call blocks the main thread until queue_tasks.terminate() is called
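
A minimal usage sketch for the router endpoints documented in the README changes above. The port (8080), the `./models` directory, and the model name are illustrative assumptions; the flags (`--models-dir`, `--models-max`) and the payload fields (`model`, `extra_args`) are the ones introduced in this patch.

```sh
# Start the router over a directory of GGUF models (assumed layout),
# keeping at most two instances resident at a time.
llama-server --models-dir ./models --models-max 2

# Explicitly load one model, passing per-instance extra arguments
# (the `extra_args` array added to POST /models/load in this patch).
curl -X POST http://localhost:8080/models/load \
  -H "Content-Type: application/json" \
  -d '{"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", "extra_args": ["-n", "128", "--top-k", "4"]}'

# List models; each `status` object now carries the spawn `args`, and a
# failed instance reports `"failed": true` together with its `exit_code`.
curl http://localhost:8080/models
```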
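
A second sketch for the `--no-models-autoload` flag. With autoload disabled, `router_validate_model` in server.cpp rejects requests that name a model which is not currently loaded instead of spawning it on demand; again, the port and model name are assumptions, and the unload payload is assumed to use the same `model` field as the load endpoint.

```sh
# Router without automatic loading: models must be loaded via POST /models/load first.
llama-server --models-dir ./models --no-models-autoload

# Expected to be rejected with a "model is not loaded" error until the model
# has been loaded explicitly.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", "messages": [{"role": "user", "content": "hi"}]}'

# Unload the instance when done (assumed payload shape).
curl -X POST http://localhost:8080/models/unload \
  -H "Content-Type: application/json" \
  -d '{"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"}'
```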