From 2e7e6385230ede73135191e83c81cc9f138c2d65 Mon Sep 17 00:00:00 2001 From: Pascal Date: Fri, 27 Feb 2026 07:05:23 +0100 Subject: [PATCH] server : support multiple model aliases via comma-separated --alias (#19926) * server : support multiple model aliases via comma-separated --alias * server : update --alias description and regenerate docs * server : multiple model aliases and tags - address review feedback from ngxson - --alias accepts comma-separated values (std::set, no duplicates) - --tags for informational metadata (not used for routing) - aliases resolve transparently in router via get_meta/has_model - /v1/models exposes aliases and tags fields * regenerate docs * nits * server : use first alias as model_name for backward compat address review feedback from ngxson * server : add single-model test for aliases and tags --- common/arg.cpp | 21 +++++- common/common.h | 3 +- tools/cli/README.md | 10 +-- tools/completion/README.md | 10 +-- tools/server/README.md | 21 ++++-- tools/server/server-context.cpp | 14 +++- tools/server/server-context.h | 3 + tools/server/server-models.cpp | 99 ++++++++++++++++++++++++--- tools/server/server-models.h | 2 + tools/server/server.cpp | 2 +- tools/server/tests/unit/test_basic.py | 17 +++++ tools/server/tests/utils.py | 3 + 12 files changed, 173 insertions(+), 32 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1e8885c9ca..05f4a5244e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2520,11 +2520,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"-a", "--alias"}, "STRING", - "set alias for model name (to be used by REST API)", + "set model name aliases, comma-separated (to be used by API)", [](common_params & params, const std::string & value) { - params.model_alias = value; + for (auto & alias : string_split(value, ',')) { + alias = string_strip(alias); + if (!alias.empty()) { + params.model_alias.insert(alias); + } + } } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); + add_opt(common_arg( + {"--tags"}, "STRING", + "set model tags, comma-separated (informational, not used for routing)", + [](common_params & params, const std::string & value) { + for (auto & tag : string_split(value, ',')) { + tag = string_strip(tag); + if (!tag.empty()) { + params.model_tags.insert(tag); + } + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS")); add_opt(common_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA diff --git a/common/common.h b/common/common.h index 1fa1728656..c5a8037571 100644 --- a/common/common.h +++ b/common/common.h @@ -410,7 +410,8 @@ struct common_params { struct common_params_model model; - std::string model_alias = ""; // model alias // NOLINT + std::set model_alias; // model aliases // NOLINT + std::set model_tags; // model tags (informational, not used for routing) // NOLINT std::string hf_token = ""; // HF token // NOLINT std::string prompt = ""; // NOLINT std::string system_prompt = ""; // NOLINT diff --git a/tools/cli/README.md b/tools/cli/README.md index 4a15cbad9d..22d3fc87e9 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -57,8 +57,8 @@ | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -109,14 +109,14 @@ | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | diff --git a/tools/completion/README.md b/tools/completion/README.md index 3ca3e68454..bcc0887659 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -140,8 +140,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -192,14 +192,14 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | diff --git a/tools/server/README.md b/tools/server/README.md index 34b722a27c..da16ddc756 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -74,8 +74,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -126,14 +126,14 @@ For the full list of features, please refer to [server's changelog](https://gith | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | @@ -162,9 +162,11 @@ For the full list of features, please refer to [server's changelog](https://gith | Argument | Explanation | | -------- | ----------- | +| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | +| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | -| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | +| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | | `-sp, --special` | special tokens output enabled (default: false) | @@ -182,7 +184,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_CPU_MOE_DRAFT) | | `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_N_CPU_MOE_DRAFT) | -| `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_ALIAS) | +| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)
(env: LLAMA_ARG_ALIAS) | +| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)
(env: LLAMA_ARG_TAGS) | | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | @@ -229,6 +232,10 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | +| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) | +| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) | +| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) | +| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) | | `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) | | `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall | | `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) | diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index eba463e4da..aafed49502 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -580,6 +580,8 @@ private: float slot_prompt_similarity = 0.0f; std::string model_name; // name of the loaded model, to be used by API + std::set model_aliases; // additional names for the model + std::set model_tags; // informational tags bool sleeping = false; @@ -813,10 +815,9 @@ private: SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); if (!params_base.model_alias.empty()) { - // user explicitly specified model name - model_name = params_base.model_alias; + // backward compat: use first alias as model name + model_name = *params_base.model_alias.begin(); } else if (!params_base.model.name.empty()) { - // use model name in registry format (for models in cache) model_name = params_base.model.name; } else { // fallback: derive model name from file name @@ -824,6 +825,9 @@ private: model_name = model_path.filename().string(); } + model_aliases = params_base.model_alias; + model_tags = params_base.model_tags; + if (!is_resume) { return init(); } @@ -2892,6 +2896,8 @@ server_context_meta server_context::get_meta() const { return server_context_meta { /* build_info */ build_info, /* model_name */ impl->model_name, + /* model_aliases */ impl->model_aliases, + /* model_tags */ impl->model_tags, /* model_path */ impl->params_base.model.path, /* has_mtmd */ impl->mctx != nullptr, /* has_inp_image */ impl->chat_params.allow_image, @@ -3688,6 +3694,8 @@ void server_routes::init_routes() { {"data", { { {"id", meta->model_name}, + {"aliases", meta->model_aliases}, + {"tags", meta->model_tags}, {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 03c29f513b..631d573fcb 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -6,12 +6,15 @@ #include #include +#include struct server_context_impl; // private implementation struct server_context_meta { std::string build_info; std::string model_name; + std::set model_aliases; + std::set model_tags; std::string model_path; bool has_mtmd; bool has_inp_image; diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index e162547799..bc601237b7 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -184,6 +184,51 @@ void server_models::add_model(server_model_meta && meta) { if (mapping.find(meta.name) != mapping.end()) { throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str())); } + + // check model name does not conflict with existing aliases + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(meta.name)) { + throw std::runtime_error(string_format("model name '%s' conflicts with alias of model '%s'", + meta.name.c_str(), key.c_str())); + } + } + + // parse aliases from preset's --alias option (comma-separated) + std::string alias_str; + if (meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) { + for (auto & alias : string_split(alias_str, ',')) { + alias = string_strip(alias); + if (!alias.empty()) { + meta.aliases.insert(alias); + } + } + } + + // parse tags from preset's --tags option (comma-separated) + std::string tags_str; + if (meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) { + for (auto & tag : string_split(tags_str, ',')) { + tag = string_strip(tag); + if (!tag.empty()) { + meta.tags.insert(tag); + } + } + } + + // validate aliases do not conflict with existing names or aliases + for (const auto & alias : meta.aliases) { + if (mapping.find(alias) != mapping.end()) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with existing model name", + alias.c_str(), meta.name.c_str())); + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(alias)) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with alias of model '%s'", + alias.c_str(), meta.name.c_str(), key.c_str())); + } + } + } + meta.update_args(ctx_preset, bin_path); // render args std::string name = meta.name; mapping[name] = instance_t{ @@ -249,6 +294,8 @@ void server_models::load_models() { server_model_meta meta{ /* preset */ preset.second, /* name */ preset.first, + /* aliases */ {}, + /* tags */ {}, /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, @@ -265,10 +312,28 @@ void server_models::load_models() { for (const auto & [name, preset] : custom_presets) { custom_names.insert(name); } + auto join_set = [](const std::set & s) { + std::string result; + for (const auto & v : s) { + if (!result.empty()) { + result += ", "; + } + result += v; + } + return result; + }; + SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); for (const auto & [name, inst] : mapping) { bool has_custom = custom_names.find(name) != custom_names.end(); - SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str()); + std::string info; + if (!inst.meta.aliases.empty()) { + info += " (aliases: " + join_set(inst.meta.aliases) + ")"; + } + if (!inst.meta.tags.empty()) { + info += " [tags: " + join_set(inst.meta.tags) + "]"; + } + SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str()); } } @@ -320,7 +385,15 @@ void server_models::update_meta(const std::string & name, const server_model_met bool server_models::has_model(const std::string & name) { std::lock_guard lk(mutex); - return mapping.find(name) != mapping.end(); + if (mapping.find(name) != mapping.end()) { + return true; + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(name)) { + return true; + } + } + return false; } std::optional server_models::get_meta(const std::string & name) { @@ -329,6 +402,11 @@ std::optional server_models::get_meta(const std::string & nam if (it != mapping.end()) { return it->second.meta; } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(name)) { + return inst.meta; + } + } return std::nullopt; } @@ -766,7 +844,7 @@ static void res_err(std::unique_ptr & res, const json & error_d res->data = safe_json_to_str({{ "error", error_data }}); } -static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { +static bool router_validate_model(std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { if (name.empty()) { res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); return false; @@ -776,6 +854,8 @@ static bool router_validate_model(const std::string & name, server_models & mode res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST)); return false; } + // resolve alias to canonical model name + name = meta->name; if (models_autoload) { models.ensure_model_loaded(name); } else { @@ -847,16 +927,16 @@ void server_models_routes::init_routes() { auto res = std::make_unique(); json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); - auto model = models.get_meta(name); - if (!model.has_value()) { + auto meta = models.get_meta(name); + if (!meta.has_value()) { res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); return res; } - if (model->status == SERVER_MODEL_STATUS_LOADED) { + if (meta->status == SERVER_MODEL_STATUS_LOADED) { res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.load(name); + models.load(meta->name); res_ok(res, {{"success", true}}); return res; }; @@ -877,6 +957,7 @@ void server_models_routes::init_routes() { preset_copy.unset_option("LLAMA_ARG_HOST"); preset_copy.unset_option("LLAMA_ARG_PORT"); preset_copy.unset_option("LLAMA_ARG_ALIAS"); + preset_copy.unset_option("LLAMA_ARG_TAGS"); status["preset"] = preset_copy.to_ini(); } if (meta.is_failed()) { @@ -885,6 +966,8 @@ void server_models_routes::init_routes() { } models_json.push_back(json { {"id", meta.name}, + {"aliases", meta.aliases}, + {"tags", meta.tags}, {"object", "model"}, // for OAI-compat {"owned_by", "llamacpp"}, // for OAI-compat {"created", t}, // for OAI-compat @@ -912,7 +995,7 @@ void server_models_routes::init_routes() { res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.unload(name); + models.unload(model->name); res_ok(res, {{"success", true}}); return res; }; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index a397abda4a..78abc8d72a 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -52,6 +52,8 @@ static std::string server_model_status_to_string(server_model_status status) { struct server_model_meta { common_preset preset; std::string name; + std::set aliases; // additional names that resolve to this model + std::set tags; // informational tags, not used for routing int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d3d4316026..542b984534 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { // for consistency between server router mode and single-model mode, we set the same model name as alias if (params.model_alias.empty() && !params.model.name.empty()) { - params.model_alias = params.model.name; + params.model_alias.insert(params.model.name); } common_init(); diff --git a/tools/server/tests/unit/test_basic.py b/tools/server/tests/unit/test_basic.py index 3405be3e25..d1b89cf1a9 100644 --- a/tools/server/tests/unit/test_basic.py +++ b/tools/server/tests/unit/test_basic.py @@ -94,3 +94,20 @@ def test_no_webui(): server.start() res = requests.get(url) assert res.status_code == 404 + + +def test_server_model_aliases_and_tags(): + global server + server.model_alias = "tinyllama-2,fim,code" + server.model_tags = "chat,fim,small" + server.start() + res = server.make_request("GET", "/models") + assert res.status_code == 200 + assert len(res.body["data"]) == 1 + model = res.body["data"][0] + # aliases field must contain all aliases + assert set(model["aliases"]) == {"tinyllama-2", "fim", "code"} + # tags field must contain all tags + assert set(model["tags"]) == {"chat", "fim", "small"} + # id is derived from first alias (alphabetical order from std::set) + assert model["id"] == "code" diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index f76bb1a911..5002999d9b 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -56,6 +56,7 @@ class ServerProcess: # custom options model_alias: str | None = None + model_tags: str | None = None model_url: str | None = None model_file: str | None = None model_draft: str | None = None @@ -180,6 +181,8 @@ class ServerProcess: server_args.extend(["--pooling", self.pooling]) if self.model_alias: server_args.extend(["--alias", self.model_alias]) + if self.model_tags: + server_args.extend(["--tags", self.model_tags]) if self.n_ctx: server_args.extend(["--ctx-size", self.n_ctx]) if self.n_slots: