server : support multiple model aliases via comma-separated --alias (#19926)
* server : support multiple model aliases via comma-separated --alias
* server : update --alias description and regenerate docs
* server : multiple model aliases and tags
  - address review feedback from ngxson
  - --alias accepts comma-separated values (std::set, no duplicates)
  - --tags for informational metadata (not used for routing)
  - aliases resolve transparently in router via get_meta/has_model
  - /v1/models exposes aliases and tags fields
* regenerate docs
* nits
* server : use first alias as model_name for backward compat (address review feedback from ngxson)
* server : add single-model test for aliases and tags
This commit is contained in:
parent
a8b192b6ec
commit
2e7e638523
|
|
@ -2520,11 +2520,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
));
|
||||
add_opt(common_arg(
|
||||
{"-a", "--alias"}, "STRING",
|
||||
"set alias for model name (to be used by REST API)",
|
||||
"set model name aliases, comma-separated (to be used by API)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model_alias = value;
|
||||
for (auto & alias : string_split<std::string>(value, ',')) {
|
||||
alias = string_strip(alias);
|
||||
if (!alias.empty()) {
|
||||
params.model_alias.insert(alias);
|
||||
}
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS"));
|
||||
add_opt(common_arg(
|
||||
{"--tags"}, "STRING",
|
||||
"set model tags, comma-separated (informational, not used for routing)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
for (auto & tag : string_split<std::string>(value, ',')) {
|
||||
tag = string_strip(tag);
|
||||
if (!tag.empty()) {
|
||||
params.model_tags.insert(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS"));
|
||||
add_opt(common_arg(
|
||||
{"-m", "--model"}, "FNAME",
|
||||
ex == LLAMA_EXAMPLE_EXPORT_LORA
|
||||
|
|
|
|||
|
|
@ -410,7 +410,8 @@ struct common_params {
|
|||
|
||||
struct common_params_model model;
|
||||
|
||||
std::string model_alias = ""; // model alias // NOLINT
|
||||
std::set<std::string> model_alias; // model aliases // NOLINT
|
||||
std::set<std::string> model_tags; // model tags (informational, not used for routing) // NOLINT
|
||||
std::string hf_token = ""; // HF token // NOLINT
|
||||
std::string prompt = ""; // NOLINT
|
||||
std::string system_prompt = ""; // NOLINT
|
||||
|
|
|
|||
|
|
@ -57,8 +57,8 @@
|
|||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||
| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
|
||||
| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
|
||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
||||
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
|
||||
| `--list-devices` | print list of available devices and exit |
|
||||
|
|
@ -109,14 +109,14 @@
|
|||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
||||
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--temp N` | temperature (default: 0.80) |
|
||||
| `--temp, --temperature N` | temperature (default: 0.80) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
|
||||
| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
|
||||
| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
|
||||
| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
|
||||
| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
|
||||
| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
|
||||
| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
|
||||
| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
|
||||
| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
|
||||
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
|
||||
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
|
||||
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
|
||||
|
|
|
|||
|
|
@ -140,8 +140,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
|||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||
| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
|
||||
| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
|
||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
||||
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
|
||||
| `--list-devices` | print list of available devices and exit |
|
||||
|
|
@ -192,14 +192,14 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1
|
|||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
||||
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--temp N` | temperature (default: 0.80) |
|
||||
| `--temp, --temperature N` | temperature (default: 0.80) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
|
||||
| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
|
||||
| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
|
||||
| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
|
||||
| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
|
||||
| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
|
||||
| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
|
||||
| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
|
||||
| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
|
||||
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
|
||||
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
|
||||
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
|
||||
|
|
|
|||
|
|
@ -74,8 +74,8 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||
| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)<br/>(env: LLAMA_ARG_DIO) |
|
||||
| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
|
||||
| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)<br/>(env: LLAMA_ARG_DIO) |
|
||||
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
|
||||
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
|
||||
| `--list-devices` | print list of available devices and exit |
|
||||
|
|
@ -126,14 +126,14 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
||||
| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
|
||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||
| `--temp N` | temperature (default: 0.80) |
|
||||
| `--temp, --temperature N` | temperature (default: 0.80) |
|
||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
|
||||
| `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) |
|
||||
| `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) |
|
||||
| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
|
||||
| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) |
|
||||
| `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) |
|
||||
| `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) |
|
||||
| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
|
||||
| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) |
|
||||
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
|
||||
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) |
|
||||
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) |
|
||||
|
|
@ -162,9 +162,11 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||
|
||||
| Argument | Explanation |
|
||||
| -------- | ----------- |
|
||||
| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
|
||||
| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
|
||||
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
|
||||
| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
|
||||
| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
|
||||
| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
|
||||
| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
|
||||
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode |
|
||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||
|
|
@ -182,7 +184,8 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||
| `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
|
||||
| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
|
||||
| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
|
||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||
| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||
| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
|
||||
| `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
|
||||
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
||||
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
||||
|
|
@ -229,6 +232,10 @@ For the full list of features, please refer to [server's changelog](https://gith
|
|||
| `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)<br/>(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) |
|
||||
| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)<br/>(env: LLAMA_ARG_MODEL_DRAFT) |
|
||||
| `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible |
|
||||
| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) |
|
||||
| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) |
|
||||
| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) |
|
||||
| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) |
|
||||
| `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) |
|
||||
| `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall |
|
||||
| `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) |
|
||||
|
|
|
|||
|
|
@ -580,6 +580,8 @@ private:
|
|||
float slot_prompt_similarity = 0.0f;
|
||||
|
||||
std::string model_name; // name of the loaded model, to be used by API
|
||||
std::set<std::string> model_aliases; // additional names for the model
|
||||
std::set<std::string> model_tags; // informational tags
|
||||
|
||||
bool sleeping = false;
|
||||
|
||||
|
|
@ -813,10 +815,9 @@ private:
|
|||
SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n");
|
||||
|
||||
if (!params_base.model_alias.empty()) {
|
||||
// user explicitly specified model name
|
||||
model_name = params_base.model_alias;
|
||||
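// backward compat: use the first alias as model name; model_alias is a std::set, so "first" means the lexicographically smallest alias, not necessarily the first one given on the command line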
|
||||
model_name = *params_base.model_alias.begin();
|
||||
} else if (!params_base.model.name.empty()) {
|
||||
// use model name in registry format (for models in cache)
|
||||
model_name = params_base.model.name;
|
||||
} else {
|
||||
// fallback: derive model name from file name
|
||||
|
|
@ -824,6 +825,9 @@ private:
|
|||
model_name = model_path.filename().string();
|
||||
}
|
||||
|
||||
model_aliases = params_base.model_alias;
|
||||
model_tags = params_base.model_tags;
|
||||
|
||||
if (!is_resume) {
|
||||
return init();
|
||||
}
|
||||
|
|
@ -2892,6 +2896,8 @@ server_context_meta server_context::get_meta() const {
|
|||
return server_context_meta {
|
||||
/* build_info */ build_info,
|
||||
/* model_name */ impl->model_name,
|
||||
/* model_aliases */ impl->model_aliases,
|
||||
/* model_tags */ impl->model_tags,
|
||||
/* model_path */ impl->params_base.model.path,
|
||||
/* has_mtmd */ impl->mctx != nullptr,
|
||||
/* has_inp_image */ impl->chat_params.allow_image,
|
||||
|
|
@ -3688,6 +3694,8 @@ void server_routes::init_routes() {
|
|||
{"data", {
|
||||
{
|
||||
{"id", meta->model_name},
|
||||
{"aliases", meta->model_aliases},
|
||||
{"tags", meta->model_tags},
|
||||
{"object", "model"},
|
||||
{"created", std::time(0)},
|
||||
{"owned_by", "llamacpp"},
|
||||
|
|
|
|||
|
|
@ -6,12 +6,15 @@
|
|||
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
|
||||
struct server_context_impl; // private implementation
|
||||
|
||||
struct server_context_meta {
|
||||
std::string build_info;
|
||||
std::string model_name;
|
||||
std::set<std::string> model_aliases;
|
||||
std::set<std::string> model_tags;
|
||||
std::string model_path;
|
||||
bool has_mtmd;
|
||||
bool has_inp_image;
|
||||
|
|
|
|||
|
|
@ -184,6 +184,51 @@ void server_models::add_model(server_model_meta && meta) {
|
|||
if (mapping.find(meta.name) != mapping.end()) {
|
||||
throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str()));
|
||||
}
|
||||
|
||||
// check model name does not conflict with existing aliases
|
||||
for (const auto & [key, inst] : mapping) {
|
||||
if (inst.meta.aliases.count(meta.name)) {
|
||||
throw std::runtime_error(string_format("model name '%s' conflicts with alias of model '%s'",
|
||||
meta.name.c_str(), key.c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
// parse aliases from preset's --alias option (comma-separated)
|
||||
std::string alias_str;
|
||||
if (meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) {
|
||||
for (auto & alias : string_split<std::string>(alias_str, ',')) {
|
||||
alias = string_strip(alias);
|
||||
if (!alias.empty()) {
|
||||
meta.aliases.insert(alias);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parse tags from preset's --tags option (comma-separated)
|
||||
std::string tags_str;
|
||||
if (meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) {
|
||||
for (auto & tag : string_split<std::string>(tags_str, ',')) {
|
||||
tag = string_strip(tag);
|
||||
if (!tag.empty()) {
|
||||
meta.tags.insert(tag);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// validate aliases do not conflict with existing names or aliases
|
||||
for (const auto & alias : meta.aliases) {
|
||||
if (mapping.find(alias) != mapping.end()) {
|
||||
throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with existing model name",
|
||||
alias.c_str(), meta.name.c_str()));
|
||||
}
|
||||
for (const auto & [key, inst] : mapping) {
|
||||
if (inst.meta.aliases.count(alias)) {
|
||||
throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with alias of model '%s'",
|
||||
alias.c_str(), meta.name.c_str(), key.c_str()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
meta.update_args(ctx_preset, bin_path); // render args
|
||||
std::string name = meta.name;
|
||||
mapping[name] = instance_t{
|
||||
|
|
@ -249,6 +294,8 @@ void server_models::load_models() {
|
|||
server_model_meta meta{
|
||||
/* preset */ preset.second,
|
||||
/* name */ preset.first,
|
||||
/* aliases */ {},
|
||||
/* tags */ {},
|
||||
/* port */ 0,
|
||||
/* status */ SERVER_MODEL_STATUS_UNLOADED,
|
||||
/* last_used */ 0,
|
||||
|
|
@ -265,10 +312,28 @@ void server_models::load_models() {
|
|||
for (const auto & [name, preset] : custom_presets) {
|
||||
custom_names.insert(name);
|
||||
}
|
||||
auto join_set = [](const std::set<std::string> & s) {
|
||||
std::string result;
|
||||
for (const auto & v : s) {
|
||||
if (!result.empty()) {
|
||||
result += ", ";
|
||||
}
|
||||
result += v;
|
||||
}
|
||||
return result;
|
||||
};
|
||||
|
||||
SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size());
|
||||
for (const auto & [name, inst] : mapping) {
|
||||
bool has_custom = custom_names.find(name) != custom_names.end();
|
||||
SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str());
|
||||
std::string info;
|
||||
if (!inst.meta.aliases.empty()) {
|
||||
info += " (aliases: " + join_set(inst.meta.aliases) + ")";
|
||||
}
|
||||
if (!inst.meta.tags.empty()) {
|
||||
info += " [tags: " + join_set(inst.meta.tags) + "]";
|
||||
}
|
||||
SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -320,7 +385,15 @@ void server_models::update_meta(const std::string & name, const server_model_met
|
|||
|
||||
bool server_models::has_model(const std::string & name) {
|
||||
std::lock_guard<std::mutex> lk(mutex);
|
||||
return mapping.find(name) != mapping.end();
|
||||
if (mapping.find(name) != mapping.end()) {
|
||||
return true;
|
||||
}
|
||||
for (const auto & [key, inst] : mapping) {
|
||||
if (inst.meta.aliases.count(name)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::optional<server_model_meta> server_models::get_meta(const std::string & name) {
|
||||
|
|
@ -329,6 +402,11 @@ std::optional<server_model_meta> server_models::get_meta(const std::string & nam
|
|||
if (it != mapping.end()) {
|
||||
return it->second.meta;
|
||||
}
|
||||
for (const auto & [key, inst] : mapping) {
|
||||
if (inst.meta.aliases.count(name)) {
|
||||
return inst.meta;
|
||||
}
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
|
|
@ -766,7 +844,7 @@ static void res_err(std::unique_ptr<server_http_res> & res, const json & error_d
|
|||
res->data = safe_json_to_str({{ "error", error_data }});
|
||||
}
|
||||
|
||||
static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
|
||||
static bool router_validate_model(std::string & name, server_models & models, bool models_autoload, std::unique_ptr<server_http_res> & res) {
|
||||
if (name.empty()) {
|
||||
res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST));
|
||||
return false;
|
||||
|
|
@ -776,6 +854,8 @@ static bool router_validate_model(const std::string & name, server_models & mode
|
|||
res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST));
|
||||
return false;
|
||||
}
|
||||
// resolve alias to canonical model name
|
||||
name = meta->name;
|
||||
if (models_autoload) {
|
||||
models.ensure_model_loaded(name);
|
||||
} else {
|
||||
|
|
@ -847,16 +927,16 @@ void server_models_routes::init_routes() {
|
|||
auto res = std::make_unique<server_http_res>();
|
||||
json body = json::parse(req.body);
|
||||
std::string name = json_value(body, "model", std::string());
|
||||
auto model = models.get_meta(name);
|
||||
if (!model.has_value()) {
|
||||
auto meta = models.get_meta(name);
|
||||
if (!meta.has_value()) {
|
||||
res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
|
||||
return res;
|
||||
}
|
||||
if (model->status == SERVER_MODEL_STATUS_LOADED) {
|
||||
if (meta->status == SERVER_MODEL_STATUS_LOADED) {
|
||||
res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
}
|
||||
models.load(name);
|
||||
models.load(meta->name);
|
||||
res_ok(res, {{"success", true}});
|
||||
return res;
|
||||
};
|
||||
|
|
@ -877,6 +957,7 @@ void server_models_routes::init_routes() {
|
|||
preset_copy.unset_option("LLAMA_ARG_HOST");
|
||||
preset_copy.unset_option("LLAMA_ARG_PORT");
|
||||
preset_copy.unset_option("LLAMA_ARG_ALIAS");
|
||||
preset_copy.unset_option("LLAMA_ARG_TAGS");
|
||||
status["preset"] = preset_copy.to_ini();
|
||||
}
|
||||
if (meta.is_failed()) {
|
||||
|
|
@ -885,6 +966,8 @@ void server_models_routes::init_routes() {
|
|||
}
|
||||
models_json.push_back(json {
|
||||
{"id", meta.name},
|
||||
{"aliases", meta.aliases},
|
||||
{"tags", meta.tags},
|
||||
{"object", "model"}, // for OAI-compat
|
||||
{"owned_by", "llamacpp"}, // for OAI-compat
|
||||
{"created", t}, // for OAI-compat
|
||||
|
|
@ -912,7 +995,7 @@ void server_models_routes::init_routes() {
|
|||
res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST));
|
||||
return res;
|
||||
}
|
||||
models.unload(name);
|
||||
models.unload(model->name);
|
||||
res_ok(res, {{"success", true}});
|
||||
return res;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -52,6 +52,8 @@ static std::string server_model_status_to_string(server_model_status status) {
|
|||
struct server_model_meta {
|
||||
common_preset preset;
|
||||
std::string name;
|
||||
std::set<std::string> aliases; // additional names that resolve to this model
|
||||
std::set<std::string> tags; // informational tags, not used for routing
|
||||
int port = 0;
|
||||
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
|
||||
int64_t last_used = 0; // for LRU unloading
|
||||
|
|
|
|||
|
|
@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// for consistency between server router mode and single-model mode, we set the same model name as alias
|
||||
if (params.model_alias.empty() && !params.model.name.empty()) {
|
||||
params.model_alias = params.model.name;
|
||||
params.model_alias.insert(params.model.name);
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
|
|
|||
|
|
@ -94,3 +94,20 @@ def test_no_webui():
|
|||
server.start()
|
||||
res = requests.get(url)
|
||||
assert res.status_code == 404
|
||||
|
||||
|
||||
def test_server_model_aliases_and_tags():
|
||||
global server
|
||||
server.model_alias = "tinyllama-2,fim,code"
|
||||
server.model_tags = "chat,fim,small"
|
||||
server.start()
|
||||
res = server.make_request("GET", "/models")
|
||||
assert res.status_code == 200
|
||||
assert len(res.body["data"]) == 1
|
||||
model = res.body["data"][0]
|
||||
# aliases field must contain all aliases
|
||||
assert set(model["aliases"]) == {"tinyllama-2", "fim", "code"}
|
||||
# tags field must contain all tags
|
||||
assert set(model["tags"]) == {"chat", "fim", "small"}
|
||||
# id is derived from first alias (alphabetical order from std::set)
|
||||
assert model["id"] == "code"
|
||||
|
|
|
|||
|
|
@ -56,6 +56,7 @@ class ServerProcess:
|
|||
|
||||
# custom options
|
||||
model_alias: str | None = None
|
||||
model_tags: str | None = None
|
||||
model_url: str | None = None
|
||||
model_file: str | None = None
|
||||
model_draft: str | None = None
|
||||
|
|
@ -180,6 +181,8 @@ class ServerProcess:
|
|||
server_args.extend(["--pooling", self.pooling])
|
||||
if self.model_alias:
|
||||
server_args.extend(["--alias", self.model_alias])
|
||||
if self.model_tags:
|
||||
server_args.extend(["--tags", self.model_tags])
|
||||
if self.n_ctx:
|
||||
server_args.extend(["--ctx-size", self.n_ctx])
|
||||
if self.n_slots:
|
||||
|
|
|
|||
Loading…
Reference in New Issue