From 6ce3d857962e8183f1fbd7d5aeacf7534a427dc3 Mon Sep 17 00:00:00 2001 From: Pascal Date: Wed, 17 Dec 2025 21:45:45 +0100 Subject: [PATCH] server: (webui) add --webui-config (#18028) * server/webui: add server-side WebUI config support Add CLI arguments --webui-config (inline JSON) and --webui-config-file (file path) to configure WebUI default settings from server side. Backend changes: - Parse JSON once in server_context::load_model() for performance - Cache parsed config in webui_settings member (zero overhead on /props) - Add proper error handling in router mode with try/catch - Expose webui_settings in /props endpoint for both router and child modes Frontend changes: - Add 14 configurable WebUI settings via parameter sync - Add tests for webui settings extraction - Fix subpath support with base path in API calls Addresses feedback from @ngxson and @ggerganov * server: address review feedback from ngxson * server: regenerate README with llama-gen-docs --- common/arg.cpp | 14 +++ common/common.h | 5 +- tools/server/README.md | 19 ++-- tools/server/server-context.cpp | 14 ++- tools/server/server-models.cpp | 1 + tools/server/server-models.h | 10 +++ tools/server/server.cpp | 8 +- .../app/server/ServerErrorSplash.svelte | 3 +- .../src/lib/services/parameter-sync.spec.ts | 14 +++ .../webui/src/lib/services/parameter-sync.ts | 88 ++++++++++++++++--- .../webui/src/lib/stores/server.svelte.ts | 4 + .../webui/src/lib/stores/settings.svelte.ts | 3 +- tools/server/webui/src/lib/types/api.d.ts | 1 + .../webui/src/lib/utils/api-key-validation.ts | 3 +- tools/server/webui/src/routes/+layout.svelte | 3 +- 15 files changed, 163 insertions(+), 27 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 4901a120df..b6d16168eb 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2610,6 +2610,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.api_prefix = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); + add_opt(common_arg( + {"--webui-config"}, "JSON", + "JSON that provides default WebUI settings (overrides WebUI defaults)", + [](common_params & params, const std::string & value) { + params.webui_config_json = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG")); + add_opt(common_arg( + {"--webui-config-file"}, "PATH", + "JSON file that provides default WebUI settings (overrides WebUI defaults)", + [](common_params & params, const std::string & value) { + params.webui_config_json = read_file(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE")); add_opt(common_arg( {"--webui"}, {"--no-webui"}, diff --git a/common/common.h b/common/common.h index d70744840f..3e314f4c80 100644 --- a/common/common.h +++ b/common/common.h @@ -484,8 +484,11 @@ struct common_params { std::map default_template_kwargs; + // webui configs + bool webui = true; + std::string webui_config_json; + // "advanced" endpoints are disabled by default for better security - bool webui = true; bool endpoint_slots = true; bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; diff --git a/tools/server/README.md b/tools/server/README.md index 9a2b9b1f36..fd5a59e848 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -46,7 +46,7 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | | `--prio-batch N` | set 
process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
| | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | -| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | +| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity)
(env: LLAMA_ARG_N_PREDICT) | | `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | @@ -82,13 +82,16 @@ For the ful list of features, please refer to [server's changelog](https://githu | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs
(env: LLAMA_ARG_SPLIT_MODE) | | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1
(env: LLAMA_ARG_TENSOR_SPLIT) | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | +| `-fit, --fit [on\|off]` | whether to adjust unset arguments to fit in device memory ('on' or 'off', default: 'on')
(env: LLAMA_ARG_FIT) | +| `-fitt, --fit-target MiB` | target margin per device for --fit option, default: 1024
(env: LLAMA_ARG_FIT_TARGET) | +| `-fitc, --fit-ctx N` | minimum ctx size that can be set by --fit option, default: 4096
(env: LLAMA_ARG_FIT_CTX) | | `--check-tensors` | check model tensor data for invalid values (default: false) | -| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | +| `--override-kv KEY=TYPE:VALUE,...` | advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false | | `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) | -| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | -| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | -| `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | -| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | +| `--lora FNAME` | path to LoRA adapter (use comma-separated values to load multiple adapters) | +| `--lora-scaled FNAME:SCALE,...` | path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)
note: use comma-separated values | +| `--control-vector FNAME` | add a control vector
note: use comma-separated values to add multiple control vectors | +| `--control-vector-scaled FNAME:SCALE,...` | add a control vector with user defined scaling SCALE
note: use comma-separated values (format: FNAME:SCALE,...) | | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | | `-m, --model FNAME` | model path to load
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | @@ -120,7 +123,7 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--sampling-seq, --sampler-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | | `--temp N` | temperature (default: 0.8) | -| `--top-k N` | top-k sampling (default: 40, 0 = disabled) | +| `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) | | `--top-nsigma N` | top-n-sigma sampling (default: -1.0, -1.0 = disabled) | @@ -177,6 +180,8 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | +| `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG) | +| `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG_FILE) | | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 90898b5ec4..def57d0252 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -544,6 +544,8 @@ struct server_context_impl { server_metrics metrics; + json webui_settings = json::object(); + // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; @@ -575,6 +577,16 @@ struct server_context_impl { params_base = params; + webui_settings = json::object(); + if (!params_base.webui_config_json.empty()) { + try { + webui_settings = json::parse(params_base.webui_config_json); + } catch (const std::exception & e) { + SRV_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + return false; + } + } + llama_init = common_init_from_params(params_base); model = llama_init->model(); @@ -3103,7 +3115,6 @@ void server_routes::init_routes() { }; } - // this endpoint is publicly available, please only return what is safe to be exposed json data = { { "default_generation_settings", default_generation_settings_for_props }, { "total_slots", ctx_server.params_base.n_parallel }, @@ -3117,6 +3128,7 @@ void server_routes::init_routes() { { "endpoint_props", params.endpoint_props }, { "endpoint_metrics", params.endpoint_metrics }, { "webui", params.webui }, + { "webui_settings", ctx_server.webui_settings }, { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 5e25ec79e4..c1f86e5493 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -818,6 +818,7 @@ void server_models_routes::init_routes() { {"params", json{}}, {"n_ctx", 0}, }}, + {"webui_settings", webui_settings}, }); return res; } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 227b15bbc3..cbc4c43246 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -2,6 +2,7 @@ #include "common.h" #include "preset.h" +#include "server-common.h" #include "server-http.h" #include @@ -149,9 +150,18 @@ public: struct server_models_routes { common_params params; + json webui_settings = json::object(); server_models models; server_models_routes(const common_params & params, int argc, char ** argv, char ** envp) : params(params), models(params, argc, argv, envp) { + if (!this->params.webui_config_json.empty()) { + try { + webui_settings = json::parse(this->params.webui_config_json); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to parse webui config: %s\n", __func__, e.what()); + throw; + } + } init_routes(); } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 3cebe174b9..b6b611b3f4 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -8,6 +8,7 @@ #include "log.h" #include +#include #include #include // for std::thread::hardware_concurrency @@ -124,7 +125,12 @@ int main(int argc, char ** argv, char ** envp) { std::optional models_routes{}; if (is_router_server) { // setup server instances manager - models_routes.emplace(params, argc, argv, envp); + try { + models_routes.emplace(params, argc, argv, envp); + } catch (const std::exception & e) { + LOG_ERR("%s: failed to initialize router models: %s\n", __func__, e.what()); + 
return 1; + } // proxy handlers // note: routes.get_health stays the same diff --git a/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte b/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte index 39613f200c..fa4c2842cc 100644 --- a/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte +++ b/tools/server/webui/src/lib/components/app/server/ServerErrorSplash.svelte @@ -1,4 +1,5 @@
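
For readers skimming the diff, the server-side flow added by this patch is small: the raw JSON passed via `--webui-config` (or read from the file given to `--webui-config-file`, also settable through `LLAMA_ARG_WEBUI_CONFIG` / `LLAMA_ARG_WEBUI_CONFIG_FILE`) is parsed exactly once at startup — in `server_context::load_model()` for a normal server and in the `server_models_routes` constructor for router mode — an invalid document aborts startup, and the cached object is returned untouched under `webui_settings` in `/props`. The sketch below is a minimal standalone illustration of that pattern, assuming nlohmann/json is available as `<nlohmann/json.hpp>`; the config keys shown are hypothetical examples, not a documented WebUI settings schema.

```cpp
// Standalone sketch of the pattern used by this patch: parse the
// --webui-config JSON once at startup, fail fast on invalid input,
// and expose the cached object verbatim through /props.
//
// Assumptions (not taken from the patch): nlohmann/json is included as
// <nlohmann/json.hpp>, and the example keys ("theme", "temperature") are
// hypothetical WebUI settings, not part of any documented schema.
#include <cstdio>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Mirrors the load_model()-time parsing: called once, result cached.
static bool parse_webui_config(const std::string & raw, json & out) {
    out = json::object();
    if (raw.empty()) {
        return true; // no config given -> empty object, WebUI keeps its defaults
    }
    try {
        out = json::parse(raw);
    } catch (const std::exception & e) {
        fprintf(stderr, "failed to parse webui config: %s\n", e.what());
        return false; // server refuses to start, matching the patch's behaviour
    }
    return true;
}

int main() {
    // What a user might pass via --webui-config (keys are illustrative only).
    const std::string cli_value = R"({"theme": "dark", "temperature": 0.7})";

    json webui_settings;
    if (!parse_webui_config(cli_value, webui_settings)) {
        return 1;
    }

    // The cached object is simply embedded in the /props response,
    // so the frontend can pick up the defaults without an extra endpoint.
    json props = {
        { "webui",          true },
        { "webui_settings", webui_settings },
    };
    printf("%s\n", props.dump(2).c_str());
    return 0;
}
```

Because the parse happens once and the result is stored on the context (or on `server_models_routes` in router mode), serving `/props` adds no per-request JSON parsing; the webui then treats whatever arrives in `webui_settings` as its new defaults via the parameter-sync layer.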