diff --git a/common/arg.cpp b/common/arg.cpp
index 062046c0d0..0da57ab1e6 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2488,6 +2488,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.models_max = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
+ add_opt(common_arg(
+ {"--models-allow-extra-args"},
+ string_format("for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: %s)", params.models_allow_extra_args ? "enabled" : "disabled"),
+ [](common_params & params) {
+ params.models_allow_extra_args = true;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS"));
add_opt(common_arg(
{"--no-models-autoload"},
"disables automatic loading of models (default: enabled)",
diff --git a/common/common.h b/common/common.h
index 4ac9700d7b..ff42cc6584 100644
--- a/common/common.h
+++ b/common/common.h
@@ -462,6 +462,7 @@ struct common_params {
std::string models_dir = ""; // directory containing models for the router server
int models_max = 4; // maximum number of models to load simultaneously
bool models_autoload = true; // automatically load models when requested via the router server
+ bool models_allow_extra_args = false; // allow passing extra arguments when loading models via the router server
bool log_json = false;
diff --git a/tools/server/README.md b/tools/server/README.md
index 24984d8696..6b911b635a 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -197,6 +197,7 @@ The project is under active development, and we are [looking for feedback and co
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
+| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)<br/>(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) |
| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) |
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
@@ -1495,8 +1496,8 @@ The `status` object can be:
Load a model
Payload:
-- `model`: name of the model to be loaded
-- `extra_args`: (optional) an array of additional arguments to be passed to the model instance
+- `model`: name of the model to be loaded.
+- `extra_args`: (optional) an array of additional arguments to be passed to the model instance. Note: you must start the server with `--models-allow-extra-args` to enable this feature.
```json
{
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index 6ab0a9c226..cb19d2c341 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -383,8 +383,10 @@ void server_models::load(const std::string & name, const std::vector<std::string> & extra_args,
    std::vector<std::string> extra_args = json_value(body, "extra_args", std::vector<std::string>());
+    if (!params.models_allow_extra_args && !extra_args.empty()) {
+        res->error(format_error_response("extra_args is not allowed", ERROR_TYPE_INVALID_REQUEST));
+        return res;
+    }
auto model = models->get_meta(name);
if (!model.has_value()) {
res->error(format_error_response("model is not found", ERROR_TYPE_NOT_FOUND));
@@ -5862,6 +5866,9 @@ int main(int argc, char ** argv, char ** envp) {
LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str());
LOG_INF("%s: NOTE: router mode is experimental\n", __func__);
LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__);
+ if (params.models_allow_extra_args) {
+ LOG_WRN("%s: extra_args is enabled; this may lead to security issues if the server is exposed to untrusted clients\n", __func__);
+ }
ctx_http.is_ready.store(true);
if (ctx_http.thread.joinable()) {
ctx_http.thread.join(); // keep the main thread alive