From 6ed192b4dd2b42635008385ce35910c46bb37203 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 24 Nov 2025 12:01:16 +0100 Subject: [PATCH] add --models-allow-extra-args for security --- common/arg.cpp | 7 +++++++ common/common.h | 1 + tools/server/README.md | 5 +++-- tools/server/server-models.cpp | 6 ++++-- tools/server/server.cpp | 7 +++++++ 5 files changed, 22 insertions(+), 4 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 062046c0d0..0da57ab1e6 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2488,6 +2488,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.models_max = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); + add_opt(common_arg( + {"--models-allow-extra-args"}, + string_format("for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: %s)", params.models_allow_extra_args ? "enabled" : "disabled"), + [](common_params & params) { + params.models_allow_extra_args = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS")); add_opt(common_arg( {"--no-models-autoload"}, "disables automatic loading of models (default: enabled)", diff --git a/common/common.h b/common/common.h index 4ac9700d7b..ff42cc6584 100644 --- a/common/common.h +++ b/common/common.h @@ -462,6 +462,7 @@ struct common_params { std::string models_dir = ""; // directory containing models for the router server int models_max = 4; // maximum number of models to load simultaneously bool models_autoload = true; // automatically load models when requested via the router server + bool models_allow_extra_args = false; // allow passing extra arguments when loading models via the router server bool log_json = false; diff --git a/tools/server/README.md b/tools/server/README.md index 24984d8696..6b911b635a 100644 --- a/tools/server/README.md +++ 
b/tools/server/README.md @@ -197,6 +197,7 @@ The project is under active development, and we are [looking for feedback and co | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | | `--models-dir PATH` | directory containing models for the router server (default: disabled)
(env: LLAMA_ARG_MODELS_DIR) | | `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)
(env: LLAMA_ARG_MODELS_MAX) | +| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)
(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) | | `--no-models-autoload` | disables automatic loading of models (default: enabled)
(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) | | `--jinja` | use jinja template for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | @@ -1495,8 +1496,8 @@ The `status` object can be: Load a model Payload: -- `model`: name of the model to be loaded -- `extra_args`: (optional) an array of additional arguments to be passed to the model instance +- `model`: name of the model to be loaded. +- `extra_args`: (optional) an array of additional arguments to be passed to the model instance. Note: you must start the server with `--models-allow-extra-args` to enable this feature. ```json { diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 6ab0a9c226..cb19d2c341 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -383,8 +383,10 @@ void server_models::load(const std::string & name, const std::vector extra_args = json_value(body, "extra_args", std::vector()); + if (!params.models_allow_extra_args && !extra_args.empty()) { + res->error(format_error_response("extra_args is not allowed", ERROR_TYPE_INVALID_REQUEST)); + return res; + } auto model = models->get_meta(name); if (!model.has_value()) { res->error(format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); @@ -5862,6 +5866,9 @@ int main(int argc, char ** argv, char ** envp) { LOG_INF("%s: router server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); LOG_INF("%s: NOTE: router mode is experimental\n", __func__); LOG_INF("%s: it is not recommended to use this mode in untrusted environments\n", __func__); + if (params.models_allow_extra_args) { + LOG_WRN("%s: extra_args is enabled; this may lead to security issues if the server is exposed to untrusted clients\n", __func__); + } ctx_http.is_ready.store(true); if (ctx_http.thread.joinable()) { ctx_http.thread.join(); // keep the main thread alive