From 6df686bee68ff109f62123c7a8eac003f3dd9e20 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 19 Jan 2026 23:28:01 +0100 Subject: [PATCH] server : refactor oai_parser_opt, move it to server_chat_params (#18937) * server_chat_params * move chat format into CLI * use meta whenever possible * clean up, no more chatml fallback --- common/chat.cpp | 14 ++-- common/chat.h | 2 +- tools/cli/cli.cpp | 32 ++++++-- tools/server/server-common.cpp | 4 +- tools/server/server-common.h | 14 ++-- tools/server/server-context.cpp | 138 ++++++++++++++------------------ tools/server/server-context.h | 5 +- tools/server/server-task.h | 6 +- 8 files changed, 112 insertions(+), 103 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 28721ac7da..b29544dac0 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -601,18 +601,18 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp return tmpls->has_explicit_template; } -const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) { - if (variant != nullptr) { - if (strcmp(variant, "tool_use") == 0) { +std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) { + if (!variant.empty()) { + if (variant == "tool_use") { if (tmpls->template_tool_use) { - return tmpls->template_tool_use->source().c_str(); + return tmpls->template_tool_use->source(); } - return nullptr; + return ""; } else { - LOG_DBG("%s: unknown template variant: %s\n", __func__, variant); + LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str()); } } - return tmpls->template_default->source().c_str(); + return tmpls->template_default->source(); } common_chat_templates_ptr common_chat_templates_init( diff --git a/common/chat.h b/common/chat.h index 454085e90e..1488017382 100644 --- a/common/chat.h +++ b/common/chat.h @@ -191,7 +191,7 @@ common_chat_templates_ptr common_chat_templates_init( const std::string & eos_token_override = ""); bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls); -const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr); +std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = ""); struct common_chat_params common_chat_templates_apply( diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 2f0ffea1c2..caad29bac7 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -71,14 +71,16 @@ struct cli_context { std::string generate_completion(result_timings & out_timings) { server_response_reader rd = ctx_server.get_response_reader(); + auto formatted = format_chat(); { // TODO: reduce some copies here in the future server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); - task.id = rd.get_new_id(); - task.index = 0; - task.params = defaults; // copy - task.cli_input = messages; // copy - task.cli_files = input_files; // copy + task.id = rd.get_new_id(); + task.index = 0; + task.params = defaults; // copy + task.cli_prompt = formatted.prompt; // copy + task.cli_files = input_files; // copy + task.cli = true; rd.post_task({std::move(task)}); } @@ -156,6 +158,26 @@ struct cli_context { return content; } } + + common_chat_params format_chat() { + auto meta = ctx_server.get_meta(); + auto & chat_params = meta.chat_params; + + common_chat_templates_inputs inputs; + inputs.messages = common_chat_msgs_parse_oaicompat(messages); + inputs.tools = {}; // TODO + 
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; + inputs.json_schema = ""; // TODO + inputs.grammar = ""; // TODO + inputs.use_jinja = chat_params.use_jinja; + inputs.parallel_tool_calls = false; + inputs.add_generation_prompt = true; + inputs.reasoning_format = chat_params.reasoning_format; + inputs.enable_thinking = chat_params.enable_thinking; + + // Apply chat template to the list of messages + return common_chat_templates_apply(chat_params.tmpls.get(), inputs); + } }; int main(int argc, char ** argv) { diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 16b0db2983..1bbe85322a 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -831,7 +831,7 @@ static void handle_media( // used by /chat/completions endpoint json oaicompat_chat_params_parse( json & body, /* openai api json semantics */ - const oaicompat_parser_options & opt, + const server_chat_params & opt, std::vector & out_files) { json llama_params; @@ -1012,7 +1012,7 @@ json oaicompat_chat_params_parse( } // Apply chat template to the list of messages - auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); + auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs); /* Append assistant prefilled message */ if (prefill_assistant_message) { diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 152a2a3c46..7f4c073874 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -274,25 +274,25 @@ std::vector tokenize_input_prompts( // OAI utils // -// used by /completions endpoint -json oaicompat_completion_params_parse(const json & body); - -struct oaicompat_parser_options { +struct server_chat_params { bool use_jinja; bool prefill_assistant; common_reasoning_format reasoning_format; - std::map chat_template_kwargs; - common_chat_templates * tmpls; + std::map chat_template_kwargs; // mapping key --> json value + common_chat_templates_ptr tmpls; bool allow_image; bool allow_audio; bool enable_thinking = true; std::string media_path; }; +// used by /completions endpoint +json oaicompat_completion_params_parse(const json & body); + // used by /chat/completions endpoint json oaicompat_chat_params_parse( json & body, /* openai api json semantics */ - const oaicompat_parser_options & opt, + const server_chat_params & opt, std::vector & out_files); // convert Anthropic Messages API format to OpenAI Chat Completions API format diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index c790ac79e1..f1f677addd 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -534,8 +534,8 @@ public: server_queue queue_tasks; server_response queue_results; - common_chat_templates_ptr chat_templates; - oaicompat_parser_options oai_parser_opt; + // note: chat_params must not be refreshed upon existing sleeping state + server_chat_params chat_params; ~server_context_impl() { if (!sleeping) { @@ -688,15 +688,6 @@ private: llama_init_dft->free_context(); } - chat_templates = common_chat_templates_init(model, params_base.chat_template); - try { - common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs); - } catch (const std::exception & e) { - SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what()); - SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. 
This may cause the model to output suboptimal responses\n", __func__); - chat_templates = common_chat_templates_init(model, "chatml"); - } - std::string & mmproj_path = params_base.mmproj.path; if (!mmproj_path.empty()) { if (!is_resume) { @@ -845,30 +836,6 @@ private: model_name = model_path.filename().string(); } - // thinking is enabled if: - // 1. It's not explicitly disabled (reasoning_budget == 0) - // 2. The chat template supports it - const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); - SRV_INF("thinking = %d\n", enable_thinking); - - oai_parser_opt = { - /* use_jinja */ params_base.use_jinja, - /* prefill_assistant */ params_base.prefill_assistant, - /* reasoning_format */ params_base.reasoning_format, - /* chat_template_kwargs */ params_base.default_template_kwargs, - /* common_chat_templates */ chat_templates.get(), - /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, - /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false, - /* enable_thinking */ enable_thinking, - /* media_path */ params_base.media_path, - }; - - // print sample chat example to make it clear which template is used - // @ngxson modern templates are too long, spam the logs; printing the example is enough - LOG_INF("%s: chat template, example_format: '%s'\n", __func__, - // common_chat_templates_source(chat_templates.get()), - common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); - if (!is_resume) { return init(); } @@ -907,6 +874,42 @@ private: } } + // populate chat template params + { + common_chat_templates_ptr chat_templates; + + try { + chat_templates = common_chat_templates_init(model, params_base.chat_template); + + LOG_INF("%s: chat template, example_format: '%s'\n", __func__, + common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); + + } catch (const std::exception & e) { + SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what()); + SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__); + SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__); + return false; + } + + // thinking is enabled if: + // 1. It's not explicitly disabled (reasoning_budget == 0) + // 2. The chat template supports it + const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); + SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking); + + chat_params = { + /* use_jinja */ params_base.use_jinja, + /* prefill_assistant */ params_base.prefill_assistant, + /* reasoning_format */ params_base.reasoning_format, + /* chat_template_kwargs */ params_base.default_template_kwargs, + /* tmpls */ std::move(chat_templates), + /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, + /* allow_audio */ mctx ? 
mtmd_support_audio (mctx) : false, + /* enable_thinking */ enable_thinking, + /* media_path */ params_base.media_path, + }; + } + return true; } @@ -1588,32 +1591,14 @@ private: // tokenize the input if it's set by CLI, return false on error bool tokenize_cli_input(server_task & task) { - GGML_ASSERT(task.cli_input != nullptr); try { - auto & opt = oai_parser_opt; - common_chat_templates_inputs inputs; - inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input); - inputs.tools = {}; // TODO - inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; - inputs.json_schema = ""; // TODO - inputs.grammar = ""; // TODO - inputs.use_jinja = opt.use_jinja; - inputs.parallel_tool_calls = false; - inputs.add_generation_prompt = true; - inputs.reasoning_format = opt.reasoning_format; - inputs.enable_thinking = opt.enable_thinking; - - // Apply chat template to the list of messages - auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); - - // tokenize the resulting prompt - auto & prompt = chat_params.prompt; + auto & prompt = task.cli_prompt; if (mctx != nullptr) { task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files); } else { task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]); } - task.cli_input.clear(); + task.cli_prompt.clear(); task.cli_files.clear(); } catch (const std::exception & e) { send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST); @@ -1689,7 +1674,7 @@ private: { // special case: if input is provided via CLI, tokenize it first // otherwise, no need to tokenize as it's already done inside the HTTP thread - if (task.cli_input != nullptr) { + if (task.cli) { if (!tokenize_cli_input(task)) { break; } @@ -2901,8 +2886,6 @@ server_response_reader server_context::get_response_reader() { } server_context_meta server_context::get_meta() const { - auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use"); - auto bos_id = llama_vocab_bos(impl->vocab); auto eos_id = llama_vocab_eos(impl->vocab); auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : ""; @@ -2913,14 +2896,13 @@ server_context_meta server_context::get_meta() const { /* model_name */ impl->model_name, /* model_path */ impl->params_base.model.path, /* has_mtmd */ impl->mctx != nullptr, - /* has_inp_image */ impl->oai_parser_opt.allow_image, - /* has_inp_audio */ impl->oai_parser_opt.allow_audio, + /* has_inp_image */ impl->chat_params.allow_image, + /* has_inp_audio */ impl->chat_params.allow_audio, /* json_webui_settings */ impl->json_webui_settings, /* slot_n_ctx */ impl->get_slot_n_ctx(), /* pooling_type */ llama_pooling_type(impl->ctx), - /* chat_template */ common_chat_templates_source(impl->chat_templates.get()), - /* chat_template_tool_use */ tool_use_src ? 
tool_use_src : "", + /* chat_params */ impl->chat_params, /* bos_token_str */ bos_token_str, /* eos_token_str */ eos_token_str, @@ -3202,8 +3184,8 @@ void server_routes::init_routes() { // this endpoint can be accessed during sleeping // the next LOC is to avoid someone accidentally use ctx_server - bool server_ctx; // do NOT delete this line - GGML_UNUSED(server_ctx); + bool ctx_server; // do NOT delete this line + GGML_UNUSED(ctx_server); res->ok({{"status", "ok"}}); return res; @@ -3393,8 +3375,8 @@ void server_routes::init_routes() { // this endpoint can be accessed during sleeping // the next LOC is to avoid someone accidentally use ctx_server - bool server_ctx; // do NOT delete this line - GGML_UNUSED(server_ctx); + bool ctx_server; // do NOT delete this line + GGML_UNUSED(ctx_server); task_params tparams; tparams.sampling = params.sampling; @@ -3403,6 +3385,9 @@ void server_routes::init_routes() { { "n_ctx", meta->slot_n_ctx }, }; + std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), ""); + std::string tmpl_tools = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use"); + json props = { { "default_generation_settings", default_generation_settings_for_props }, { "total_slots", params.n_parallel }, @@ -3417,15 +3402,15 @@ void server_routes::init_routes() { { "endpoint_metrics", params.endpoint_metrics }, { "webui", params.webui }, { "webui_settings", meta->json_webui_settings }, - { "chat_template", meta->chat_template }, + { "chat_template", tmpl_default }, { "bos_token", meta->bos_token_str }, { "eos_token", meta->eos_token_str }, { "build_info", meta->build_info }, { "is_sleeping", queue_tasks.is_sleeping() }, }; if (params.use_jinja) { - if (!meta->chat_template_tool_use.empty()) { - props["chat_template_tool_use"] = meta->chat_template_tool_use; + if (!tmpl_tools.empty()) { + props["chat_template_tool_use"] = tmpl_tools; } } res->ok(props); @@ -3446,6 +3431,7 @@ void server_routes::init_routes() { this->get_api_show = [this](const server_http_req &) { auto res = create_response(); + std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), ""); json data = { { "model_info", { @@ -3454,7 +3440,7 @@ void server_routes::init_routes() { }, {"modelfile", ""}, {"parameters", ""}, - {"template", meta->chat_template}, + {"template", tmpl_default}, {"details", { {"parent_model", ""}, {"format", "gguf"}, @@ -3579,7 +3565,7 @@ void server_routes::init_routes() { json body = json::parse(req.body); json body_parsed = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); return handle_completions_impl( req, @@ -3595,7 +3581,7 @@ void server_routes::init_routes() { json body = convert_anthropic_to_oai(json::parse(req.body)); json body_parsed = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); return handle_completions_impl( req, @@ -3611,7 +3597,7 @@ void server_routes::init_routes() { json body = convert_anthropic_to_oai(json::parse(req.body)); json body_parsed = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); json prompt = body_parsed.at("prompt"); @@ -3627,7 +3613,7 @@ void server_routes::init_routes() { json body = json::parse(req.body); json data = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); res->ok({{ "prompt", std::move(data.at("prompt")) }}); return res; @@ -3638,8 +3624,8 @@ void server_routes::init_routes() { // this endpoint can 
be accessed during sleeping // the next LOC is to avoid someone accidentally use ctx_server - bool server_ctx; // do NOT delete this line - GGML_UNUSED(server_ctx); + bool ctx_server; // do NOT delete this line + GGML_UNUSED(ctx_server); json models = { {"models", { diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 09bec15ae1..ec1df96950 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -20,9 +20,8 @@ struct server_context_meta { int slot_n_ctx; enum llama_pooling_type pooling_type; - // chat template - std::string chat_template; - std::string chat_template_tool_use; + // chat params + server_chat_params & chat_params; // tokens std::string bos_token_str; diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 11943ee4f8..daffe0c904 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -130,8 +130,10 @@ struct server_task { task_params params; server_tokens tokens; - // only used by CLI, this delegates the tokenization to the server - json cli_input = nullptr; + // only used by CLI, this allow tokenizing CLI inputs on server side + // we need this because mtmd_context and vocab are not accessible outside of server_context + bool cli = false; + std::string cli_prompt; std::vector cli_files; server_task_type type;
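
For readers following the refactor, the sketch below (not part of the patch) condenses the new CLI-side flow from tools/cli/cli.cpp: the chat template is rendered on the client using the server's server_chat_params, and only the formatted prompt plus raw media buffers are handed to the server, which does the tokenization. The wrapper name post_chat_completion() is invented for illustration; error handling, streaming, timings, and the TODO fields (tools, json_schema, grammar) are omitted.

// Illustrative sketch only, assuming the cli_context members shown in the diff
// (ctx_server, messages, defaults, input_files); not a drop-in addition to the patch.
void cli_context::post_chat_completion() {
    // 1) fetch the server's chat parameters; the template is now owned by
    //    server_chat_params (common_chat_templates_ptr) instead of being a raw
    //    pointer inside the old oai_parser_opt
    auto   meta        = ctx_server.get_meta();
    auto & chat_params = meta.chat_params;

    // 2) render the chat template on the CLI side (mirrors format_chat() above)
    common_chat_templates_inputs inputs;
    inputs.messages              = common_chat_msgs_parse_oaicompat(messages);
    inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
    inputs.use_jinja             = chat_params.use_jinja;
    inputs.add_generation_prompt = true;
    inputs.reasoning_format      = chat_params.reasoning_format;
    inputs.enable_thinking       = chat_params.enable_thinking;
    common_chat_params formatted = common_chat_templates_apply(chat_params.tmpls.get(), inputs);

    // 3) ship the already-formatted prompt; only tokenization happens server-side
    //    (see tokenize_cli_input), since mtmd_context and the vocab are not
    //    accessible outside server_context
    server_response_reader rd = ctx_server.get_response_reader();
    server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
    task.id         = rd.get_new_id();
    task.index      = 0;
    task.params     = defaults;          // copy of the default task params
    task.cli        = true;              // marks this as pre-formatted CLI input
    task.cli_prompt = formatted.prompt;  // rendered template output
    task.cli_files  = input_files;       // raw media buffers, processed by the server
    rd.post_task({std::move(task)});

    // ... then wait on rd for results, as generate_completion() does in the patch
}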