From 6df686bee68ff109f62123c7a8eac003f3dd9e20 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 19 Jan 2026 23:28:01 +0100 Subject: [PATCH] server : refactor oai_parser_opt, move it to server_chat_params (#18937) * server_chat_params * move chat format into CLI * use meta whenever possible * clean up, no more chatml fallback --- common/chat.cpp | 14 ++-- common/chat.h | 2 +- tools/cli/cli.cpp | 32 ++++++-- tools/server/server-common.cpp | 4 +- tools/server/server-common.h | 14 ++-- tools/server/server-context.cpp | 138 ++++++++++++++------------------ tools/server/server-context.h | 5 +- tools/server/server-task.h | 6 +- 8 files changed, 112 insertions(+), 103 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 28721ac7da..b29544dac0 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -601,18 +601,18 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp return tmpls->has_explicit_template; } -const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) { - if (variant != nullptr) { - if (strcmp(variant, "tool_use") == 0) { +std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) { + if (!variant.empty()) { + if (variant == "tool_use") { if (tmpls->template_tool_use) { - return tmpls->template_tool_use->source().c_str(); + return tmpls->template_tool_use->source(); } - return nullptr; + return ""; } else { - LOG_DBG("%s: unknown template variant: %s\n", __func__, variant); + LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str()); } } - return tmpls->template_default->source().c_str(); + return tmpls->template_default->source(); } common_chat_templates_ptr common_chat_templates_init( diff --git a/common/chat.h b/common/chat.h index 454085e90e..1488017382 100644 --- a/common/chat.h +++ b/common/chat.h @@ -191,7 +191,7 @@ common_chat_templates_ptr common_chat_templates_init( const std::string & eos_token_override = ""); bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls); -const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr); +std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = ""); struct common_chat_params common_chat_templates_apply( diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 2f0ffea1c2..caad29bac7 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -71,14 +71,16 @@ struct cli_context { std::string generate_completion(result_timings & out_timings) { server_response_reader rd = ctx_server.get_response_reader(); + auto formatted = format_chat(); { // TODO: reduce some copies here in the future server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); - task.id = rd.get_new_id(); - task.index = 0; - task.params = defaults; // copy - task.cli_input = messages; // copy - task.cli_files = input_files; // copy + task.id = rd.get_new_id(); + task.index = 0; + task.params = defaults; // copy + task.cli_prompt = formatted.prompt; // copy + task.cli_files = input_files; // copy + task.cli = true; rd.post_task({std::move(task)}); } @@ -156,6 +158,26 @@ struct cli_context { return content; } } + + common_chat_params format_chat() { + auto meta = ctx_server.get_meta(); + auto & chat_params = meta.chat_params; + + common_chat_templates_inputs inputs; + inputs.messages = common_chat_msgs_parse_oaicompat(messages); + inputs.tools = {}; // TODO + 
inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; + inputs.json_schema = ""; // TODO + inputs.grammar = ""; // TODO + inputs.use_jinja = chat_params.use_jinja; + inputs.parallel_tool_calls = false; + inputs.add_generation_prompt = true; + inputs.reasoning_format = chat_params.reasoning_format; + inputs.enable_thinking = chat_params.enable_thinking; + + // Apply chat template to the list of messages + return common_chat_templates_apply(chat_params.tmpls.get(), inputs); + } }; int main(int argc, char ** argv) { diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 16b0db2983..1bbe85322a 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -831,7 +831,7 @@ static void handle_media( // used by /chat/completions endpoint json oaicompat_chat_params_parse( json & body, /* openai api json semantics */ - const oaicompat_parser_options & opt, + const server_chat_params & opt, std::vector & out_files) { json llama_params; @@ -1012,7 +1012,7 @@ json oaicompat_chat_params_parse( } // Apply chat template to the list of messages - auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); + auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs); /* Append assistant prefilled message */ if (prefill_assistant_message) { diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 152a2a3c46..7f4c073874 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -274,25 +274,25 @@ std::vector tokenize_input_prompts( // OAI utils // -// used by /completions endpoint -json oaicompat_completion_params_parse(const json & body); - -struct oaicompat_parser_options { +struct server_chat_params { bool use_jinja; bool prefill_assistant; common_reasoning_format reasoning_format; - std::map chat_template_kwargs; - common_chat_templates * tmpls; + std::map chat_template_kwargs; // mapping key --> json value + common_chat_templates_ptr tmpls; bool allow_image; bool allow_audio; bool enable_thinking = true; std::string media_path; }; +// used by /completions endpoint +json oaicompat_completion_params_parse(const json & body); + // used by /chat/completions endpoint json oaicompat_chat_params_parse( json & body, /* openai api json semantics */ - const oaicompat_parser_options & opt, + const server_chat_params & opt, std::vector & out_files); // convert Anthropic Messages API format to OpenAI Chat Completions API format diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index c790ac79e1..f1f677addd 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -534,8 +534,8 @@ public: server_queue queue_tasks; server_response queue_results; - common_chat_templates_ptr chat_templates; - oaicompat_parser_options oai_parser_opt; + // note: chat_params must not be refreshed upon existing sleeping state + server_chat_params chat_params; ~server_context_impl() { if (!sleeping) { @@ -688,15 +688,6 @@ private: llama_init_dft->free_context(); } - chat_templates = common_chat_templates_init(model, params_base.chat_template); - try { - common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs); - } catch (const std::exception & e) { - SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what()); - SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. 
This may cause the model to output suboptimal responses\n", __func__); - chat_templates = common_chat_templates_init(model, "chatml"); - } - std::string & mmproj_path = params_base.mmproj.path; if (!mmproj_path.empty()) { if (!is_resume) { @@ -845,30 +836,6 @@ private: model_name = model_path.filename().string(); } - // thinking is enabled if: - // 1. It's not explicitly disabled (reasoning_budget == 0) - // 2. The chat template supports it - const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); - SRV_INF("thinking = %d\n", enable_thinking); - - oai_parser_opt = { - /* use_jinja */ params_base.use_jinja, - /* prefill_assistant */ params_base.prefill_assistant, - /* reasoning_format */ params_base.reasoning_format, - /* chat_template_kwargs */ params_base.default_template_kwargs, - /* common_chat_templates */ chat_templates.get(), - /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, - /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false, - /* enable_thinking */ enable_thinking, - /* media_path */ params_base.media_path, - }; - - // print sample chat example to make it clear which template is used - // @ngxson modern templates are too long, spam the logs; printing the example is enough - LOG_INF("%s: chat template, example_format: '%s'\n", __func__, - // common_chat_templates_source(chat_templates.get()), - common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); - if (!is_resume) { return init(); } @@ -907,6 +874,42 @@ private: } } + // populate chat template params + { + common_chat_templates_ptr chat_templates; + + try { + chat_templates = common_chat_templates_init(model, params_base.chat_template); + + LOG_INF("%s: chat template, example_format: '%s'\n", __func__, + common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); + + } catch (const std::exception & e) { + SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what()); + SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__); + SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__); + return false; + } + + // thinking is enabled if: + // 1. It's not explicitly disabled (reasoning_budget == 0) + // 2. The chat template supports it + const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); + SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking); + + chat_params = { + /* use_jinja */ params_base.use_jinja, + /* prefill_assistant */ params_base.prefill_assistant, + /* reasoning_format */ params_base.reasoning_format, + /* chat_template_kwargs */ params_base.default_template_kwargs, + /* tmpls */ std::move(chat_templates), + /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, + /* allow_audio */ mctx ? 
mtmd_support_audio (mctx) : false, + /* enable_thinking */ enable_thinking, + /* media_path */ params_base.media_path, + }; + } + return true; } @@ -1588,32 +1591,14 @@ private: // tokenize the input if it's set by CLI, return false on error bool tokenize_cli_input(server_task & task) { - GGML_ASSERT(task.cli_input != nullptr); try { - auto & opt = oai_parser_opt; - common_chat_templates_inputs inputs; - inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input); - inputs.tools = {}; // TODO - inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; - inputs.json_schema = ""; // TODO - inputs.grammar = ""; // TODO - inputs.use_jinja = opt.use_jinja; - inputs.parallel_tool_calls = false; - inputs.add_generation_prompt = true; - inputs.reasoning_format = opt.reasoning_format; - inputs.enable_thinking = opt.enable_thinking; - - // Apply chat template to the list of messages - auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); - - // tokenize the resulting prompt - auto & prompt = chat_params.prompt; + auto & prompt = task.cli_prompt; if (mctx != nullptr) { task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files); } else { task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]); } - task.cli_input.clear(); + task.cli_prompt.clear(); task.cli_files.clear(); } catch (const std::exception & e) { send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST); @@ -1689,7 +1674,7 @@ private: { // special case: if input is provided via CLI, tokenize it first // otherwise, no need to tokenize as it's already done inside the HTTP thread - if (task.cli_input != nullptr) { + if (task.cli) { if (!tokenize_cli_input(task)) { break; } @@ -2901,8 +2886,6 @@ server_response_reader server_context::get_response_reader() { } server_context_meta server_context::get_meta() const { - auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use"); - auto bos_id = llama_vocab_bos(impl->vocab); auto eos_id = llama_vocab_eos(impl->vocab); auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : ""; @@ -2913,14 +2896,13 @@ server_context_meta server_context::get_meta() const { /* model_name */ impl->model_name, /* model_path */ impl->params_base.model.path, /* has_mtmd */ impl->mctx != nullptr, - /* has_inp_image */ impl->oai_parser_opt.allow_image, - /* has_inp_audio */ impl->oai_parser_opt.allow_audio, + /* has_inp_image */ impl->chat_params.allow_image, + /* has_inp_audio */ impl->chat_params.allow_audio, /* json_webui_settings */ impl->json_webui_settings, /* slot_n_ctx */ impl->get_slot_n_ctx(), /* pooling_type */ llama_pooling_type(impl->ctx), - /* chat_template */ common_chat_templates_source(impl->chat_templates.get()), - /* chat_template_tool_use */ tool_use_src ? 
tool_use_src : "", + /* chat_params */ impl->chat_params, /* bos_token_str */ bos_token_str, /* eos_token_str */ eos_token_str, @@ -3202,8 +3184,8 @@ void server_routes::init_routes() { // this endpoint can be accessed during sleeping // the next LOC is to avoid someone accidentally use ctx_server - bool server_ctx; // do NOT delete this line - GGML_UNUSED(server_ctx); + bool ctx_server; // do NOT delete this line + GGML_UNUSED(ctx_server); res->ok({{"status", "ok"}}); return res; @@ -3393,8 +3375,8 @@ void server_routes::init_routes() { // this endpoint can be accessed during sleeping // the next LOC is to avoid someone accidentally use ctx_server - bool server_ctx; // do NOT delete this line - GGML_UNUSED(server_ctx); + bool ctx_server; // do NOT delete this line + GGML_UNUSED(ctx_server); task_params tparams; tparams.sampling = params.sampling; @@ -3403,6 +3385,9 @@ void server_routes::init_routes() { { "n_ctx", meta->slot_n_ctx }, }; + std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), ""); + std::string tmpl_tools = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use"); + json props = { { "default_generation_settings", default_generation_settings_for_props }, { "total_slots", params.n_parallel }, @@ -3417,15 +3402,15 @@ void server_routes::init_routes() { { "endpoint_metrics", params.endpoint_metrics }, { "webui", params.webui }, { "webui_settings", meta->json_webui_settings }, - { "chat_template", meta->chat_template }, + { "chat_template", tmpl_default }, { "bos_token", meta->bos_token_str }, { "eos_token", meta->eos_token_str }, { "build_info", meta->build_info }, { "is_sleeping", queue_tasks.is_sleeping() }, }; if (params.use_jinja) { - if (!meta->chat_template_tool_use.empty()) { - props["chat_template_tool_use"] = meta->chat_template_tool_use; + if (!tmpl_tools.empty()) { + props["chat_template_tool_use"] = tmpl_tools; } } res->ok(props); @@ -3446,6 +3431,7 @@ void server_routes::init_routes() { this->get_api_show = [this](const server_http_req &) { auto res = create_response(); + std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), ""); json data = { { "model_info", { @@ -3454,7 +3440,7 @@ void server_routes::init_routes() { }, {"modelfile", ""}, {"parameters", ""}, - {"template", meta->chat_template}, + {"template", tmpl_default}, {"details", { {"parent_model", ""}, {"format", "gguf"}, @@ -3579,7 +3565,7 @@ void server_routes::init_routes() { json body = json::parse(req.body); json body_parsed = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); return handle_completions_impl( req, @@ -3595,7 +3581,7 @@ void server_routes::init_routes() { json body = convert_anthropic_to_oai(json::parse(req.body)); json body_parsed = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); return handle_completions_impl( req, @@ -3611,7 +3597,7 @@ void server_routes::init_routes() { json body = convert_anthropic_to_oai(json::parse(req.body)); json body_parsed = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); json prompt = body_parsed.at("prompt"); @@ -3627,7 +3613,7 @@ void server_routes::init_routes() { json body = json::parse(req.body); json data = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); res->ok({{ "prompt", std::move(data.at("prompt")) }}); return res; @@ -3638,8 +3624,8 @@ void server_routes::init_routes() { // this endpoint can 
be accessed during sleeping // the next LOC is to avoid someone accidentally use ctx_server - bool server_ctx; // do NOT delete this line - GGML_UNUSED(server_ctx); + bool ctx_server; // do NOT delete this line + GGML_UNUSED(ctx_server); json models = { {"models", { diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 09bec15ae1..ec1df96950 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -20,9 +20,8 @@ struct server_context_meta { int slot_n_ctx; enum llama_pooling_type pooling_type; - // chat template - std::string chat_template; - std::string chat_template_tool_use; + // chat params + server_chat_params & chat_params; // tokens std::string bos_token_str; diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 11943ee4f8..daffe0c904 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -130,8 +130,10 @@ struct server_task { task_params params; server_tokens tokens; - // only used by CLI, this delegates the tokenization to the server - json cli_input = nullptr; + // only used by CLI, this allow tokenizing CLI inputs on server side + // we need this because mtmd_context and vocab are not accessible outside of server_context + bool cli = false; + std::string cli_prompt; std::vector cli_files; server_task_type type;
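
For readers following the refactor, the sketch below (not part of the patch) condenses the new CLI-side flow from tools/cli/cli.cpp: the chat template is rendered on the client using the server's server_chat_params, and only the formatted prompt plus raw media buffers are handed to the server, which does the tokenization. The wrapper name post_chat_completion() is invented for illustration; error handling, streaming, timings, and the TODO fields (tools, json_schema, grammar) are omitted.

// Illustrative sketch only, assuming the cli_context members shown in the diff
// (ctx_server, messages, defaults, input_files); not a drop-in addition to the patch.
void cli_context::post_chat_completion() {
    // 1) fetch the server's chat parameters; the template is now owned by
    //    server_chat_params (common_chat_templates_ptr) instead of being a raw
    //    pointer inside the old oai_parser_opt
    auto   meta        = ctx_server.get_meta();
    auto & chat_params = meta.chat_params;

    // 2) render the chat template on the CLI side (mirrors format_chat() above)
    common_chat_templates_inputs inputs;
    inputs.messages              = common_chat_msgs_parse_oaicompat(messages);
    inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
    inputs.use_jinja             = chat_params.use_jinja;
    inputs.add_generation_prompt = true;
    inputs.reasoning_format      = chat_params.reasoning_format;
    inputs.enable_thinking       = chat_params.enable_thinking;
    common_chat_params formatted = common_chat_templates_apply(chat_params.tmpls.get(), inputs);

    // 3) ship the already-formatted prompt; only tokenization happens server-side
    //    (see tokenize_cli_input), since mtmd_context and the vocab are not
    //    accessible outside server_context
    server_response_reader rd = ctx_server.get_response_reader();
    server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
    task.id         = rd.get_new_id();
    task.index      = 0;
    task.params     = defaults;          // copy of the default task params
    task.cli        = true;              // marks this as pre-formatted CLI input
    task.cli_prompt = formatted.prompt;  // rendered template output
    task.cli_files  = input_files;       // raw media buffers, processed by the server
    rd.post_task({std::move(task)});

    // ... then wait on rd for results, as generate_completion() does in the patch
}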