From 51fa458a92d6a3f305f8fd76fc8f702e3e87ddb5 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Thu, 22 Jan 2026 21:30:06 +0100
Subject: [PATCH] server : support preserving reasoning_content in assistant
 message (#18994)

* support reasoning_content input
* report template caps to webui
* add docs
* rm commented code
---
 common/chat-parser.cpp          |   4 +-
 common/chat.cpp                 | 175 +++++++++++++-------------------
 common/chat.h                   |  25 +++--
 common/jinja/caps.cpp           |  53 +++++++++-
 common/jinja/caps.h             |   6 +-
 tests/test-chat.cpp             |  16 +--
 tools/server/README.md          |   8 ++
 tools/server/server-context.cpp |   2 +
 tools/server/server-context.h   |   1 +
 tools/server/server-task.cpp    |   6 +-
 10 files changed, 165 insertions(+), 131 deletions(-)

diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index c2d1e30f35..29819e48d3 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -1630,7 +1630,7 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     }
     auto msg = builder.result();
     if (!is_partial) {
-        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
     }
     return msg;
 }
@@ -1663,7 +1663,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std
         mapper.from_ast(ctx.ast, result);
     }
     if (!is_partial) {
-        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat<json>({msg}).at(0).dump().c_str());
+        LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str());
     }
     return msg;
 }
diff --git a/common/chat.cpp b/common/chat.cpp
index b29544dac0..6853f4ad47 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -7,9 +7,6 @@
 #include "log.h"
 #include "regex-partial.h"
 
-// #include 
-// #include 
-
 #include "jinja/parser.h"
 #include "jinja/value.h"
 #include "jinja/runtime.h"
@@ -56,39 +53,73 @@ static bool has_content_or_tool_calls(const common_chat_msg & msg) {
     return !msg.content.empty() || !msg.tool_calls.empty();
 }
 
-template <>
-json common_chat_msg::to_json_oaicompat() const
-{
-    json message {
-        {"role", "assistant"},
-    };
-    if (!reasoning_content.empty()) {
-        message["reasoning_content"] = reasoning_content;
+json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
+    if (!content.empty() && !content_parts.empty()) {
+        throw std::runtime_error("Cannot specify both content and content_parts");
     }
-    if (content.empty() && !tool_calls.empty()) {
-        message["content"] = json();
+    json jmsg {
+        {"role", role},
+    };
+    if (!content.empty()) {
+        jmsg["content"] = content;
+    } else if (!content_parts.empty()) {
+        if (concat_typed_text) {
+            std::string text;
+            for (const auto & part : content_parts) {
+                if (part.type != "text") {
+                    LOG_WRN("Ignoring content part type: %s\n", part.type.c_str());
+                    continue;
+                }
+                if (!text.empty()) {
+                    text += '\n';
+                }
+                text += part.text;
+            }
+            jmsg["content"] = text;
+        } else {
+            auto & parts = jmsg["content"] = json::array();
+            for (const auto & part : content_parts) {
+                parts.push_back({
+                    {"type", part.type},
+                    {"text", part.text},
+                });
+            }
+        }
     } else {
-        message["content"] = content;
+        jmsg["content"] = "";
+    }
+    if (!reasoning_content.empty()) {
+        jmsg["reasoning_content"] = reasoning_content;
+    }
+    if (!tool_name.empty()) {
+        jmsg["name"] = tool_name;
+    }
+    if (!tool_call_id.empty()) {
+        jmsg["tool_call_id"] = tool_call_id;
     }
     if (!tool_calls.empty()) {
-        auto arr = json::array();
-        for (const auto & tc : tool_calls) {
-            arr.push_back({
+ jmsg["tool_calls"] = json::array(); + auto & jtool_calls = jmsg["tool_calls"]; + for (const auto & tool_call : tool_calls) { + json tc { {"type", "function"}, {"function", { - {"name", tc.name}, - {"arguments", tc.arguments}, + {"name", tool_call.name}, + {"arguments", tool_call.arguments}, }}, - {"id", tc.id}, - // // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). - // // We only generate a random id for the ones that don't generate one by themselves - // // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) - // {"id", tc.id.empty() ? gen_tool_call_id() : tc.id}, - }); + }; + if (!tool_call.id.empty()) { + tc["id"] = tool_call.id; + } + // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). + // We only generate a random id for the ones that don't generate one by themselves + // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) + // {"id", tc.id.empty() ? gen_tool_call_id() : tc.id}, + jtool_calls.push_back(tc); } - message["tool_calls"] = arr; } - return message; + + return jmsg; } std::vector common_chat_msg_diff::compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new) { @@ -256,7 +287,6 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates * return rendered_no_thinking.prompt != rendered_with_thinking.prompt; } -template <> std::vector common_chat_msgs_parse_oaicompat(const json & messages) { std::vector msgs; @@ -350,80 +380,15 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa return msgs; } -template <> json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text) { json messages = json::array(); for (const auto & msg : msgs) { - if (!msg.content.empty() && !msg.content_parts.empty()) { - throw std::runtime_error("Cannot specify both content and content_parts"); - } - json jmsg { - {"role", msg.role}, - }; - if (!msg.content.empty()) { - jmsg["content"] = msg.content; - } else if (!msg.content_parts.empty()) { - if (concat_typed_text) { - std::string text; - for (const auto & part : msg.content_parts) { - if (part.type != "text") { - LOG_WRN("Ignoring content part type: %s\n", part.type.c_str()); - continue; - } - if (!text.empty()) { - text += '\n'; - } - text += part.text; - } - jmsg["content"] = text; - } else { - auto & parts = jmsg["content"] = json::array(); - for (const auto & part : msg.content_parts) { - parts.push_back({ - {"type", part.type}, - {"text", part.text}, - }); - } - } - } else { - jmsg["content"] = ""; - } - if (!msg.reasoning_content.empty()) { - jmsg["reasoning_content"] = msg.reasoning_content; - } - if (!msg.tool_name.empty()) { - jmsg["name"] = msg.tool_name; - } - if (!msg.tool_call_id.empty()) { - jmsg["tool_call_id"] = msg.tool_call_id; - } - if (!msg.tool_calls.empty()) { - auto & tool_calls = jmsg["tool_calls"] = json::array(); - for (const auto & tool_call : msg.tool_calls) { - json tc { - {"type", "function"}, - {"function", { - {"name", tool_call.name}, - {"arguments", tool_call.arguments}, - }}, - }; - if (!tool_call.id.empty()) { - tc["id"] = tool_call.id; - } - tool_calls.push_back(tc); - } - } + json jmsg = msg.to_json_oaicompat(concat_typed_text); messages.push_back(jmsg); } return messages; } -template <> -std::vector common_chat_msgs_parse_oaicompat(const std::string & messages) { - return common_chat_msgs_parse_oaicompat(json::parse(messages)); -} 
-
-template <>
 std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & tools) {
     std::vector<common_chat_tool> result;
 
@@ -459,12 +424,6 @@ std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const json & too
     return result;
 }
 
-template <>
-std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const std::string & tools) {
-    return common_chat_tools_parse_oaicompat(json::parse(tools));
-}
-
-template <>
 json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools) {
     if (tools.empty()) {
         return json();
@@ -484,7 +443,7 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t
     return result;
 }
 
-template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
+json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
     json delta = json::object();
     if (!diff.reasoning_content_delta.empty()) {
         delta["reasoning_content"] = diff.reasoning_content_delta;
@@ -2867,13 +2826,13 @@ static common_chat_params common_chat_templates_apply_jinja(
     const struct common_chat_templates_inputs & inputs) {
     templates_params params;
-    params.tools = common_chat_tools_to_json_oaicompat<json>(inputs.tools);
+    params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
     const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use
         ? *tmpls->template_tool_use
         : *tmpls->template_default;
     const auto & src = tmpl.source();
     const auto & caps = tmpl.original_caps();
 
-    params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
+    params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
     params.add_generation_prompt = inputs.add_generation_prompt;
     params.tool_choice = inputs.tool_choice;
     params.reasoning_format = inputs.reasoning_format;
@@ -2943,6 +2902,10 @@ static common_chat_params common_chat_templates_apply_jinja(
         src.find("") != std::string::npos &&
         params.json_schema.is_null()) {
         workaround::func_args_not_string(params.messages);
+        if (!params.extra_context.contains("clear_thinking")) {
+            // by default, do not clear reasoning_content (added since GLM-4.7)
+            params.extra_context["clear_thinking"] = false;
+        }
         return common_chat_params_init_glm_4_5(tmpl, params);
     }
 
@@ -3174,3 +3137,9 @@ common_chat_params common_chat_templates_apply(
         ? common_chat_templates_apply_jinja(tmpls, inputs)
         : common_chat_templates_apply_legacy(tmpls, inputs);
 }
+
+std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates) {
+    GGML_ASSERT(chat_templates != nullptr);
+    GGML_ASSERT(chat_templates->template_default != nullptr);
+    return chat_templates->template_default->caps.to_map();
+}
diff --git a/common/chat.h b/common/chat.h
index ac19348ece..24aa4aab5c 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -10,6 +10,8 @@
 #include 
 #include 
+#include 
+
 struct common_chat_templates;
 
 struct common_chat_tool_call {
@@ -26,6 +28,11 @@ struct common_chat_msg_content_part {
     std::string type;
     std::string text;
 
+    // TODO @ngxson : no known chat templates support reasoning_content in content parts yet
+    // this can be useful for models with interleaved thinking (like Kimi-K2)
+    // if you see any templates explicitly support this, please ping me
+    // std::string reasoning_content;
+
     bool operator==(const common_chat_msg_content_part & other) const {
         return type == other.type && text == other.text;
     }
@@ -40,7 +47,7 @@ struct common_chat_msg {
     std::string tool_name;
     std::string tool_call_id;
 
-    template <class T> T to_json_oaicompat() const;
+    nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const;
 
     bool empty() const {
         return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
@@ -232,13 +239,13 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
 bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
 
 // Parses a JSON array of messages in OpenAI's chat completion API format.
-// T can be std::string containing JSON or nlohmann::ordered_json
-template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
-template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
+std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages);
+nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
 
-// Parses a JSON array of tools in OpenAI's chat completion tool call API format.
-// T can be std::string containing JSON or nlohmann::ordered_json
-template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
-template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
+std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools);
+nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
 
-template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
+nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
+
+// get template caps, useful for reporting to server /props endpoint
+std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_templates * chat_templates);
diff --git a/common/jinja/caps.cpp b/common/jinja/caps.cpp
index 61deccd1f5..f27490f1fb 100644
--- a/common/jinja/caps.cpp
+++ b/common/jinja/caps.cpp
@@ -61,14 +61,23 @@ static void caps_print_stats(value & v, const std::string & path) {
         ops.c_str());
 }
 
+std::map<std::string, bool> caps::to_map() const {
+    return {
+        {"requires_typed_content", requires_typed_content},
+        {"supports_tools", supports_tools},
+        {"supports_tool_calls", supports_tool_calls},
+        {"supports_parallel_tool_calls", supports_parallel_tool_calls},
+        {"supports_system_role", supports_system_role},
+        {"supports_preserve_reasoning", supports_preserve_reasoning},
+    };
+}
+
 std::string caps::to_string() const {
     std::ostringstream ss;
     ss << "Caps(\n";
-    ss << " requires_typed_content=" << requires_typed_content << "\n";
-    ss << " supports_tools=" << supports_tools << "\n";
-    ss << " supports_tool_calls=" << supports_tool_calls << "\n";
-    ss << " supports_parallel_tool_calls=" << supports_parallel_tool_calls << "\n";
-    ss << " supports_system_role=" << supports_system_role << "\n";
+    for (const auto & [key, value] : to_map()) {
+        ss << " " << key << "=" << (value ? "true" : "false") << "\n";
+    }
     ss << ")";
     return ss.str();
 }
@@ -229,6 +238,40 @@ caps caps_get(jinja::program & prog) {
         }
     );
 
+    // case: preserve reasoning content in chat history
+    caps_try_execute(
+        prog,
+        [&]() {
+            // messages
+            return json::array({
+                {
+                    {"role", "user"},
+                    {"content", "User message"}
+                },
+                {
+                    {"role", "assistant"},
+                    {"content", "Assistant message"},
+                    {"reasoning_content", "Reasoning content"}
+                },
+                {
+                    {"role", "user"},
+                    {"content", "User message"}
+                },
+            });
+        },
+        [&]() {
+            // tools
+            return json::array();
+        },
+        [&](bool, value & messages, value &) {
+            auto & content = messages->at(1)->at("reasoning_content");
+            caps_print_stats(content, "messages[1].reasoning_content");
+            if (content->stats.used) {
+                result.supports_preserve_reasoning = true;
+            }
+        }
+    );
+
     JJ_DEBUG("%s\n", result.to_string().c_str());
 
     return result;
diff --git a/common/jinja/caps.h b/common/jinja/caps.h
index deb2df180f..77df117baa 100644
--- a/common/jinja/caps.h
+++ b/common/jinja/caps.h
@@ -3,6 +3,7 @@
 #include "runtime.h"
 
 #include 
+#include <map>
 
 namespace jinja {
@@ -11,14 +12,17 @@ struct caps {
     bool supports_tool_calls = true;
     bool supports_system_role = true;
     bool supports_parallel_tool_calls = true;
+    bool supports_preserve_reasoning = false; // support assistant message with reasoning_content
     bool requires_typed_content = false; // default: use string content
 
+    // for reporting on server
+    std::map<std::string, bool> to_map() const;
+
     // for debugging
     std::string to_string() const;
 };
 
 caps caps_get(jinja::program & prog);
-void debug_print_caps(const caps & c);
 
 } // namespace jinja
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index 6820acf679..de7075e6e5 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -462,9 +462,9 @@ static void test_parser_with_streaming(const common_chat_msg & expected, const s
     for (size_t i = 1; i <= raw_message.size(); ++i) {
         auto curr_msg = parse_msg(std::string(utf8_truncate_safe_view(std::string_view(raw_message).substr(0, i))));
         if (curr_msg == simple_assist_msg("")) continue;
-        LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat<json>({curr_msg}).dump().c_str());
+        LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat({curr_msg}).dump().c_str());
         for (auto diff: common_chat_msg_diff::compute_diffs(last_msg, curr_msg)) {
-            LOG_INF("Streaming diff: %s\n", common_chat_msg_diff_to_json_oaicompat<json>(diff).dump().c_str());
+            LOG_INF("Streaming diff: %s\n", common_chat_msg_diff_to_json_oaicompat(diff).dump().c_str());
             if (!diff.reasoning_content_delta.empty()) {
                 merged.reasoning_content += diff.reasoning_content_delta;
             }
@@ -480,7 +480,7 @@ static void test_parser_with_streaming(const common_chat_msg & expected, const s
                 merged.tool_calls.back().arguments += diff.tool_call_delta.arguments;
             }
         }
-        LOG_INF("Streaming merged: %s\n", common_chat_msgs_to_json_oaicompat<json>({merged}).dump().c_str());
+        LOG_INF("Streaming merged: %s\n", common_chat_msgs_to_json_oaicompat({merged}).dump().c_str());
         }
         assert_msg_equals(curr_msg, merged, true);
         last_msg = curr_msg;
@@ -622,7 +622,7 @@ static void test_msgs_oaicompat_json_conversion() {
         message_assist_call_code_interpreter,
     };
     for (const auto & msg : msgs) {
-        auto oai_json = common_chat_msgs_to_json_oaicompat<json>({msg});
+        auto oai_json = common_chat_msgs_to_json_oaicompat({msg});
         auto msgs2 = common_chat_msgs_parse_oaicompat(oai_json);
         assert_equals((size_t) 1, msgs2.size());
         auto msg2 = msgs2[0];
@@ -646,7 +646,7 @@ static void test_msgs_oaicompat_json_conversion() {
             " }\n"
             "]"
         ),
-        common_chat_msgs_to_json_oaicompat<json>({message_user_parts}).dump(2));
+        common_chat_msgs_to_json_oaicompat({message_user_parts}).dump(2));
 
     assert_equals(
         std::string(
            " }\n"
            "]"
        ),
-        common_chat_msgs_to_json_oaicompat<json>({message_assist_call_python}).dump(2));
+        common_chat_msgs_to_json_oaicompat({message_assist_call_python}).dump(2));
 
     auto res = common_chat_msgs_parse_oaicompat(json::parse("[{\"role\": \"assistant\", \"tool_calls\": []}]"));
     assert_equals(1, res.size());
@@ -693,7 +693,7 @@ static void test_tools_oaicompat_json_conversion() {
     };
 
     for (const auto & tool : tools) {
-        auto oai_json = common_chat_tools_to_json_oaicompat<json>({tool});
+        auto oai_json = common_chat_tools_to_json_oaicompat({tool});
         auto tools2 = common_chat_tools_parse_oaicompat(oai_json);
         assert_equals((size_t) 1, tools2.size());
         auto tool2 = tools2[0];
@@ -726,7 +726,7 @@ static void test_tools_oaicompat_json_conversion() {
             " }\n"
             "]"
         ),
-        common_chat_tools_to_json_oaicompat<json>({special_function_tool}).dump(2));
+        common_chat_tools_to_json_oaicompat({special_function_tool}).dump(2));
 
     {
         auto tools_no_params = common_chat_tools_parse_oaicompat(json::parse(
diff --git a/tools/server/README.md b/tools/server/README.md
index 191391a882..f113f9cb75 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -781,6 +781,7 @@ By default, it is read-only. To make POST request to change global properties, y
   "total_slots": 1,
   "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
   "chat_template": "...",
+  "chat_template_caps": {},
   "modalities": {
     "vision": false
   },
@@ -793,6 +794,7 @@ By default, it is read-only. To make POST request to change global properties, y
 - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
 - `model_path` - the path to model file (same with `-m` argument)
 - `chat_template` - the model's original Jinja2 prompt template
+- `chat_template_caps` - capabilities of the chat template (see `common/jinja/caps.h` for more info)
 - `modalities` - the list of supported modalities
 - `is_sleeping` - sleeping status, see [Sleeping on idle](#sleeping-on-idle)
 
@@ -1267,6 +1269,12 @@ This provides information on the performance of the server. It also allows calcu
 
 The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`
 
+*Reasoning support*
+
+The server supports parsing and returning reasoning via the `reasoning_content` field, similar to the DeepSeek API.
+
+Reasoning input (preserving reasoning in history) is also supported by some specific templates. For more details, please refer to [PR#18994](https://github.com/ggml-org/llama.cpp/pull/18994).
+
 ### POST `/v1/responses`: OpenAI-compatible Responses API
 
 *Options:*
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 9a828e1eff..73cb4c75b3 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2903,6 +2903,7 @@ server_context_meta server_context::get_meta() const {
 
         /* pooling_type       */ llama_pooling_type(impl->ctx),
         /* chat_params        */ impl->chat_params,
+        /* chat_template_caps */ common_chat_templates_get_caps(impl->chat_params.tmpls.get()),
 
         /* bos_token_str      */ bos_token_str,
         /* eos_token_str      */ eos_token_str,
@@ -3410,6 +3411,7 @@ void server_routes::init_routes() {
             { "webui", params.webui },
             { "webui_settings", meta->json_webui_settings },
             { "chat_template", tmpl_default },
+            { "chat_template_caps", meta->chat_template_caps },
             { "bos_token", meta->bos_token_str },
             { "eos_token", meta->eos_token_str },
             { "build_info", meta->build_info },
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index 3e5e870fc5..c0b5d373ff 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -22,6 +22,7 @@ struct server_context_meta {
 
     // chat params
     server_chat_params & chat_params;
+    std::map<std::string, bool> chat_template_caps;
 
     // tokens
     std::string bos_token_str;
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index eeaf5d2f6a..799e341d37 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -700,7 +700,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat() {
     json choice {
         {"finish_reason", finish_reason},
         {"index", index},
-        {"message", msg.to_json_oaicompat<json>()},
+        {"message", msg.to_json_oaicompat()},
     };
 
     if (!stream && probs_output.size() > 0) {
@@ -750,7 +750,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
             json {
                 {"finish_reason", nullptr},
                 {"index", 0},
-                {"delta", common_chat_msg_diff_to_json_oaicompat<json>(diff)},
+                {"delta", common_chat_msg_diff_to_json_oaicompat(diff)},
             },
         })},
         {"created", t},
@@ -1383,7 +1383,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
     }
 
     for (const auto & diff : oaicompat_msg_diffs) {
-        add_delta(common_chat_msg_diff_to_json_oaicompat<json>(diff));
+        add_delta(common_chat_msg_diff_to_json_oaicompat(diff));
     }
 
     if (!deltas.empty()) {