From 38292630106ca50a91d7c9bd7aaa13d5f78ba157 Mon Sep 17 00:00:00 2001
From: Alec Koumjian <akoumjian@gmail.com>
Date: Sun, 11 Jan 2026 19:17:35 +0000
Subject: [PATCH] enforce response_format and json_schema for Kimi K2

---
 common/chat-parser-xml-toolcall.cpp |  40 +++++++--
 common/chat.cpp                     |  20 ++++-
 tests/CMakeLists.txt                |   1 +
 tests/test-kimi-response-format.cpp | 121 ++++++++++++++++++++++++++++
 4 files changed, 173 insertions(+), 9 deletions(-)
 create mode 100644 tests/test-kimi-response-format.cpp

diff --git a/common/chat-parser-xml-toolcall.cpp b/common/chat-parser-xml-toolcall.cpp
index a80900ff8d..11bddc307e 100644
--- a/common/chat-parser-xml-toolcall.cpp
+++ b/common/chat-parser-xml-toolcall.cpp
@@ -667,18 +667,42 @@ inline void parse_msg_with_xml_tool_calls(common_chat_msg_parser & builder, cons
         return l;
     };
     constexpr auto trim_suffix = [](std::string &content, std::initializer_list<std::string_view> list) {
-        auto best_match = content.size();
-        for (auto pattern: list) {
-            if (pattern.size() == 0) continue;
+        // Erase from `content` any trailing fragment that equals a leading part (up to the whole) of a pattern
+        // in `list`, i.e. text that looks like an incomplete special marker (e.g. "<|tool_call_end|>").
+        // Some tool syntaxes put a normal JSON delimiter *before* the special token, e.g. "}<|tool_call_end|>";
+        // in that case the valid JSON '}' must not be trimmed when only the beginning of the pattern matches.
+        auto best_erase_from = content.size();
+
+        for (auto pattern : list) {
+            if (pattern.empty()) {
+                continue;
+            }
+
+            // Everything in the pattern before its first '<' is treated as a "normal prefix"; trimming only
+            // happens once the suffix match extends past that prefix (i.e. the special token was started).
+            const auto special_pos = pattern.find('<');
+
             for (auto match_idx = content.size() - std::min(pattern.size(), content.size()); content.size() > match_idx; match_idx++) {
-                auto match_len = content.size() - match_idx;
-                if (content.compare(match_idx, match_len, pattern.data(), match_len) == 0 && best_match > match_idx) {
-                    best_match = match_idx;
+                const auto match_len = content.size() - match_idx;
+                if (content.compare(match_idx, match_len, pattern.data(), match_len) != 0) {
+                    continue;
+                }
+
+                if (special_pos != std::string_view::npos && special_pos > 0) {
+                    // The match covers only (part of) the normal prefix (e.g. "}") - do not trim.
+                    if (match_len <= special_pos) {
+                        continue;
+                    }
+                    // Erase from where the special token starts, keeping the normal prefix in `content`.
+                    best_erase_from = std::min(best_erase_from, match_idx + special_pos);
+                } else {
+                    best_erase_from = std::min(best_erase_from, match_idx);
                 }
             }
         }
-        if (content.size() > best_match) {
-            content.erase(best_match);
+
+        if (content.size() > best_erase_from) {
+            content.erase(best_erase_from);
         }
     };
     const auto trim_potential_partial_word = [&start_think, &end_think, &form, trim_suffix](std::string &content) {
diff --git a/common/chat.cpp b/common/chat.cpp
index eeb38ad06a..bd2597f498 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1881,11 +1881,29 @@ static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_c
 
 static common_chat_params common_chat_params_init_kimi_k2(const common_chat_template & tmpl, const struct templates_params & params) {
     common_chat_params data;
-    data.grammar_lazy = params.tools.is_array() && !params.tools.empty() && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+    const bool has_tools  = params.tools.is_array() && !params.tools.empty();
+    const bool has_schema = params.json_schema.is_object();
+
+    // Reject unsupported argument combinations up front, before the template is rendered.
+    if (has_tools && has_schema) {
+        throw std::runtime_error("Kimi K2: cannot combine \"tools\" with \"json_schema\"/response_format; remove tools or remove response_format");
+    }
+    if (has_schema && !params.grammar.empty()) {
+        throw std::runtime_error("Either \"json_schema\" or \"grammar\" can be specified, but not both");
+    }
+
+    data.grammar_lazy = has_tools && params.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
 
     data.prompt = apply(tmpl, params);
     data.format = COMMON_CHAT_FORMAT_KIMI_K2;
 
+    if (has_schema) {
+        // No tools here (checked above): mirror the generic "content-only" schema enforcement behavior.
+        data.grammar = json_schema_to_grammar(params.json_schema);
+    } else {
+        data.grammar = params.grammar;
+    }
+
     data.preserved_tokens = {
         "<think>",
         "</think>",
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index c9436c5995..196f689ac7 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -188,6 +188,7 @@ llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp)
 llama_build_and_test(test-chat-template.cpp)
 llama_build_and_test(test-jinja.cpp)
 llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python)
+llama_build_and_test(test-kimi-response-format.cpp)
 llama_build_and_test(test-json-partial.cpp)
 llama_build_and_test(test-log.cpp)
 llama_build_and_test(
diff --git a/tests/test-kimi-response-format.cpp b/tests/test-kimi-response-format.cpp
new file mode 100644
index 0000000000..813c9afb6e
--- /dev/null
+++ b/tests/test-kimi-response-format.cpp
@@ -0,0 +1,121 @@
+#include <cassert>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "chat.h"
+
+// Regression test:
+// - llama-server /chat/completions parses `response_format` into a JSON schema and passes it into
+//   common_chat_templates_apply() as inputs.json_schema.
+// - For templates detected as "Kimi K2", llama.cpp selected a Kimi-specific handler that did not
+//   apply json_schema-to-grammar conversion, so schema enforcement was silently dropped.
+//
+// This test asserts that for the Kimi K2 chat template, providing a json_schema results in a
+// non-empty grammar being returned by common_chat_templates_apply() (hard enforcement expected).
+
+static const char * KIMI_K2_TEMPLATE = R"JINJA({%- if tools -%}
+  <|im_system|>tool_declare<|im_middle|>
+  # Tools
+  {{ tools | tojson }}<|im_end|>
+{%- endif -%}
+{%- for message in messages -%}
+  {%- if loop.first and messages[0]['role'] != 'system' -%}
+    <|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|>
+  {%- endif -%}
+
+  {%- set role_name =  message.get('name') or  message['role'] -%}
+  {%- if message['role'] == 'user' -%}
+    <|im_user|>{{role_name}}<|im_middle|>
+  {%- elif message['role'] == 'assistant' -%}
+    <|im_assistant|>{{role_name}}<|im_middle|>
+  {%- else -%}
+    <|im_system|>{{role_name}}<|im_middle|>
+  {% endif %}
+
+  {%- if message['role'] == 'assistant' and message.get('tool_calls') -%}
+    {%- if message['content'] -%}{{ message['content'] }}{%- endif -%}
+    <|tool_calls_section_begin|>
+    {%- for tool_call in message['tool_calls'] -%}
+      {%- set formatted_id = tool_call['id'] -%}
+      <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
+    {%- endfor -%}
+    <|tool_calls_section_end|>
+  {%- elif message['role'] == 'tool' -%}
+    ## Return of {{ message.tool_call_id }}
+    {{ message['content'] }}
+  {%- elif message['content'] is string -%}
+    {{ message['content'] }}
+  {%- elif message['content'] is not none -%}
+    {% for content in message['content'] -%}
+      {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+        <|media_start|>image<|media_content|><|media_pad|><|media_end|>
+      {% else -%}
+        {{ content['text'] }}
+      {%- endif -%}
+    {%- endfor -%}
+  {%- endif -%}
+  <|im_end|>
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+  <|im_assistant|>assistant<|im_middle|>
+{%- endif -%})JINJA";
+
+// Test driver. Exit codes: 0 = pass; 1 = the Kimi K2 handler was not selected for the template;
+// 3 = json_schema without tools produced an empty grammar; 2 = tools + json_schema did not throw.
+// Failures are reported via exit codes rather than assert(), which is compiled out under NDEBUG.
+int main() {
+    auto tmpls = common_chat_templates_init(/* model= */ nullptr, KIMI_K2_TEMPLATE);
+
+    common_chat_templates_inputs inputs;
+    inputs.use_jinja = true;
+    inputs.add_generation_prompt = true;
+
+    // No tools
+    inputs.tools = {};
+    inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
+
+    inputs.json_schema = R"JSON({
+      "type": "object",
+      "properties": { "ok": { "type": "boolean" } },
+      "required": ["ok"],
+      "additionalProperties": false
+    })JSON";
+
+    inputs.messages = {
+        common_chat_msg{"system", "Return ONLY JSON with key ok.", {}, {}, "", "", ""},
+        common_chat_msg{"user", "ok", {}, {}, "", "", ""},
+    };
+
+    const auto out = common_chat_templates_apply(tmpls.get(), inputs);
+
+    // Confirm the Kimi K2 handler was actually selected (not a generic fallback).
+    if (out.format != COMMON_CHAT_FORMAT_KIMI_K2) { return 1; }
+    if (out.grammar.empty()) { return 3; }
+
+    // tools + json_schema is explicitly unsupported for Kimi K2 (ambiguous composition).
+    // Ensure we fail loudly rather than silently dropping schema enforcement.
+    inputs.tools = {
+        common_chat_tool{
+            /* .name = */ "noop",
+            /* .description = */ "No-op tool",
+            /* .parameters = */ R"JSON({
+              "type": "object",
+              "properties": { "x": { "type": "string" } },
+              "required": ["x"],
+              "additionalProperties": false
+            })JSON",
+        },
+    };
+    inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
+
+    bool threw = false;
+    try {
+        (void) common_chat_templates_apply(tmpls.get(), inputs);
+    } catch (const std::exception &) {
+        threw = true;
+    }
+    // The unsupported combination must have been rejected with an exception.
+    return threw ? 0 : 2;
+}
+