Implement proper prefill extraction

This commit is contained in:
Piotr Wilkin 2026-03-14 15:46:36 +01:00
parent 526e926947
commit ba6410ff74
13 changed files with 73 additions and 63 deletions

View File

@ -46,36 +46,48 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = autoparser.preserved_tokens;
// Extract reasoning prefill and detect template artifact start markers.
// See docs/autoparser.md "Reasoning Prefill" for details.
// Extract what the template appends when add_generation_prompt=true (the generation prompt suffix).
std::string gen_prompt_suffix;
{
template_params tparams;
tparams.messages = json::array({ json{ {"role", "user"}, {"content", "x"} } });
tparams.add_generation_prompt = false;
tparams.enable_thinking = inputs.enable_thinking;
auto result = compare_variants(tmpl, tparams, [](template_params & p) {
p.add_generation_prompt = true;
});
if (result) {
gen_prompt_suffix = result->diff.right;
}
}
// Fallback for templates that ignore add_generation_prompt: search the rendered prompt.
// Excluded for TOOLS_ONLY: the start tag there is model-generated and may appear in prior turns.
const std::string & prompt_to_search =
(gen_prompt_suffix.empty() && autoparser.reasoning.mode != reasoning_mode::TOOLS_ONLY)
? data.prompt
: gen_prompt_suffix;
bool clear_reasoning_start = false;
if (inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
autoparser.reasoning.mode != reasoning_mode::NONE &&
!autoparser.reasoning.end.empty()) {
const auto & r_start = autoparser.reasoning.start;
const auto & r_end = autoparser.reasoning.end;
const auto & r_start = autoparser.reasoning.start;
const auto & r_end = autoparser.reasoning.end;
auto r_end_t = trim_trailing_whitespace(r_end);
auto r_start_t = trim_trailing_whitespace(r_start);
auto prompt_trimmed = trim_trailing_whitespace(data.prompt);
auto r_end_trimmed = trim_trailing_whitespace(r_end);
auto r_start_trimmed = trim_trailing_whitespace(r_start);
if (!r_start_t.empty()) {
auto start_pos = prompt_to_search.rfind(r_start_t);
if (start_pos != std::string::npos) {
std::string from_start = prompt_to_search.substr(start_pos);
auto fs_trimmed = trim_trailing_whitespace(from_start);
if (!r_start_trimmed.empty()) {
if (string_ends_with(prompt_trimmed, r_end_trimmed)) {
auto before_end = trim_trailing_whitespace(prompt_trimmed.substr(0, prompt_trimmed.size() - r_end_trimmed.size()));
if (string_ends_with(before_end, r_start_trimmed)) {
// Start+end at prompt end — use canonical markers to preserve whitespace.
data.reasoning_prefill = r_start + r_end;
}
}
if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start_trimmed)) {
auto start_pos = prompt_trimmed.size() - r_start_trimmed.size();
data.reasoning_prefill = data.prompt.substr(start_pos);
}
// Template artifact detection: start marker in prompt but not at end.
if (data.reasoning_prefill.empty()) {
auto suffix_len = std::min(data.prompt.size(), (size_t) 500);
auto suffix = data.prompt.substr(data.prompt.size() - suffix_len);
if (suffix.find(r_start_trimmed) != std::string::npos) {
if (string_ends_with(fs_trimmed, r_end_t)) {
data.prefill = r_start + r_end;
} else if (string_ends_with(fs_trimmed, r_start_t)) {
data.prefill = from_start;
} else {
clear_reasoning_start = true;
}
}

View File

@ -1712,11 +1712,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
LOG_DBG("No parser definition detected, assuming pure content parser.");
}
// Prepend reasoning prefill (e.g. <think> or <think></think> from template prompt)
// so the parser can detect reasoning markers that were part of the template output.
const std::string effective_input = params.reasoning_prefill.empty()
const std::string effective_input = params.prefill.empty()
? input
: params.reasoning_prefill + input;
: params.prefill + input;
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());

View File

@ -211,7 +211,7 @@ struct common_chat_params {
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
std::string reasoning_prefill;
std::string prefill;
bool supports_thinking = false;
std::string thinking_start_tag; // e.g., "<think>"
std::string thinking_end_tag; // e.g., "</think>"
@ -228,14 +228,14 @@ struct common_chat_parser_params {
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
std::string reasoning_prefill;
std::string prefill;
bool parse_tool_calls = true;
bool debug = false; // Enable debug output for PEG parser
common_peg_arena parser = {};
common_chat_parser_params() = default;
common_chat_parser_params(const common_chat_params & chat_params) {
format = chat_params.format;
reasoning_prefill = chat_params.reasoning_prefill;
prefill = chat_params.prefill;
}
};

View File

@ -50,11 +50,11 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h
| `TAG_BASED` | Tag-based: `<think>...</think>` (start can be empty for delimiter-style formats) |
| `TOOLS_ONLY` | Reasoning only appears in tool call responses, not plain content |
**Reasoning Prefill**: Extracted in `generate_parser()` by inspecting the rendered prompt suffix. Three cases:
**Reasoning Prefill**: Extracted in `generate_parser()` using `compare_variants(add_generation_prompt=false, add_generation_prompt=true)` on a minimal single-user-message input to get exactly what the template appends as its generation prompt. The start marker is then located within this suffix using `rfind`. Three outcomes:
1. **Start+end at prompt end** (e.g. `<think></think>`): prefill = canonical `start + end` markers (preserving the analyzer's whitespace, e.g. trailing `\n`). The parser sees reasoning as opened and immediately closed.
2. **Just start at prompt end** (e.g. `<think>\n`): prefill = extracted from the prompt to preserve trailing whitespace. The parser sees reasoning as already opened.
3. **Start marker in prompt suffix but not at end** (e.g. Apriel's `<|begin_assistant|>` followed by template boilerplate): the start marker is a template artifact falsely detected by the diff analyzer. It is cleared from the parser so reasoning uses delimiter-style (empty start). The distinction from case 2 vs a genuinely model-generated start marker (e.g. `<think>` for Granite) is whether the marker appears in the prompt suffix at all.
1. **Start+end in generation prompt** (e.g. `<think></think>\n`): `prefill = start + end`. The parser sees reasoning as opened and immediately closed.
2. **Only start in generation prompt** (e.g. `<think>\n`): `prefill = from_start` (substring from the marker's position to the end, preserving whitespace). The parser sees reasoning as already open.
3. **Start marker in the generation prompt but not at its end** (e.g. Apriel's `<|begin_assistant|>` followed by boilerplate): the marker is a template artifact. The start literal is cleared from the parser so reasoning uses delimiter-style (end-only). For templates that ignore `add_generation_prompt` (empty diff), the rendered `data.prompt` is used as a fallback — but only for non-TOOLS_ONLY modes, since in TOOLS_ONLY the start tag is model-generated and may appear in prior conversation turns.
The prefill is prepended to model output before PEG parsing, fed to the grammar sampler via `llama_sampler_accept`, and used to determine the reasoning budget sampler's initial state (COUNTING if prefill starts with the reasoning start tokens, IDLE otherwise).
@ -523,7 +523,7 @@ To support a new template format:
## Edge Cases and Quirks
1. **Reasoning Prefill**: See the `reasoning_mode` enum section above for the full description. Key detail: template artifact detection (case 3) checks the last 500 characters of the rendered prompt for the start marker. If found but not at the very end, the start marker is cleared from the parser.
1. **Reasoning Prefill**: See the `reasoning_mode` enum section above for the full description. Key detail: the generation prompt suffix is extracted via `compare_variants(add_generation_prompt=false, add_generation_prompt=true)` to avoid false positives from prior conversation turns.
2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker.
3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built.
4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.

View File

@ -145,7 +145,7 @@ static void test_example_native(testing & t) {
common_reasoning_format reasoning_format;
json json_schema;
bool parallel_tool_calls;
std::string reasoning_prefill;
std::string prefill;
std::string input;
// Expect
@ -157,7 +157,7 @@ static void test_example_native(testing & t) {
auto build_parser = [](const test_case & tc) {
return build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE);
// Always use optional TAG_BASED pattern; reasoning_prefill is prepended to input
// Always use optional TAG_BASED pattern; prefill is prepended to input
auto reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
// tool calling parser
@ -190,7 +190,7 @@ static void test_example_native(testing & t) {
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .reasoning_prefill = */ "",
/* .prefill = */ "",
/* .input = */ ("<think>The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "The user said hello, I must say hello back",
/* .expect_content = */ "Hello",
@ -203,7 +203,7 @@ static void test_example_native(testing & t) {
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .reasoning_prefill = */ "",
/* .prefill = */ "",
/* .input = */ ("Hello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "Hello",
@ -216,7 +216,7 @@ static void test_example_native(testing & t) {
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .reasoning_prefill = */ "",
/* .prefill = */ "",
/* .input = */ ("<think>The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "<think>The user said hello, I must say hello back</think>\nHello",
@ -229,7 +229,7 @@ static void test_example_native(testing & t) {
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .reasoning_prefill = */ "<think>",
/* .prefill = */ "<think>",
/* .input = */ ("The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "The user said hello, I must say hello back",
/* .expect_content = */ "Hello",
@ -242,7 +242,7 @@ static void test_example_native(testing & t) {
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .reasoning_prefill = */ "",
/* .prefill = */ "",
/* .input = */ ("The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "The user said hello, I must say hello back</think>\nHello",
@ -255,7 +255,7 @@ static void test_example_native(testing & t) {
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .reasoning_prefill = */ "<think></think>",
/* .prefill = */ "<think></think>",
/* .input = */ ("Hello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "Hello",
@ -268,7 +268,7 @@ static void test_example_native(testing & t) {
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .reasoning_prefill = */ "<think>",
/* .prefill = */ "<think>",
/* .input = */
("I must get the weather in New York</think>\n"
"<tool_call>["
@ -290,7 +290,7 @@ static void test_example_native(testing & t) {
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ true,
/* .reasoning_prefill = */ "<think>",
/* .prefill = */ "<think>",
/* .input = */
("I must get the weather in New York and San Francisco and a 3 day forecast of each.</think>\nLet me "
"search that for you."
@ -340,7 +340,7 @@ static void test_example_native(testing & t) {
{ "due_date", { { "type", "string" } } } } },
{ "required", { "invoice_number", "amount", "due_date" } } },
/* .parallel_tool_calls = */ false,
/* .reasoning_prefill = */ "<think>",
/* .prefill = */ "<think>",
/* .input = */
("I must produce the invoice in the requested format</think>\n"
R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"),
@ -368,7 +368,7 @@ static void test_example_native(testing & t) {
t.log(line);
}
std::string effective_input = tc.reasoning_prefill + tc.input;
std::string effective_input = tc.prefill + tc.input;
common_peg_parse_context ctx(effective_input);
auto result = parser.parse(ctx);

View File

@ -1001,8 +1001,8 @@ static void test_peg_parser(common_chat_templates * tmpls,
// already placed the opening tag in the prompt.
// For lazy grammars, the grammar only activates from the trigger position, so the
// reasoning prefill is irrelevant — reasoning is handled by the PEG parser.
if (!parser.params_.reasoning_prefill.empty() && earliest_trigger_pos == std::string::npos) {
constrained = parser.params_.reasoning_prefill + constrained;
if (!parser.params_.prefill.empty() && earliest_trigger_pos == std::string::npos) {
constrained = parser.params_.prefill + constrained;
}
// Test the constrained portion against the grammar

View File

@ -105,7 +105,7 @@ struct cli_context {
llama_get_model(ctx_server.get_llama_context()));
task.params.sampling.reasoning_budget_tokens = reasoning_budget;
task.params.sampling.grammar_prefill = chat_params.reasoning_prefill;
task.params.sampling.grammar_prefill = chat_params.prefill;
if (!chat_params.thinking_start_tag.empty()) {
task.params.sampling.reasoning_budget_start =

View File

@ -907,7 +907,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
"chat_format": "GPT-OSS",
"reasoning_format": "none",
"reasoning_in_content": false,
"reasoning_prefill": "",
"prefill": "",
"samplers": [
"penalties",
"dry",
@ -972,7 +972,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
"chat_format": "GPT-OSS",
"reasoning_format": "none",
"reasoning_in_content": false,
"reasoning_prefill": "",
"prefill": "",
"samplers": [
"penalties",
"dry",
@ -1193,7 +1193,7 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
`reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
`reasoning_prefill`: The reasoning markers that were prefilled in the prompt by the template. Prepended to model output before parsing to handle dynamic thinking/non-thinking modes.
`prefill`: The reasoning markers that were prefilled in the prompt by the template. Prepended to model output before parsing to handle dynamic thinking/non-thinking modes.
`parse_tool_calls`: Whether to parse the generated tool call.

Binary file not shown.

View File

@ -1093,7 +1093,7 @@ json oaicompat_chat_params_parse(
}
llama_params["grammar_triggers"] = grammar_triggers;
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
llama_params["reasoning_prefill"] = chat_params.reasoning_prefill;
llama_params["prefill"] = chat_params.prefill;
for (const auto & stop : chat_params.additional_stops) {
llama_params["stop"].push_back(stop);
}

View File

@ -72,7 +72,7 @@ json task_params::to_json(bool only_metrics) const {
{"chat_format", common_chat_format_name(chat_parser_params.format)},
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
{"reasoning_prefill", chat_parser_params.reasoning_prefill},
{"prefill", chat_parser_params.prefill},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@ -135,7 +135,7 @@ json task_params::to_json(bool only_metrics) const {
{"chat_format", common_chat_format_name(chat_parser_params.format)},
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
{"reasoning_prefill", chat_parser_params.reasoning_prefill},
{"prefill", chat_parser_params.prefill},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@ -402,8 +402,8 @@ task_params server_task::params_from_json_cmpl(
}
params.chat_parser_params.reasoning_format = reasoning_format;
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.chat_parser_params.reasoning_prefill = json_value(data, "reasoning_prefill", std::string());
params.sampling.grammar_prefill = params.chat_parser_params.reasoning_prefill;
params.chat_parser_params.prefill = json_value(data, "prefill", std::string());
params.sampling.grammar_prefill = params.chat_parser_params.prefill;
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
if (data.contains("chat_parser")) {
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());

View File

@ -51,7 +51,7 @@ describe('ParameterSyncService', () => {
chat_format: '',
reasoning_format: '',
reasoning_in_content: false,
reasoning_prefill: '',
prefill: '',
'speculative.n_max': 0,
'speculative.n_min': 0,
'speculative.p_min': 0.0,
@ -116,7 +116,7 @@ describe('ParameterSyncService', () => {
chat_format: '',
reasoning_format: '',
reasoning_in_content: false,
reasoning_prefill: '',
prefill: '',
'speculative.n_max': 0,
'speculative.n_min': 0,
'speculative.p_min': 0.0,

View File

@ -164,7 +164,7 @@ export interface ApiLlamaCppServerProps {
chat_format: string;
reasoning_format: string;
reasoning_in_content: boolean;
reasoning_prefill: string;
prefill: string;
samplers: string[];
backend_sampling: boolean;
'speculative.n_max': number;
@ -332,7 +332,7 @@ export interface ApiSlotData {
chat_format: string;
reasoning_format: string;
reasoning_in_content: boolean;
reasoning_prefill: string;
prefill: string;
samplers: string[];
backend_sampling: boolean;
'speculative.n_max': number;