Reasoning prefill

2026-03-11 12:28:22 +01:00 · 2026-03-11 12:28:22 +01:00 · 060d4e4cfd
parent 559646472d
commit 060d4e4cfd
15 changed files with 141 additions and 101 deletions
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@ -49,6 +49,42 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
    data.preserved_tokens = autoparser.preserved_tokens;
    data.parser           = parser.save();

+    // Extract reasoning prefill from the end of the rendered prompt.
+    // If the template added reasoning markers (e.g. <think> or <think></think>) at the end,
+    // store them so they can be prepended to model output before parsing.
+    if (inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
+        autoparser.reasoning.mode != reasoning_mode::NONE &&
+        !autoparser.reasoning.end.empty()) {
+        const auto & r_start = autoparser.reasoning.start;
+        const auto & r_end   = autoparser.reasoning.end;
+        // Trim trailing whitespace from the prompt for suffix matching
+        auto prompt_trimmed = data.prompt;
+        while (!prompt_trimmed.empty() &&
+               (prompt_trimmed.back() == ' ' || prompt_trimmed.back() == '\n' ||
+                prompt_trimmed.back() == '\r' || prompt_trimmed.back() == '\t')) {
+            prompt_trimmed.pop_back();
+        }
+        if (!r_start.empty()) {
+            // Check for start+end at end of prompt (e.g. <think></think>)
+            if (string_ends_with(prompt_trimmed, r_end)) {
+                auto before_end = prompt_trimmed.substr(0, prompt_trimmed.size() - r_end.size());
+                while (!before_end.empty() &&
+                       (before_end.back() == ' ' || before_end.back() == '\n' ||
+                        before_end.back() == '\r' || before_end.back() == '\t')) {
+                    before_end.pop_back();
+                }
+                if (string_ends_with(before_end, r_start)) {
+                    // Prompt ends with start + whitespace + end: extract from start to end of trimmed prompt
+                    data.reasoning_prefill = prompt_trimmed.substr(before_end.size() - r_start.size());
+                }
+            }
+            // Check for just start at end of prompt (e.g. <think>)
+            if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start)) {
+                data.reasoning_prefill = r_start;
+            }
+        }
+    }
+
    // Build grammar if tools are present
    bool has_tools =
        autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
@ -96,9 +132,8 @@ common_peg_arena autoparser::build_parser(const templates_params & inputs) const

        parser_build_context ctx(p, inputs);
        bool                 extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-        bool                 enable_thinking   = inputs.enable_thinking;

-        ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE;
+        ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
        ctx.content              = &content;

        // Build reasoning parser
@ -130,24 +165,15 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
        return p.eps();
    }

-    bool thinking_forced_open   = (mode == reasoning_mode::FORCED_OPEN);
-    bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED);
-
-    if (thinking_forced_open || thinking_forced_closed) {
-        // Thinking is forced open OR forced closed with enable_thinking=true
-        // In both cases, expect only the closing tag (opening was in template)
-        // However, since we might have incorrectly detected the open/close pattern,
-        // we admit an optional starting marker
-        return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
-    }
    if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
-        // Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
-        // Both use the same tag-based pattern if markers are available
-        if (!start.empty() && !end.empty()) {
-            return p.optional(start + p.reasoning(p.until(end)) + end);
+        if (!end.empty()) {
+            if (!start.empty()) {
+                // Standard tag-based: optional(<think>reasoning</think>)
+                return p.optional(start + p.reasoning(p.until(end)) + end);
+            }
+            // Delimiter-style (empty start): optional(reasoning[DELIMITER])
+            return p.optional(p.reasoning(p.until(end)) + end);
        }
-    } else if (mode == reasoning_mode::DELIMITER) {
-        return p.optional(p.reasoning(p.until(end)) + end);
    }

    return p.eps();
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@ -77,11 +77,7 @@ struct templates_params {
 // Reasoning handling mode (derived from R1-R3 comparisons)
 enum class reasoning_mode {
    NONE,           // No reasoning markers detected
-    TAG_BASED,      // Standard tag-based: <think>...</think>
-    DELIMITER,      // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter)
-    FORCED_OPEN,    // Template ends with open reasoning tag (empty start, non-empty end)
-    FORCED_CLOSED,  // Template ends with open reasoning tag on enabled thinking but
-                    // with both opened and closed tag for disabled thinking
+    TAG_BASED,      // Tag-based: <think>...</think> (start can be empty for delimiter-style)
    TOOLS_ONLY      // Only reason on tool calls, not on normal content
 };

@ -91,12 +87,6 @@ inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode)
            return os << "NONE";
        case reasoning_mode::TAG_BASED:
            return os << "TAG_BASED";
-        case reasoning_mode::DELIMITER:
-            return os << "DELIMITER";
-        case reasoning_mode::FORCED_OPEN:
-            return os << "FORCED_OPEN";
-        case reasoning_mode::FORCED_CLOSED:
-            return os << "FORCED_CLOSED";
        case reasoning_mode::TOOLS_ONLY:
            return os << "TOOLS_ONLY";
        default:
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@ -32,7 +32,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
          if (tmpl.src.find("content.split('</think>')") != std::string::npos &&
              tmpl.src.find("reasoning_content") == std::string::npos &&
              analysis.reasoning.mode == reasoning_mode::NONE) {
-              analysis.reasoning.mode  = reasoning_mode::FORCED_OPEN;
+              analysis.reasoning.mode  = reasoning_mode::TAG_BASED;
              analysis.reasoning.start = "<think>";
              analysis.reasoning.end   = "</think>";
              analysis.preserved_tokens.push_back("<think>");
@ -295,15 +295,11 @@ void analyze_reasoning::compare_reasoning_presence() {
        }
        if (result.result.success()) {
            if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
-                if (parser_wrapped.parse_anywhere_and_extract(diff.right).result.success()) { // both tags in the diff = no forced close
-                    mode = reasoning_mode::TAG_BASED;
-                } else {
-                    mode = reasoning_mode::FORCED_CLOSED;
-                }
+                mode = reasoning_mode::TAG_BASED;
                start = trim_whitespace(result.tags["pre"]);
                end   = result.tags["post"];
            } else if (!result.tags["post"].empty()) {
-                mode = reasoning_mode::DELIMITER;
+                mode = reasoning_mode::TAG_BASED;
                end = result.tags["post"];
            }
        }
@ -338,17 +334,17 @@ void analyze_reasoning::compare_thinking_enabled() {
        if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
            if (start.empty()) {
                start = right_trimmed;
-                mode  = reasoning_mode::FORCED_OPEN;
+                mode  = reasoning_mode::TAG_BASED;
            }
        }
    }

-    if (start.empty() && !end.empty()) {
-        mode = reasoning_mode::DELIMITER;
+    if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) {
+        mode = reasoning_mode::TAG_BASED;
    }

-    // Check for FORCED_CLOSED: when enable_thinking=false produces both start and end markers,
-    // but enable_thinking=true produces only the start marker
+    // Check for start+end pattern: when enable_thinking=false produces both start and end markers,
+    // but enable_thinking=true produces only the start marker. Both cases are TAG_BASED.
    if (!comparison->output_A.empty() && !comparison->output_B.empty()) {
        auto parser_start = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
            return p.literal(start) + p.space() + p.literal(end) + p.rest();
@ -358,12 +354,12 @@ void analyze_reasoning::compare_thinking_enabled() {
        });
        if (!start.empty() && parser_start_end.parse_anywhere_and_extract(comparison->output_A).result.success() &&
            parser_start.parse_anywhere_and_extract(comparison->output_B).result.success()) {
-            mode = reasoning_mode::FORCED_CLOSED;
+            mode = reasoning_mode::TAG_BASED;
        } else if (!end.empty()) { // we extract the starting marker now since we didn't get it earlier
            auto result = parser_start_end.parse_anywhere_and_extract(comparison->output_A);
            if (result.result.success()) {
                start = result.tags["pre"];
-                mode  = reasoning_mode::FORCED_CLOSED;
+                mode  = reasoning_mode::TAG_BASED;
            }
        }
    }
@ -373,7 +369,7 @@ void analyze_reasoning::compare_thinking_enabled() {
            auto seg_A = segmentize_markers(trim_trailing_whitespace(diff.left));
            auto seg_B = segmentize_markers(trim_trailing_whitespace(diff.right));
            if (seg_A.size() == 1 && seg_B.size() == 1) {
-                mode = reasoning_mode::FORCED_CLOSED;
+                mode = reasoning_mode::TAG_BASED;
                start = seg_B[0].value;
                end = seg_A[0].value;
            }
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@ -229,6 +229,20 @@ void common_chat_peg_mapper::from_ast(const common_peg_ast_arena &    arena,
        result.tool_calls.push_back(pending_tool_call.value());
        pending_tool_call.reset();
    }
+
+    // Discard whitespace-only reasoning content (e.g. from <think></think> prefill)
+    if (!result.reasoning_content.empty()) {
+        bool all_whitespace = true;
+        for (char c : result.reasoning_content) {
+            if (c != ' ' && c != '\n' && c != '\r' && c != '\t') {
+                all_whitespace = false;
+                break;
+            }
+        }
+        if (all_whitespace) {
+            result.reasoning_content.clear();
+        }
+    }
 }

 void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -1718,14 +1718,20 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
        LOG_DBG("No parser definition detected, assuming pure content parser.");
    }

-    LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
+    // Prepend reasoning prefill (e.g. <think> or <think></think> from template prompt)
+    // so the parser can detect reasoning markers that were part of the template output.
+    const std::string effective_input = params.reasoning_prefill.empty()
+        ? input
+        : params.reasoning_prefill + input;
+
+    LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());

    common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
    if (params.debug) {
        flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
    }

-    common_peg_parse_context ctx(input, flags);
+    common_peg_parse_context ctx(effective_input, flags);
    auto result = parser.parse(ctx);

    if (result.fail()) {
--- a/common/chat.h
+++ b/common/chat.h
@ -211,7 +211,7 @@ struct common_chat_params {
    std::string                         prompt;
    std::string                         grammar;
    bool                                grammar_lazy         = false;
-    bool                                thinking_forced_open = false;
+    std::string                         reasoning_prefill;
    bool                                supports_thinking    = false;
    std::string                         thinking_start_tag;  // e.g., "<think>"
    std::string                         thinking_end_tag;    // e.g., "</think>"
@ -228,14 +228,14 @@ struct common_chat_parser_params {
    common_reasoning_format reasoning_format     = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
    bool                    reasoning_in_content = false;
-    bool                    thinking_forced_open = false;
+    std::string             reasoning_prefill;
    bool                    parse_tool_calls     = true;
    bool                    debug                = false;  // Enable debug output for PEG parser
    common_peg_arena        parser               = {};
    common_chat_parser_params() = default;
    common_chat_parser_params(const common_chat_params & chat_params) {
        format               = chat_params.format;
-        thinking_forced_open = chat_params.thinking_forced_open;
+        reasoning_prefill    = chat_params.reasoning_prefill;
    }
 };

--- a/docs/autoparser.md
+++ b/docs/autoparser.md
@ -47,12 +47,11 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h
 | Value           | Description                                                                       |
 |-----------------|-----------------------------------------------------------------------------------|
 | `NONE`          | No reasoning markers detected                                                     |
-| `TAG_BASED`     | Standard tag-based: `<think>...</think>`                                          |
-| `DELIMITER`     | Delimiter-based: reasoning ends at a delimiter (e.g., `[BEGIN FINAL RESPONSE]`)   |
-| `FORCED_OPEN`   | Template ends with open reasoning tag when `enable_thinking=true`                 |
-| `FORCED_CLOSED` | `enable_thinking=false` emits both tags; `enable_thinking=true` emits only start  |
+| `TAG_BASED`     | Tag-based: `<think>...</think>` (start can be empty for delimiter-style formats)  |
 | `TOOLS_ONLY`    | Reasoning only appears in tool call responses, not plain content                  |

+**Reasoning Prefill**: When a template adds reasoning markers (e.g., `<think>` or `<think></think>`) at the end of the prompt, these are extracted as `reasoning_prefill` and prepended to the model output before parsing. This allows the parser to always use an optional TAG_BASED pattern while correctly handling templates that force thinking mode open or closed. Whitespace-only reasoning content (from `<think></think>` prefill) is automatically discarded.
+
 **`content_mode`**: How the template wraps assistant content.

 | Value                    | Description                                                    |
@ -263,14 +262,15 @@ Text is segmentized into markers and non-marker fragments using `segmentize_mark
 - Uses PEG parsers to find surrounding markers:
  - If both pre/post markers found in `diff.right` → `TAG_BASED` (both tags visible in diff = no forced close)
  - If both found but post marker only in the full output B → `FORCED_CLOSED`
-  - If only post marker found → `DELIMITER`
+  - If only post marker found → `TAG_BASED` (delimiter-style, empty start)
 - Sets `reasoning.start` and `reasoning.end`

 **R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt.

- Detects `FORCED_OPEN`: `enable_thinking=true` adds a non-empty marker at the end of the prompt (where model will start generating) — sets `reasoning.start`, mode = `FORCED_OPEN`
- Detects `FORCED_CLOSED`: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker
+- Detects template-added reasoning markers: `enable_thinking=true` adds a non-empty marker at the end of the prompt — sets `reasoning.start`, mode = `TAG_BASED`
+- Detects start+end pattern: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker — both classified as `TAG_BASED`
 - Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers
+- The reasoning prefill (markers added by the template) is later extracted in `generate_parser()` and prepended to model output before parsing

 **R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls.

@ -358,9 +358,10 @@ Each analyzer struct (`analyze_reasoning`, `analyze_content`, `analyze_tools`) i
 | Mode                              | Parser                                                              |
 |-----------------------------------|---------------------------------------------------------------------|
 | Not extracting reasoning          | `eps()`                                                             |
-| `FORCED_OPEN` or `FORCED_CLOSED`  | `reasoning(until(end)) + end` — opening tag was in the prompt       |
-| `TAG_BASED` or `TOOLS_ONLY`       | `optional(start + reasoning(until(end)) + end)`                     |
-| `DELIMITER`                       | `optional(reasoning(until(end)) + end)` — no start marker           |
+| `TAG_BASED` or `TOOLS_ONLY` (non-empty start) | `optional(start + reasoning(until(end)) + end)`          |
+| `TAG_BASED` or `TOOLS_ONLY` (empty start)     | `optional(reasoning(until(end)) + end)` — delimiter-style|
+
+Note: Templates that add reasoning markers to the prompt (e.g., `<think>`) have these extracted as `reasoning_prefill` and prepended to model output before parsing. The parser always uses the optional TAG_BASED pattern.

 #### Content Parser (`analyze_content::build_parser`)

@ -516,7 +517,7 @@ To support a new template format:

 ## Edge Cases and Quirks

-1. **Forced Thinking**: When `enable_thinking=true` and the model prompt ends with an open reasoning tag (e.g., `<think>`), the parser enters forced thinking mode and immediately expects reasoning content without waiting for a start marker.
+1. **Reasoning Prefill**: When `enable_thinking=true` and the model prompt ends with reasoning markers (e.g., `<think>` or `<think></think>`), these are extracted as `reasoning_prefill` and prepended to model output before parsing. The parser always uses optional TAG_BASED reasoning, so it handles both thinking and non-thinking outputs dynamically. Whitespace-only reasoning content (from closed prefill like `<think></think>`) is discarded.
 2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker.
 3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built.
 4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.
--- a/tests/test-chat-auto-parser.cpp
+++ b/tests/test-chat-auto-parser.cpp
@ -1295,8 +1295,8 @@ static void test_nemotron_reasoning_detection(testing & t) {
    t.assert_equal("reasoning_end should be '</think>\\n'", "</think>\n", analysis.reasoning.end);

    // Check reasoning mode detection
-    // Nemotron uses forced closed reasoning with add_generation_prompt
-    t.assert_equal("reasoning should be FORCED_CLOSED", reasoning_mode::FORCED_CLOSED, analysis.reasoning.mode);
+    // Nemotron uses tag-based reasoning (formerly FORCED_CLOSED; prefill handles the template's forced markers)
+    t.assert_equal("reasoning should be TAG_BASED", reasoning_mode::TAG_BASED, analysis.reasoning.mode);

    // Make sure reasoning markers don't spill over to content markers
    t.assert_equal("content start should be empty", "", analysis.content.start);
--- a/tests/test-chat-peg-parser.cpp
+++ b/tests/test-chat-peg-parser.cpp
@ -145,7 +145,7 @@ static void test_example_native(testing & t) {
        common_reasoning_format reasoning_format;
        json                    json_schema;
        bool                    parallel_tool_calls;
-        bool                    thinking_forced_open;
+        std::string             reasoning_prefill;
        std::string             input;

        // Expect
@ -157,14 +157,8 @@ static void test_example_native(testing & t) {
    auto build_parser = [](const test_case & tc) {
        return build_chat_peg_parser([&](common_chat_peg_builder & p) {
            auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE);
-            auto reasoning            = p.eps();
-            if (tc.thinking_forced_open) {
-                // If thinking is forced open, expect a closing tag
-                reasoning = p.reasoning(p.until("</think>")) + "</think>" + p.space();
-            } else {
-                // Otherwise, optionally accept thinking wrapped in tags
-                reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
-            }
+            // Always use optional TAG_BASED pattern; reasoning_prefill is prepended to input
+            auto reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());

            // tool calling parser
            if (tc.tools.is_array() && !tc.tools.empty()) {
@ -190,78 +184,91 @@ static void test_example_native(testing & t) {

    std::vector<test_case> test_cases = std::vector<test_case>{
        {
-         /* .name =                 */ "content with thinking_forced_open = false",
+         /* .name =                 */ "content with reasoning (no prefill)",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ false,
+         /* .reasoning_prefill =    */ "",
         /* .input =                */ ("<think>The user said hello, I must say hello back</think>\nHello"),
         /* .expect_reasoning =     */ "The user said hello, I must say hello back",
         /* .expect_content =       */ "Hello",
         /* .expect_tool_calls =    */ {},
         },
        {
-         /* .name =                 */ "content with thinking_forced_open = false and no reasoning",
+         /* .name =                 */ "content without reasoning (no prefill)",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ false,
+         /* .reasoning_prefill =    */ "",
         /* .input =                */ ("Hello"),
         /* .expect_reasoning =     */ "",
         /* .expect_content =       */ "Hello",
         /* .expect_tool_calls =    */ {},
         },
        {
-         /* .name =                 */ "content with thinking_forced_open = false and reasoning_format = none",
+         /* .name =                 */ "content with reasoning_format = none (tags appear in content)",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_NONE,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "",
         /* .input =                */ ("<think>The user said hello, I must say hello back</think>\nHello"),
         /* .expect_reasoning =     */ "",
         /* .expect_content =       */ "<think>The user said hello, I must say hello back</think>\nHello",
         /* .expect_tool_calls =    */ {},
         },
        {
-         /* .name =                 */ "content with thinking_forced_open = true",
+         /* .name =                 */ "content with reasoning prefill",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "<think>",
         /* .input =                */ ("The user said hello, I must say hello back</think>\nHello"),
         /* .expect_reasoning =     */ "The user said hello, I must say hello back",
         /* .expect_content =       */ "Hello",
         /* .expect_tool_calls =    */ {},
         },
        {
-         /* .name =                 */ "content with thinking_forced_open = true and reasoning_format = none",
+         /* .name =                 */ "content with reasoning prefill and reasoning_format = none",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_NONE,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "",
         /* .input =                */ ("The user said hello, I must say hello back</think>\nHello"),
         /* .expect_reasoning =     */ "",
         /* .expect_content =       */ "The user said hello, I must say hello back</think>\nHello",
         /* .expect_tool_calls =    */ {},
         },
        {
-         /* .name =                 */ "tools with tool_choice = auto and no parallel_tool_calls",
+         /* .name =                 */ "content with closed reasoning prefill (empty reasoning discarded)",
+         /* .tools =                */ {},
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ false,
+         /* .reasoning_prefill =    */ "<think></think>",
+         /* .input =                */ ("Hello"),
+         /* .expect_reasoning =     */ "",
+         /* .expect_content =       */ "Hello",
+         /* .expect_tool_calls =    */ {},
+         },
+        {
+         /* .name =                 */ "tools with reasoning prefill",
         /* .tools =                */ create_tools(),
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_AUTO,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "<think>",
         /* .input =                */
            ("I must get the weather in New York</think>\n"
             "<tool_call>["
@ -277,13 +284,13 @@ static void test_example_native(testing & t) {
            } },
         },
        {
-         /* .name =                 */ "tools with tool_choice = auto and parallel_tool_calls",
+         /* .name =                 */ "parallel tools with reasoning prefill",
         /* .tools =                */ create_tools(),
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_AUTO,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
         /* .json_schema =          */ {},
         /* .parallel_tool_calls =  */ true,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "<think>",
         /* .input =                */
            ("I must get the weather in New York and San Francisco and a 3 day forecast of each.</think>\nLet me "
             "search that for you."
@ -321,7 +328,7 @@ static void test_example_native(testing & t) {
              } },
         },
        {
-         /* .name =                 */ "response_format with thinking_forced_open = true",
+         /* .name =                 */ "response_format with reasoning prefill",
         /* .tools =                */ {},
         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
@ -333,7 +340,7 @@ static void test_example_native(testing & t) {
                  { "due_date", { { "type", "string" } } } } },
              { "required", { "invoice_number", "amount", "due_date" } } },
         /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "<think>",
         /* .input =                */
            ("I must produce the invoice in the requested format</think>\n"
             R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"),
@ -361,7 +368,8 @@ static void test_example_native(testing & t) {
                t.log(line);
            }

-            common_peg_parse_context ctx(tc.input);
+            std::string              effective_input = tc.reasoning_prefill + tc.input;
+            common_peg_parse_context ctx(effective_input);
            auto                     result = parser.parse(ctx);

            t.assert_true("success", result.success());
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@ -822,8 +822,7 @@ struct make_peg_parser {
    }

    common_chat_msg parse(const std::string & msg, bool is_partial) const {
-        common_chat_parser_params parser_params;
-        parser_params.format = params_.format;
+        common_chat_parser_params parser_params(params_);
        parser_params.debug = detailed_debug_;
        return common_chat_peg_parse(arena_, msg, is_partial, parser_params);
    }
--- a/tools/server/README.md
+++ b/tools/server/README.md
@ -907,7 +907,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
      "chat_format": "GPT-OSS",
      "reasoning_format": "none",
      "reasoning_in_content": false,
-      "thinking_forced_open": false,
+      "reasoning_prefill": "",
      "samplers": [
        "penalties",
        "dry",
@ -972,7 +972,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
      "chat_format": "GPT-OSS",
      "reasoning_format": "none",
      "reasoning_in_content": false,
-      "thinking_forced_open": false,
+      "reasoning_prefill": "",
      "samplers": [
        "penalties",
        "dry",
@ -1193,7 +1193,7 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":

 `reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.

-`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
+`reasoning_prefill`: The reasoning markers that were prefilled in the prompt by the template. Prepended to model output before parsing to handle dynamic thinking/non-thinking modes.

 `parse_tool_calls`: Whether to parse the generated tool call.

--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@ -1093,7 +1093,7 @@ json oaicompat_chat_params_parse(
    }
    llama_params["grammar_triggers"] = grammar_triggers;
    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
-    llama_params["thinking_forced_open"]     = chat_params.thinking_forced_open;
+    llama_params["reasoning_prefill"]        = chat_params.reasoning_prefill;
    for (const auto & stop : chat_params.additional_stops) {
        llama_params["stop"].push_back(stop);
    }
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@ -72,7 +72,7 @@ json task_params::to_json(bool only_metrics) const {
            {"chat_format",               common_chat_format_name(chat_parser_params.format)},
            {"reasoning_format",          common_reasoning_format_name(chat_parser_params.reasoning_format)},
            {"reasoning_in_content",      chat_parser_params.reasoning_in_content},
-            {"thinking_forced_open",      chat_parser_params.thinking_forced_open},
+            {"reasoning_prefill",         chat_parser_params.reasoning_prefill},
            {"samplers",                  samplers},
            {"speculative.n_max",         speculative.n_max},
            {"speculative.n_min",         speculative.n_min},
@ -135,7 +135,7 @@ json task_params::to_json(bool only_metrics) const {
        {"chat_format",               common_chat_format_name(chat_parser_params.format)},
        {"reasoning_format",          common_reasoning_format_name(chat_parser_params.reasoning_format)},
        {"reasoning_in_content",      chat_parser_params.reasoning_in_content},
-        {"thinking_forced_open",      chat_parser_params.thinking_forced_open},
+        {"reasoning_prefill",         chat_parser_params.reasoning_prefill},
        {"samplers",                  samplers},
        {"speculative.n_max",         speculative.n_max},
        {"speculative.n_min",         speculative.n_min},
@ -402,7 +402,7 @@ task_params server_task::params_from_json_cmpl(
        }
        params.chat_parser_params.reasoning_format = reasoning_format;
        params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
-        params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
+        params.chat_parser_params.reasoning_prefill = json_value(data, "reasoning_prefill", std::string());
        params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
        if (data.contains("chat_parser")) {
            params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
--- a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
+++ b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
@ -51,7 +51,7 @@ describe('ParameterSyncService', () => {
 				chat_format: '',
 				reasoning_format: '',
 				reasoning_in_content: false,
-				thinking_forced_open: false,
+				reasoning_prefill: '',
 				'speculative.n_max': 0,
 				'speculative.n_min': 0,
 				'speculative.p_min': 0.0,
@ -116,7 +116,7 @@ describe('ParameterSyncService', () => {
 				chat_format: '',
 				reasoning_format: '',
 				reasoning_in_content: false,
-				thinking_forced_open: false,
+				reasoning_prefill: '',
 				'speculative.n_max': 0,
 				'speculative.n_min': 0,
 				'speculative.p_min': 0.0,
--- a/tools/server/webui/src/lib/types/api.d.ts
+++ b/tools/server/webui/src/lib/types/api.d.ts
@ -164,7 +164,7 @@ export interface ApiLlamaCppServerProps {
 			chat_format: string;
 			reasoning_format: string;
 			reasoning_in_content: boolean;
-			thinking_forced_open: boolean;
+			reasoning_prefill: string;
 			samplers: string[];
 			backend_sampling: boolean;
 			'speculative.n_max': number;
@ -332,7 +332,7 @@ export interface ApiSlotData {
 		chat_format: string;
 		reasoning_format: string;
 		reasoning_in_content: boolean;
-		thinking_forced_open: boolean;
+		reasoning_prefill: string;
 		samplers: string[];
 		backend_sampling: boolean;
 		'speculative.n_max': number;