From 060d4e4cfdafc49ec7519da5e5de669fefd432ba Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Wed, 11 Mar 2026 12:28:22 +0100
Subject: [PATCH] Reasoning prefill

---
 common/chat-auto-parser-generator.cpp         | 62 +++++++++++++------
 common/chat-auto-parser.h                     | 12 +---
 common/chat-diff-analyzer.cpp                 | 26 ++++----
 common/chat-peg-parser.cpp                    | 14 +++++
 common/chat.cpp                               | 10 ++-
 common/chat.h                                 |  6 +-
 docs/autoparser.md                            | 23 +++----
 tests/test-chat-auto-parser.cpp               |  4 +-
 tests/test-chat-peg-parser.cpp                | 60 ++++++++++--------
 tests/test-chat.cpp                           |  3 +-
 tools/server/README.md                        |  6 +-
 tools/server/server-common.cpp                |  2 +-
 tools/server/server-task.cpp                  |  6 +-
 .../services/parameter-sync.service.spec.ts   |  4 +-
 tools/server/webui/src/lib/types/api.d.ts     |  4 +-
 15 files changed, 141 insertions(+), 101 deletions(-)
diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp
index f19819494c..6bd4b2d208 100644
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -49,6 +49,42 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
     data.preserved_tokens = autoparser.preserved_tokens;
     data.parser           = parser.save();
 
+    // Extract reasoning prefill from the end of the rendered prompt.
+    // If the template added reasoning markers (e.g. <think> or <think></think>) at the end,
+    // store them so they can be prepended to model output before parsing.
+    if (inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
+        autoparser.reasoning.mode != reasoning_mode::NONE &&
+        !autoparser.reasoning.end.empty()) {
+        const auto & r_start = autoparser.reasoning.start;
+        const auto & r_end   = autoparser.reasoning.end;
+        // Trim trailing whitespace from the prompt for suffix matching
+        auto prompt_trimmed = data.prompt;
+        while (!prompt_trimmed.empty() &&
+               (prompt_trimmed.back() == ' ' || prompt_trimmed.back() == '\n' ||
+                prompt_trimmed.back() == '\r' || prompt_trimmed.back() == '\t')) {
+            prompt_trimmed.pop_back();
+        }
+        if (!r_start.empty()) {
+            // Check for start+end at end of prompt (e.g. <think></think>)
+            if (string_ends_with(prompt_trimmed, r_end)) {
+                auto before_end = prompt_trimmed.substr(0, prompt_trimmed.size() - r_end.size());
+                while (!before_end.empty() &&
+                       (before_end.back() == ' ' || before_end.back() == '\n' ||
+                        before_end.back() == '\r' || before_end.back() == '\t')) {
+                    before_end.pop_back();
+                }
+                if (string_ends_with(before_end, r_start)) {
+                    // Prompt ends with start + whitespace + end: extract from start to end of trimmed prompt
+                    data.reasoning_prefill = prompt_trimmed.substr(before_end.size() - r_start.size());
+                }
+            }
+            // Check for just start at end of prompt (e.g. <think>)
+            if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start)) {
+                data.reasoning_prefill = r_start;
+            }
+        }
+    }
+
     // Build grammar if tools are present
     bool has_tools =
         autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
@@ -96,9 +132,8 @@ common_peg_arena autoparser::build_parser(const templates_params & inputs) const
 
         parser_build_context ctx(p, inputs);
         bool                 extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
-        bool                 enable_thinking   = inputs.enable_thinking;
 
-        ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE;
+        ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
         ctx.content              = &content;
 
         // Build reasoning parser
@@ -130,24 +165,15 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
         return p.eps();
     }
 
-    bool thinking_forced_open   = (mode == reasoning_mode::FORCED_OPEN);
-    bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED);
-
-    if (thinking_forced_open || thinking_forced_closed) {
-        // Thinking is forced open OR forced closed with enable_thinking=true
-        // In both cases, expect only the closing tag (opening was in template)
-        // However, since we might have incorrectly detected the open/close pattern,
-        // we admit an optional starting marker
-        return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
-    }
     if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
-        // Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
-        // Both use the same tag-based pattern if markers are available
-        if (!start.empty() && !end.empty()) {
-            return p.optional(start + p.reasoning(p.until(end)) + end);
+        if (!end.empty()) {
+            if (!start.empty()) {
+                // Standard tag-based: optional(<think>reasoning</think>)
+                return p.optional(start + p.reasoning(p.until(end)) + end);
+            }
+            // Delimiter-style (empty start): optional(reasoning[DELIMITER])
+            return p.optional(p.reasoning(p.until(end)) + end);
         }
-    } else if (mode == reasoning_mode::DELIMITER) {
-        return p.optional(p.reasoning(p.until(end)) + end);
     }
 
     return p.eps();
diff --git a/common/chat-auto-parser.h b/common/chat-auto-parser.h
index 52c6488f4b..55713f4ef4 100644
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@@ -77,11 +77,7 @@ struct templates_params {
 // Reasoning handling mode (derived from R1-R3 comparisons)
 enum class reasoning_mode {
     NONE,           // No reasoning markers detected
-    TAG_BASED,      // Standard tag-based: <think>...</think>
-    DELIMITER,      // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter)
-    FORCED_OPEN,    // Template ends with open reasoning tag (empty start, non-empty end)
-    FORCED_CLOSED,  // Template ends with open reasoning tag on enabled thinking but
-                    // with both opened and closed tag for disabled thinking
+    TAG_BASED,      // Tag-based: <think>...</think> (start can be empty for delimiter-style)
     TOOLS_ONLY      // Only reason on tool calls, not on normal content
 };
 
@@ -91,12 +87,6 @@ inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode)
             return os << "NONE";
         case reasoning_mode::TAG_BASED:
             return os << "TAG_BASED";
-        case reasoning_mode::DELIMITER:
-            return os << "DELIMITER";
-        case reasoning_mode::FORCED_OPEN:
-            return os << "FORCED_OPEN";
-        case reasoning_mode::FORCED_CLOSED:
-            return os << "FORCED_CLOSED";
         case reasoning_mode::TOOLS_ONLY:
             return os << "TOOLS_ONLY";
         default:
diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp
index 05b3b6b6a8..57bc234fca 100644
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -32,7 +32,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
           if (tmpl.src.find("content.split('</think>')") != std::string::npos &&
               tmpl.src.find("reasoning_content") == std::string::npos &&
               analysis.reasoning.mode == reasoning_mode::NONE) {
-              analysis.reasoning.mode  = reasoning_mode::FORCED_OPEN;
+              analysis.reasoning.mode  = reasoning_mode::TAG_BASED;
               analysis.reasoning.start = "<think>";
               analysis.reasoning.end   = "</think>";
               analysis.preserved_tokens.push_back("<think>");
@@ -295,15 +295,11 @@ void analyze_reasoning::compare_reasoning_presence() {
         }
         if (result.result.success()) {
             if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
-                if (parser_wrapped.parse_anywhere_and_extract(diff.right).result.success()) { // both tags in the diff = no forced close
-                    mode = reasoning_mode::TAG_BASED;
-                } else {
-                    mode = reasoning_mode::FORCED_CLOSED;
-                }
+                mode = reasoning_mode::TAG_BASED;
                 start = trim_whitespace(result.tags["pre"]);
                 end   = result.tags["post"];
             } else if (!result.tags["post"].empty()) {
-                mode = reasoning_mode::DELIMITER;
+                mode = reasoning_mode::TAG_BASED;
                 end = result.tags["post"];
             }
         }
@@ -338,17 +334,17 @@ void analyze_reasoning::compare_thinking_enabled() {
         if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
             if (start.empty()) {
                 start = right_trimmed;
-                mode  = reasoning_mode::FORCED_OPEN;
+                mode  = reasoning_mode::TAG_BASED;
             }
         }
     }
 
-    if (start.empty() && !end.empty()) {
-        mode = reasoning_mode::DELIMITER;
+    if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) {
+        mode = reasoning_mode::TAG_BASED;
     }
 
-    // Check for FORCED_CLOSED: when enable_thinking=false produces both start and end markers,
-    // but enable_thinking=true produces only the start marker
+    // Check for start+end pattern: when enable_thinking=false produces both start and end markers,
+    // but enable_thinking=true produces only the start marker. Both cases are TAG_BASED.
     if (!comparison->output_A.empty() && !comparison->output_B.empty()) {
         auto parser_start = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
             return p.literal(start) + p.space() + p.literal(end) + p.rest();
@@ -358,12 +354,12 @@ void analyze_reasoning::compare_thinking_enabled() {
         });
         if (!start.empty() && parser_start_end.parse_anywhere_and_extract(comparison->output_A).result.success() &&
             parser_start.parse_anywhere_and_extract(comparison->output_B).result.success()) {
-            mode = reasoning_mode::FORCED_CLOSED;
+            mode = reasoning_mode::TAG_BASED;
         } else if (!end.empty()) { // we extract the starting marker now since we didn't get it earlier
             auto result = parser_start_end.parse_anywhere_and_extract(comparison->output_A);
             if (result.result.success()) {
                 start = result.tags["pre"];
-                mode  = reasoning_mode::FORCED_CLOSED;
+                mode  = reasoning_mode::TAG_BASED;
             }
         }
     }
@@ -373,7 +369,7 @@ void analyze_reasoning::compare_thinking_enabled() {
             auto seg_A = segmentize_markers(trim_trailing_whitespace(diff.left));
             auto seg_B = segmentize_markers(trim_trailing_whitespace(diff.right));
             if (seg_A.size() == 1 && seg_B.size() == 1) {
-                mode = reasoning_mode::FORCED_CLOSED;
+                mode = reasoning_mode::TAG_BASED;
                 start = seg_B[0].value;
                 end = seg_A[0].value;
             }
diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp
index 4c5bb6218d..5f7d422b41 100644
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -229,6 +229,20 @@ void common_chat_peg_mapper::from_ast(const common_peg_ast_arena &    arena,
         result.tool_calls.push_back(pending_tool_call.value());
         pending_tool_call.reset();
     }
+
+    // Discard whitespace-only reasoning content (e.g. from <think></think> prefill)
+    if (!result.reasoning_content.empty()) {
+        bool all_whitespace = true;
+        for (char c : result.reasoning_content) {
+            if (c != ' ' && c != '\n' && c != '\r' && c != '\t') {
+                all_whitespace = false;
+                break;
+            }
+        }
+        if (all_whitespace) {
+            result.reasoning_content.clear();
+        }
+    }
 }
 
 void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
diff --git a/common/chat.cpp b/common/chat.cpp
index cfd5df30a7..4f49fcf8a6 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1718,14 +1718,20 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
         LOG_DBG("No parser definition detected, assuming pure content parser.");
     }
 
-    LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
+    // Prepend reasoning prefill (e.g. <think> or <think></think> from template prompt)
+    // so the parser can detect reasoning markers that were part of the template output.
+    const std::string effective_input = params.reasoning_prefill.empty()
+        ? input
+        : params.reasoning_prefill + input;
+
+    LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
 
     common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
     if (params.debug) {
         flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
     }
 
-    common_peg_parse_context ctx(input, flags);
+    common_peg_parse_context ctx(effective_input, flags);
     auto result = parser.parse(ctx);
 
     if (result.fail()) {
diff --git a/common/chat.h b/common/chat.h
index 930987cf77..cb83da5fcc 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -211,7 +211,7 @@ struct common_chat_params {
     std::string                         prompt;
     std::string                         grammar;
     bool                                grammar_lazy         = false;
-    bool                                thinking_forced_open = false;
+    std::string                         reasoning_prefill;
     bool                                supports_thinking    = false;
     std::string                         thinking_start_tag;  // e.g., "<think>"
     std::string                         thinking_end_tag;    // e.g., "</think>"
@@ -228,14 +228,14 @@ struct common_chat_parser_params {
     common_reasoning_format reasoning_format     = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
     // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
     bool                    reasoning_in_content = false;
-    bool                    thinking_forced_open = false;
+    std::string             reasoning_prefill;
     bool                    parse_tool_calls     = true;
     bool                    debug                = false;  // Enable debug output for PEG parser
     common_peg_arena        parser               = {};
     common_chat_parser_params() = default;
     common_chat_parser_params(const common_chat_params & chat_params) {
         format               = chat_params.format;
-        thinking_forced_open = chat_params.thinking_forced_open;
+        reasoning_prefill    = chat_params.reasoning_prefill;
     }
 };
 
diff --git a/docs/autoparser.md b/docs/autoparser.md
index 686b2c249b..08ffa6a3de 100644
--- a/docs/autoparser.md
+++ b/docs/autoparser.md
@@ -47,12 +47,11 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h
 | Value           | Description                                                                       |
 |-----------------|-----------------------------------------------------------------------------------|
 | `NONE`          | No reasoning markers detected                                                     |
-| `TAG_BASED`     | Standard tag-based: `<think>...</think>`                                          |
-| `DELIMITER`     | Delimiter-based: reasoning ends at a delimiter (e.g., `[BEGIN FINAL RESPONSE]`)   |
-| `FORCED_OPEN`   | Template ends with open reasoning tag when `enable_thinking=true`                 |
-| `FORCED_CLOSED` | `enable_thinking=false` emits both tags; `enable_thinking=true` emits only start  |
+| `TAG_BASED`     | Tag-based: `<think>...</think>` (start can be empty for delimiter-style formats)  |
 | `TOOLS_ONLY`    | Reasoning only appears in tool call responses, not plain content                  |
 
+**Reasoning Prefill**: When a template adds reasoning markers (e.g., `<think>` or `<think></think>`) at the end of the prompt, these are extracted as `reasoning_prefill` and prepended to the model output before parsing. This allows the parser to always use an optional TAG_BASED pattern while correctly handling templates that force thinking mode open or closed. Whitespace-only reasoning content (from `<think></think>` prefill) is automatically discarded.
+
 **`content_mode`**: How the template wraps assistant content.
 
 | Value                    | Description                                                    |
@@ -263,14 +262,15 @@ Text is segmentized into markers and non-marker fragments using `segmentize_mark
 - Uses PEG parsers to find surrounding markers:
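-  - If both pre/post markers found in `diff.right` → `TAG_BASED` (both tags visible in diff = no forced close)
-  - If both found but post marker only in the full output B → `FORCED_CLOSED`
+  - If both pre/post markers found → `TAG_BASED`
+  - No separate mode is assigned when the post marker appears only in the full output B (formerly `FORCED_CLOSED`)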
-  - If only post marker found → `DELIMITER`
+  - If only post marker found → `TAG_BASED` (delimiter-style, empty start)
 - Sets `reasoning.start` and `reasoning.end`
 
 **R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt.
 
-- Detects `FORCED_OPEN`: `enable_thinking=true` adds a non-empty marker at the end of the prompt (where model will start generating) — sets `reasoning.start`, mode = `FORCED_OPEN`
-- Detects `FORCED_CLOSED`: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker
+- Detects template-added reasoning markers: `enable_thinking=true` adds a non-empty marker at the end of the prompt — sets `reasoning.start`, mode = `TAG_BASED`
+- Detects start+end pattern: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker — both classified as `TAG_BASED`
 - Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers
+- The reasoning prefill (markers added by the template) is later extracted in `generate_parser()` and prepended to model output before parsing
 
 **R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls.
 
@@ -358,9 +358,10 @@ Each analyzer struct (`analyze_reasoning`, `analyze_content`, `analyze_tools`) i
 | Mode                              | Parser                                                              |
 |-----------------------------------|---------------------------------------------------------------------|
 | Not extracting reasoning          | `eps()`                                                             |
-| `FORCED_OPEN` or `FORCED_CLOSED`  | `reasoning(until(end)) + end` — opening tag was in the prompt       |
-| `TAG_BASED` or `TOOLS_ONLY`       | `optional(start + reasoning(until(end)) + end)`                     |
-| `DELIMITER`                       | `optional(reasoning(until(end)) + end)` — no start marker           |
+| `TAG_BASED` or `TOOLS_ONLY` (non-empty start) | `optional(start + reasoning(until(end)) + end)`          |
+| `TAG_BASED` or `TOOLS_ONLY` (empty start)     | `optional(reasoning(until(end)) + end)` — delimiter-style|
+
+Note: Templates that add reasoning markers to the prompt (e.g., `<think>`) have these extracted as `reasoning_prefill` and prepended to model output before parsing. The parser always uses the optional TAG_BASED pattern.
 
 #### Content Parser (`analyze_content::build_parser`)
 
@@ -516,7 +517,7 @@ To support a new template format:
 
 ## Edge Cases and Quirks
 
-1. **Forced Thinking**: When `enable_thinking=true` and the model prompt ends with an open reasoning tag (e.g., `<think>`), the parser enters forced thinking mode and immediately expects reasoning content without waiting for a start marker.
+1. **Reasoning Prefill**: When the rendered prompt ends with reasoning markers — an open `<think>`, or a closed `<think></think>` such as some templates emit for `enable_thinking=false` — and reasoning extraction is enabled (`reasoning_format` is not `none`), the markers are extracted as `reasoning_prefill` and prepended to the model output before parsing. Trailing whitespace in the prompt is ignored when matching. The parser always uses an optional `TAG_BASED` reasoning pattern, so it handles both thinking and non-thinking outputs dynamically. Whitespace-only reasoning content (e.g., from a closed prefill like `<think></think>`) is discarded.
 2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker.
 3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built.
 4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.
diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp
index eaa57872a8..491522324a 100644
--- a/tests/test-chat-auto-parser.cpp
+++ b/tests/test-chat-auto-parser.cpp
@@ -1295,8 +1295,8 @@ static void test_nemotron_reasoning_detection(testing & t) {
     t.assert_equal("reasoning_end should be '</think>\\n'", "</think>\n", analysis.reasoning.end);
 
     // Check reasoning mode detection
-    // Nemotron uses forced closed reasoning with add_generation_prompt
-    t.assert_equal("reasoning should be FORCED_CLOSED", reasoning_mode::FORCED_CLOSED, analysis.reasoning.mode);
+    // Nemotron uses tag-based reasoning (formerly FORCED_CLOSED; prefill handles the template's forced markers)
+    t.assert_equal("reasoning should be TAG_BASED", reasoning_mode::TAG_BASED, analysis.reasoning.mode);
 
     // Make sure reasoning markers don't spill over to content markers
     t.assert_equal("content start should be empty", "", analysis.content.start);
diff --git a/tests/test-chat-peg-parser.cpp b/tests/test-chat-peg-parser.cpp
index dc8724be34..cbde951d53 100644
--- a/tests/test-chat-peg-parser.cpp
+++ b/tests/test-chat-peg-parser.cpp
@@ -145,7 +145,7 @@ static void test_example_native(testing & t) {
         common_reasoning_format reasoning_format;
         json                    json_schema;
         bool                    parallel_tool_calls;
-        bool                    thinking_forced_open;
+        std::string             reasoning_prefill;
         std::string             input;
 
         // Expect
@@ -157,14 +157,8 @@ static void test_example_native(testing & t) {
     auto build_parser = [](const test_case & tc) {
         return build_chat_peg_parser([&](common_chat_peg_builder & p) {
             auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE);
-            auto reasoning            = p.eps();
-            if (tc.thinking_forced_open) {
-                // If thinking is forced open, expect a closing tag
-                reasoning = p.reasoning(p.until("</think>")) + "</think>" + p.space();
-            } else {
-                // Otherwise, optionally accept thinking wrapped in tags
-                reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
-            }
+            // Always use optional TAG_BASED pattern; reasoning_prefill is prepended to input
+            auto reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
 
             // tool calling parser
             if (tc.tools.is_array() && !tc.tools.empty()) {
@@ -190,78 +184,91 @@ static void test_example_native(testing & t) {
 
     std::vector<test_case> test_cases = std::vector<test_case>{
         {
-         /* .name =                 */ "content with thinking_forced_open = false",
+         /* .name =                 */ "content with reasoning (no prefill)",
          /* .tools =                */ {},
          /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
          /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
          /* .json_schema =          */ {},
          /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ false,
+         /* .reasoning_prefill =    */ "",
          /* .input =                */ ("<think>The user said hello, I must say hello back</think>\nHello"),
          /* .expect_reasoning =     */ "The user said hello, I must say hello back",
          /* .expect_content =       */ "Hello",
          /* .expect_tool_calls =    */ {},
          },
         {
-         /* .name =                 */ "content with thinking_forced_open = false and no reasoning",
+         /* .name =                 */ "content without reasoning (no prefill)",
          /* .tools =                */ {},
          /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
          /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
          /* .json_schema =          */ {},
          /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ false,
+         /* .reasoning_prefill =    */ "",
          /* .input =                */ ("Hello"),
          /* .expect_reasoning =     */ "",
          /* .expect_content =       */ "Hello",
          /* .expect_tool_calls =    */ {},
          },
         {
-         /* .name =                 */ "content with thinking_forced_open = false and reasoning_format = none",
+         /* .name =                 */ "content with reasoning_format = none (tags appear in content)",
          /* .tools =                */ {},
          /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
          /* .reasoning_format =     */ COMMON_REASONING_FORMAT_NONE,
          /* .json_schema =          */ {},
          /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "",
          /* .input =                */ ("<think>The user said hello, I must say hello back</think>\nHello"),
          /* .expect_reasoning =     */ "",
          /* .expect_content =       */ "<think>The user said hello, I must say hello back</think>\nHello",
          /* .expect_tool_calls =    */ {},
          },
         {
-         /* .name =                 */ "content with thinking_forced_open = true",
+         /* .name =                 */ "content with reasoning prefill",
          /* .tools =                */ {},
          /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
          /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
          /* .json_schema =          */ {},
          /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "<think>",
          /* .input =                */ ("The user said hello, I must say hello back</think>\nHello"),
          /* .expect_reasoning =     */ "The user said hello, I must say hello back",
          /* .expect_content =       */ "Hello",
          /* .expect_tool_calls =    */ {},
          },
         {
-         /* .name =                 */ "content with thinking_forced_open = true and reasoning_format = none",
+         /* .name =                 */ "content with reasoning prefill and reasoning_format = none",
          /* .tools =                */ {},
          /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
          /* .reasoning_format =     */ COMMON_REASONING_FORMAT_NONE,
          /* .json_schema =          */ {},
          /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "",
          /* .input =                */ ("The user said hello, I must say hello back</think>\nHello"),
          /* .expect_reasoning =     */ "",
          /* .expect_content =       */ "The user said hello, I must say hello back</think>\nHello",
          /* .expect_tool_calls =    */ {},
          },
         {
-         /* .name =                 */ "tools with tool_choice = auto and no parallel_tool_calls",
+         /* .name =                 */ "content with closed reasoning prefill (empty reasoning discarded)",
+         /* .tools =                */ {},
+         /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
+         /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
+         /* .json_schema =          */ {},
+         /* .parallel_tool_calls =  */ false,
+         /* .reasoning_prefill =    */ "<think></think>",
+         /* .input =                */ ("Hello"),
+         /* .expect_reasoning =     */ "",
+         /* .expect_content =       */ "Hello",
+         /* .expect_tool_calls =    */ {},
+         },
+        {
+         /* .name =                 */ "tools with reasoning prefill",
          /* .tools =                */ create_tools(),
          /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_AUTO,
          /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
          /* .json_schema =          */ {},
          /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "<think>",
          /* .input =                */
             ("I must get the weather in New York</think>\n"
              "<tool_call>["
@@ -277,13 +284,13 @@ static void test_example_native(testing & t) {
             } },
          },
         {
-         /* .name =                 */ "tools with tool_choice = auto and parallel_tool_calls",
+         /* .name =                 */ "parallel tools with reasoning prefill",
          /* .tools =                */ create_tools(),
          /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_AUTO,
          /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
          /* .json_schema =          */ {},
          /* .parallel_tool_calls =  */ true,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "<think>",
          /* .input =                */
             ("I must get the weather in New York and San Francisco and a 3 day forecast of each.</think>\nLet me "
              "search that for you."
@@ -321,7 +328,7 @@ static void test_example_native(testing & t) {
               } },
          },
         {
-         /* .name =                 */ "response_format with thinking_forced_open = true",
+         /* .name =                 */ "response_format with reasoning prefill",
          /* .tools =                */ {},
          /* .tool_choice =          */ COMMON_CHAT_TOOL_CHOICE_NONE,
          /* .reasoning_format =     */ COMMON_REASONING_FORMAT_AUTO,
@@ -333,7 +340,7 @@ static void test_example_native(testing & t) {
                   { "due_date", { { "type", "string" } } } } },
               { "required", { "invoice_number", "amount", "due_date" } } },
          /* .parallel_tool_calls =  */ false,
-         /* .thinking_forced_open = */ true,
+         /* .reasoning_prefill =    */ "<think>",
          /* .input =                */
             ("I must produce the invoice in the requested format</think>\n"
              R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"),
@@ -361,7 +368,8 @@ static void test_example_native(testing & t) {
                 t.log(line);
             }
 
-            common_peg_parse_context ctx(tc.input);
+            std::string              effective_input = tc.reasoning_prefill + tc.input;
+            common_peg_parse_context ctx(effective_input);
             auto                     result = parser.parse(ctx);
 
             t.assert_true("success", result.success());
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index 3a6297e148..8e2117c4e7 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -822,8 +822,7 @@ struct make_peg_parser {
     }
 
     common_chat_msg parse(const std::string & msg, bool is_partial) const {
-        common_chat_parser_params parser_params;
-        parser_params.format = params_.format;
+        common_chat_parser_params parser_params(params_);
         parser_params.debug = detailed_debug_;
         return common_chat_peg_parse(arena_, msg, is_partial, parser_params);
     }
diff --git a/tools/server/README.md b/tools/server/README.md
index da16ddc756..363f3fa5ea 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -907,7 +907,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
       "chat_format": "GPT-OSS",
       "reasoning_format": "none",
       "reasoning_in_content": false,
-      "thinking_forced_open": false,
+      "reasoning_prefill": "",
       "samplers": [
         "penalties",
         "dry",
@@ -972,7 +972,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
       "chat_format": "GPT-OSS",
       "reasoning_format": "none",
       "reasoning_in_content": false,
-      "thinking_forced_open": false,
+      "reasoning_prefill": "",
       "samplers": [
         "penalties",
         "dry",
@@ -1193,7 +1193,7 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
 
 `reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
 
-`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
+`reasoning_prefill`: The reasoning markers (e.g. `<think>`) that the chat template prefilled at the end of the prompt. They are prepended to the model output before parsing, so that both thinking and non-thinking modes are handled. Defaults to an empty string.
 
 `parse_tool_calls`: Whether to parse the generated tool call.
 
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index bd203228cc..1b74f50fcd 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1093,7 +1093,7 @@ json oaicompat_chat_params_parse(
     }
     llama_params["grammar_triggers"] = grammar_triggers;
     llama_params["preserved_tokens"] = chat_params.preserved_tokens;
-    llama_params["thinking_forced_open"]     = chat_params.thinking_forced_open;
+    llama_params["reasoning_prefill"]        = chat_params.reasoning_prefill;
     for (const auto & stop : chat_params.additional_stops) {
         llama_params["stop"].push_back(stop);
     }
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index b3d510977b..a47ab5cbb0 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -72,7 +72,7 @@ json task_params::to_json(bool only_metrics) const {
             {"chat_format",               common_chat_format_name(chat_parser_params.format)},
             {"reasoning_format",          common_reasoning_format_name(chat_parser_params.reasoning_format)},
             {"reasoning_in_content",      chat_parser_params.reasoning_in_content},
-            {"thinking_forced_open",      chat_parser_params.thinking_forced_open},
+            {"reasoning_prefill",         chat_parser_params.reasoning_prefill},
             {"samplers",                  samplers},
             {"speculative.n_max",         speculative.n_max},
             {"speculative.n_min",         speculative.n_min},
@@ -135,7 +135,7 @@ json task_params::to_json(bool only_metrics) const {
         {"chat_format",               common_chat_format_name(chat_parser_params.format)},
         {"reasoning_format",          common_reasoning_format_name(chat_parser_params.reasoning_format)},
         {"reasoning_in_content",      chat_parser_params.reasoning_in_content},
-        {"thinking_forced_open",      chat_parser_params.thinking_forced_open},
+        {"reasoning_prefill",         chat_parser_params.reasoning_prefill},
         {"samplers",                  samplers},
         {"speculative.n_max",         speculative.n_max},
         {"speculative.n_min",         speculative.n_min},
@@ -402,7 +402,7 @@ task_params server_task::params_from_json_cmpl(
         }
         params.chat_parser_params.reasoning_format = reasoning_format;
         params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
-        params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
+        params.chat_parser_params.reasoning_prefill = json_value(data, "reasoning_prefill", std::string());
         params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
         if (data.contains("chat_parser")) {
             params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
diff --git a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
index ce91de7410..63e303959e 100644
--- a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
+++ b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
@@ -51,7 +51,7 @@ describe('ParameterSyncService', () => {
 				chat_format: '',
 				reasoning_format: '',
 				reasoning_in_content: false,
-				thinking_forced_open: false,
+				reasoning_prefill: '',
 				'speculative.n_max': 0,
 				'speculative.n_min': 0,
 				'speculative.p_min': 0.0,
@@ -116,7 +116,7 @@ describe('ParameterSyncService', () => {
 				chat_format: '',
 				reasoning_format: '',
 				reasoning_in_content: false,
-				thinking_forced_open: false,
+				reasoning_prefill: '',
 				'speculative.n_max': 0,
 				'speculative.n_min': 0,
 				'speculative.p_min': 0.0,
diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts
index c908258427..ea32079364 100644
--- a/tools/server/webui/src/lib/types/api.d.ts
+++ b/tools/server/webui/src/lib/types/api.d.ts
@@ -164,7 +164,7 @@ export interface ApiLlamaCppServerProps {
 			chat_format: string;
 			reasoning_format: string;
 			reasoning_in_content: boolean;
-			thinking_forced_open: boolean;
+			reasoning_prefill: string;
 			samplers: string[];
 			backend_sampling: boolean;
 			'speculative.n_max': number;
@@ -332,7 +332,7 @@ export interface ApiSlotData {
 		chat_format: string;
 		reasoning_format: string;
 		reasoning_in_content: boolean;
-		thinking_forced_open: boolean;
+		reasoning_prefill: string;
 		samplers: string[];
 		backend_sampling: boolean;
 		'speculative.n_max': number;