diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp
index f19819494c..6bd4b2d208 100644
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -49,6 +49,42 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
data.preserved_tokens = autoparser.preserved_tokens;
data.parser = parser.save();
+ // Extract reasoning prefill from the end of the rendered prompt.
+ // If the template added reasoning markers (e.g. `<think>` or `<think></think>`) at the end,
+ // store them so they can be prepended to model output before parsing.
+ if (inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
+ autoparser.reasoning.mode != reasoning_mode::NONE &&
+ !autoparser.reasoning.end.empty()) {
+ const auto & r_start = autoparser.reasoning.start;
+ const auto & r_end = autoparser.reasoning.end;
+ // Trim trailing whitespace from the prompt for suffix matching
+ auto prompt_trimmed = data.prompt;
+ while (!prompt_trimmed.empty() &&
+ (prompt_trimmed.back() == ' ' || prompt_trimmed.back() == '\n' ||
+ prompt_trimmed.back() == '\r' || prompt_trimmed.back() == '\t')) {
+ prompt_trimmed.pop_back();
+ }
+ if (!r_start.empty()) {
+ // Check for start+end at end of prompt (e.g. `<think></think>`)
+ if (string_ends_with(prompt_trimmed, r_end)) {
+ auto before_end = prompt_trimmed.substr(0, prompt_trimmed.size() - r_end.size());
+ while (!before_end.empty() &&
+ (before_end.back() == ' ' || before_end.back() == '\n' ||
+ before_end.back() == '\r' || before_end.back() == '\t')) {
+ before_end.pop_back();
+ }
+ if (string_ends_with(before_end, r_start)) {
+ // Prompt ends with start + whitespace + end: extract from start to end of trimmed prompt
+ data.reasoning_prefill = prompt_trimmed.substr(before_end.size() - r_start.size());
+ }
+ }
+ // Check for just start at end of prompt (e.g. `<think>`)
+ if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start)) {
+ data.reasoning_prefill = r_start;
+ }
+ }
+ }
+
// Build grammar if tools are present
bool has_tools =
autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
@@ -96,9 +132,8 @@ common_peg_arena autoparser::build_parser(const templates_params & inputs) const
parser_build_context ctx(p, inputs);
bool extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
- bool enable_thinking = inputs.enable_thinking;
- ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE;
+ ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
ctx.content = &content;
// Build reasoning parser
@@ -130,24 +165,15 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
return p.eps();
}
- bool thinking_forced_open = (mode == reasoning_mode::FORCED_OPEN);
- bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED);
-
- if (thinking_forced_open || thinking_forced_closed) {
- // Thinking is forced open OR forced closed with enable_thinking=true
- // In both cases, expect only the closing tag (opening was in template)
- // However, since we might have incorrectly detected the open/close pattern,
- // we admit an optional starting marker
- return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
- }
if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
- // Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
- // Both use the same tag-based pattern if markers are available
- if (!start.empty() && !end.empty()) {
- return p.optional(start + p.reasoning(p.until(end)) + end);
+ if (!end.empty()) {
+ if (!start.empty()) {
+ // Standard tag-based: optional(reasoning)
+ return p.optional(start + p.reasoning(p.until(end)) + end);
+ }
+ // Delimiter-style (empty start): optional(reasoning[DELIMITER])
+ return p.optional(p.reasoning(p.until(end)) + end);
}
- } else if (mode == reasoning_mode::DELIMITER) {
- return p.optional(p.reasoning(p.until(end)) + end);
}
return p.eps();
diff --git a/common/chat-auto-parser.h b/common/chat-auto-parser.h
index 52c6488f4b..55713f4ef4 100644
--- a/common/chat-auto-parser.h
+++ b/common/chat-auto-parser.h
@@ -77,11 +77,7 @@ struct templates_params {
// Reasoning handling mode (derived from R1-R3 comparisons)
enum class reasoning_mode {
NONE, // No reasoning markers detected
- TAG_BASED, // Standard tag-based: <think>...</think>
- DELIMITER, // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter)
- FORCED_OPEN, // Template ends with open reasoning tag (empty start, non-empty end)
- FORCED_CLOSED, // Template ends with open reasoning tag on enabled thinking but
- // with both opened and closed tag for disabled thinking
+ TAG_BASED, // Tag-based: <think>...</think> (start can be empty for delimiter-style)
TOOLS_ONLY // Only reason on tool calls, not on normal content
};
@@ -91,12 +87,6 @@ inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode)
return os << "NONE";
case reasoning_mode::TAG_BASED:
return os << "TAG_BASED";
- case reasoning_mode::DELIMITER:
- return os << "DELIMITER";
- case reasoning_mode::FORCED_OPEN:
- return os << "FORCED_OPEN";
- case reasoning_mode::FORCED_CLOSED:
- return os << "FORCED_CLOSED";
case reasoning_mode::TOOLS_ONLY:
return os << "TOOLS_ONLY";
default:
diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp
index 05b3b6b6a8..57bc234fca 100644
--- a/common/chat-diff-analyzer.cpp
+++ b/common/chat-diff-analyzer.cpp
@@ -32,7 +32,7 @@ static std::vector')") != std::string::npos &&
tmpl.src.find("reasoning_content") == std::string::npos &&
analysis.reasoning.mode == reasoning_mode::NONE) {
- analysis.reasoning.mode = reasoning_mode::FORCED_OPEN;
+ analysis.reasoning.mode = reasoning_mode::TAG_BASED;
analysis.reasoning.start = "";
analysis.reasoning.end = "";
analysis.preserved_tokens.push_back("");
@@ -295,15 +295,11 @@ void analyze_reasoning::compare_reasoning_presence() {
}
if (result.result.success()) {
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
- if (parser_wrapped.parse_anywhere_and_extract(diff.right).result.success()) { // both tags in the diff = no forced close
- mode = reasoning_mode::TAG_BASED;
- } else {
- mode = reasoning_mode::FORCED_CLOSED;
- }
+ mode = reasoning_mode::TAG_BASED;
start = trim_whitespace(result.tags["pre"]);
end = result.tags["post"];
} else if (!result.tags["post"].empty()) {
- mode = reasoning_mode::DELIMITER;
+ mode = reasoning_mode::TAG_BASED;
end = result.tags["post"];
}
}
@@ -338,17 +334,17 @@ void analyze_reasoning::compare_thinking_enabled() {
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
if (start.empty()) {
start = right_trimmed;
- mode = reasoning_mode::FORCED_OPEN;
+ mode = reasoning_mode::TAG_BASED;
}
}
}
- if (start.empty() && !end.empty()) {
- mode = reasoning_mode::DELIMITER;
+ if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) {
+ mode = reasoning_mode::TAG_BASED;
}
- // Check for FORCED_CLOSED: when enable_thinking=false produces both start and end markers,
- // but enable_thinking=true produces only the start marker
+ // Check for start+end pattern: when enable_thinking=false produces both start and end markers,
+ // but enable_thinking=true produces only the start marker. Both cases are TAG_BASED.
if (!comparison->output_A.empty() && !comparison->output_B.empty()) {
auto parser_start = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.literal(start) + p.space() + p.literal(end) + p.rest();
@@ -358,12 +354,12 @@ void analyze_reasoning::compare_thinking_enabled() {
});
if (!start.empty() && parser_start_end.parse_anywhere_and_extract(comparison->output_A).result.success() &&
parser_start.parse_anywhere_and_extract(comparison->output_B).result.success()) {
- mode = reasoning_mode::FORCED_CLOSED;
+ mode = reasoning_mode::TAG_BASED;
} else if (!end.empty()) { // we extract the starting marker now since we didn't get it earlier
auto result = parser_start_end.parse_anywhere_and_extract(comparison->output_A);
if (result.result.success()) {
start = result.tags["pre"];
- mode = reasoning_mode::FORCED_CLOSED;
+ mode = reasoning_mode::TAG_BASED;
}
}
}
@@ -373,7 +369,7 @@ void analyze_reasoning::compare_thinking_enabled() {
auto seg_A = segmentize_markers(trim_trailing_whitespace(diff.left));
auto seg_B = segmentize_markers(trim_trailing_whitespace(diff.right));
if (seg_A.size() == 1 && seg_B.size() == 1) {
- mode = reasoning_mode::FORCED_CLOSED;
+ mode = reasoning_mode::TAG_BASED;
start = seg_B[0].value;
end = seg_A[0].value;
}
diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp
index 4c5bb6218d..5f7d422b41 100644
--- a/common/chat-peg-parser.cpp
+++ b/common/chat-peg-parser.cpp
@@ -229,6 +229,20 @@ void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena,
result.tool_calls.push_back(pending_tool_call.value());
pending_tool_call.reset();
}
+
+ // Discard whitespace-only reasoning content (e.g. from `<think></think>` prefill)
+ if (!result.reasoning_content.empty()) {
+ bool all_whitespace = true;
+ for (char c : result.reasoning_content) {
+ if (c != ' ' && c != '\n' && c != '\r' && c != '\t') {
+ all_whitespace = false;
+ break;
+ }
+ }
+ if (all_whitespace) {
+ result.reasoning_content.clear();
+ }
+ }
}
void common_chat_peg_mapper::map(const common_peg_ast_node & node) {
diff --git a/common/chat.cpp b/common/chat.cpp
index cfd5df30a7..4f49fcf8a6 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1718,14 +1718,20 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
LOG_DBG("No parser definition detected, assuming pure content parser.");
}
- LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
+ // Prepend reasoning prefill (e.g. `<think>` or `<think></think>` from template prompt)
+ // so the parser can detect reasoning markers that were part of the template output.
+ const std::string effective_input = params.reasoning_prefill.empty()
+ ? input
+ : params.reasoning_prefill + input;
+
+ LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {
flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
}
- common_peg_parse_context ctx(input, flags);
+ common_peg_parse_context ctx(effective_input, flags);
auto result = parser.parse(ctx);
if (result.fail()) {
diff --git a/common/chat.h b/common/chat.h
index 930987cf77..cb83da5fcc 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -211,7 +211,7 @@ struct common_chat_params {
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
- bool thinking_forced_open = false;
+ std::string reasoning_prefill;
bool supports_thinking = false;
std::string thinking_start_tag; // e.g., "<think>"
std::string thinking_end_tag; // e.g., "</think>"
@@ -228,14 +228,14 @@ struct common_chat_parser_params {
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
- bool thinking_forced_open = false;
+ std::string reasoning_prefill;
bool parse_tool_calls = true;
bool debug = false; // Enable debug output for PEG parser
common_peg_arena parser = {};
common_chat_parser_params() = default;
common_chat_parser_params(const common_chat_params & chat_params) {
format = chat_params.format;
- thinking_forced_open = chat_params.thinking_forced_open;
+ reasoning_prefill = chat_params.reasoning_prefill;
}
};
diff --git a/docs/autoparser.md b/docs/autoparser.md
index 686b2c249b..08ffa6a3de 100644
--- a/docs/autoparser.md
+++ b/docs/autoparser.md
@@ -47,12 +47,11 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h
| Value | Description |
|-----------------|-----------------------------------------------------------------------------------|
| `NONE` | No reasoning markers detected |
-| `TAG_BASED` | Standard tag-based: `<think>...</think>` |
-| `DELIMITER` | Delimiter-based: reasoning ends at a delimiter (e.g., `[BEGIN FINAL RESPONSE]`) |
-| `FORCED_OPEN` | Template ends with open reasoning tag when `enable_thinking=true` |
-| `FORCED_CLOSED` | `enable_thinking=false` emits both tags; `enable_thinking=true` emits only start |
+| `TAG_BASED` | Tag-based: `<think>...</think>` (start can be empty for delimiter-style formats) |
| `TOOLS_ONLY` | Reasoning only appears in tool call responses, not plain content |
+**Reasoning Prefill**: When a template adds reasoning markers (e.g., `<think>` or `<think></think>`) at the end of the prompt, these are extracted as `reasoning_prefill` and prepended to the model output before parsing. This allows the parser to always use an optional TAG_BASED pattern while correctly handling templates that force thinking mode open or closed. Whitespace-only reasoning content (from `<think></think>` prefill) is automatically discarded.
+
**`content_mode`**: How the template wraps assistant content.
| Value | Description |
@@ -263,14 +262,15 @@ Text is segmentized into markers and non-marker fragments using `segmentize_mark
- Uses PEG parsers to find surrounding markers:
- If both pre/post markers found in `diff.right` → `TAG_BASED` (both tags visible in diff = no forced close)
- If both found but post marker only in the full output B → `FORCED_CLOSED`
- - If only post marker found → `DELIMITER`
+ - If only post marker found → `TAG_BASED` (delimiter-style, empty start)
- Sets `reasoning.start` and `reasoning.end`
**R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt.
-- Detects `FORCED_OPEN`: `enable_thinking=true` adds a non-empty marker at the end of the prompt (where model will start generating) — sets `reasoning.start`, mode = `FORCED_OPEN`
-- Detects `FORCED_CLOSED`: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker
+- Detects template-added reasoning markers: `enable_thinking=true` adds a non-empty marker at the end of the prompt — sets `reasoning.start`, mode = `TAG_BASED`
+- Detects start+end pattern: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker — both classified as `TAG_BASED`
- Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers
+- The reasoning prefill (markers added by the template) is later extracted in `generate_parser()` and prepended to model output before parsing
**R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls.
@@ -358,9 +358,10 @@ Each analyzer struct (`analyze_reasoning`, `analyze_content`, `analyze_tools`) i
| Mode | Parser |
|-----------------------------------|---------------------------------------------------------------------|
| Not extracting reasoning | `eps()` |
-| `FORCED_OPEN` or `FORCED_CLOSED` | `reasoning(until(end)) + end` — opening tag was in the prompt |
-| `TAG_BASED` or `TOOLS_ONLY` | `optional(start + reasoning(until(end)) + end)` |
-| `DELIMITER` | `optional(reasoning(until(end)) + end)` — no start marker |
+| `TAG_BASED` or `TOOLS_ONLY` (non-empty start) | `optional(start + reasoning(until(end)) + end)` |
+| `TAG_BASED` or `TOOLS_ONLY` (empty start) | `optional(reasoning(until(end)) + end)` — delimiter-style|
+
+Note: Templates that add reasoning markers to the prompt (e.g., `<think>`) have these extracted as `reasoning_prefill` and prepended to model output before parsing. The parser always uses the optional TAG_BASED pattern.
#### Content Parser (`analyze_content::build_parser`)
@@ -516,7 +517,7 @@ To support a new template format:
## Edge Cases and Quirks
-1. **Forced Thinking**: When `enable_thinking=true` and the model prompt ends with an open reasoning tag (e.g., `<think>`), the parser enters forced thinking mode and immediately expects reasoning content without waiting for a start marker.
+1. **Reasoning Prefill**: When `enable_thinking=true` and the model prompt ends with reasoning markers (e.g., `<think>` or `<think></think>`), these are extracted as `reasoning_prefill` and prepended to model output before parsing. The parser always uses optional TAG_BASED reasoning, so it handles both thinking and non-thinking outputs dynamically. Whitespace-only reasoning content (from closed prefill like `<think></think>`) is discarded.
2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker.
3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built.
4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `` or `[marker]` tokens, ensuring clean extraction.
diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp
index eaa57872a8..491522324a 100644
--- a/tests/test-chat-auto-parser.cpp
+++ b/tests/test-chat-auto-parser.cpp
@@ -1295,8 +1295,8 @@ static void test_nemotron_reasoning_detection(testing & t) {
t.assert_equal("reasoning_end should be '\\n'", "\n", analysis.reasoning.end);
// Check reasoning mode detection
- // Nemotron uses forced closed reasoning with add_generation_prompt
- t.assert_equal("reasoning should be FORCED_CLOSED", reasoning_mode::FORCED_CLOSED, analysis.reasoning.mode);
+ // Nemotron uses tag-based reasoning (formerly FORCED_CLOSED; prefill handles the template's forced markers)
+ t.assert_equal("reasoning should be TAG_BASED", reasoning_mode::TAG_BASED, analysis.reasoning.mode);
// Make sure reasoning markers don't spill over to content markers
t.assert_equal("content start should be empty", "", analysis.content.start);
diff --git a/tests/test-chat-peg-parser.cpp b/tests/test-chat-peg-parser.cpp
index dc8724be34..cbde951d53 100644
--- a/tests/test-chat-peg-parser.cpp
+++ b/tests/test-chat-peg-parser.cpp
@@ -145,7 +145,7 @@ static void test_example_native(testing & t) {
common_reasoning_format reasoning_format;
json json_schema;
bool parallel_tool_calls;
- bool thinking_forced_open;
+ std::string reasoning_prefill;
std::string input;
// Expect
@@ -157,14 +157,8 @@ static void test_example_native(testing & t) {
auto build_parser = [](const test_case & tc) {
return build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE);
- auto reasoning = p.eps();
- if (tc.thinking_forced_open) {
- // If thinking is forced open, expect a closing tag
- reasoning = p.reasoning(p.until("")) + "" + p.space();
- } else {
- // Otherwise, optionally accept thinking wrapped in tags
- reasoning = p.optional("" + p.reasoning(p.until("")) + "" + p.space());
- }
+ // Always use optional TAG_BASED pattern; reasoning_prefill is prepended to input
+ auto reasoning = p.optional("" + p.reasoning(p.until("")) + "" + p.space());
// tool calling parser
if (tc.tools.is_array() && !tc.tools.empty()) {
@@ -190,78 +184,91 @@ static void test_example_native(testing & t) {
std::vector test_cases = std::vector{
{
- /* .name = */ "content with thinking_forced_open = false",
+ /* .name = */ "content with reasoning (no prefill)",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ false,
+ /* .reasoning_prefill = */ "",
/* .input = */ ("The user said hello, I must say hello back\nHello"),
/* .expect_reasoning = */ "The user said hello, I must say hello back",
/* .expect_content = */ "Hello",
/* .expect_tool_calls = */ {},
},
{
- /* .name = */ "content with thinking_forced_open = false and no reasoning",
+ /* .name = */ "content without reasoning (no prefill)",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ false,
+ /* .reasoning_prefill = */ "",
/* .input = */ ("Hello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "Hello",
/* .expect_tool_calls = */ {},
},
{
- /* .name = */ "content with thinking_forced_open = false and reasoning_format = none",
+ /* .name = */ "content with reasoning_format = none (tags appear in content)",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ true,
+ /* .reasoning_prefill = */ "",
/* .input = */ ("The user said hello, I must say hello back\nHello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "The user said hello, I must say hello back\nHello",
/* .expect_tool_calls = */ {},
},
{
- /* .name = */ "content with thinking_forced_open = true",
+ /* .name = */ "content with reasoning prefill",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ true,
+ /* .reasoning_prefill = */ "",
/* .input = */ ("The user said hello, I must say hello back\nHello"),
/* .expect_reasoning = */ "The user said hello, I must say hello back",
/* .expect_content = */ "Hello",
/* .expect_tool_calls = */ {},
},
{
- /* .name = */ "content with thinking_forced_open = true and reasoning_format = none",
+ /* .name = */ "content with reasoning prefill and reasoning_format = none",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ true,
+ /* .reasoning_prefill = */ "",
/* .input = */ ("The user said hello, I must say hello back\nHello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "The user said hello, I must say hello back\nHello",
/* .expect_tool_calls = */ {},
},
{
- /* .name = */ "tools with tool_choice = auto and no parallel_tool_calls",
+ /* .name = */ "content with closed reasoning prefill (empty reasoning discarded)",
+ /* .tools = */ {},
+ /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
+ /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
+ /* .json_schema = */ {},
+ /* .parallel_tool_calls = */ false,
+ /* .reasoning_prefill = */ "",
+ /* .input = */ ("Hello"),
+ /* .expect_reasoning = */ "",
+ /* .expect_content = */ "Hello",
+ /* .expect_tool_calls = */ {},
+ },
+ {
+ /* .name = */ "tools with reasoning prefill",
/* .tools = */ create_tools(),
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ true,
+ /* .reasoning_prefill = */ "",
/* .input = */
("I must get the weather in New York\n"
"["
@@ -277,13 +284,13 @@ static void test_example_native(testing & t) {
} },
},
{
- /* .name = */ "tools with tool_choice = auto and parallel_tool_calls",
+ /* .name = */ "parallel tools with reasoning prefill",
/* .tools = */ create_tools(),
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ true,
- /* .thinking_forced_open = */ true,
+ /* .reasoning_prefill = */ "",
/* .input = */
("I must get the weather in New York and San Francisco and a 3 day forecast of each.\nLet me "
"search that for you."
@@ -321,7 +328,7 @@ static void test_example_native(testing & t) {
} },
},
{
- /* .name = */ "response_format with thinking_forced_open = true",
+ /* .name = */ "response_format with reasoning prefill",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
@@ -333,7 +340,7 @@ static void test_example_native(testing & t) {
{ "due_date", { { "type", "string" } } } } },
{ "required", { "invoice_number", "amount", "due_date" } } },
/* .parallel_tool_calls = */ false,
- /* .thinking_forced_open = */ true,
+ /* .reasoning_prefill = */ "",
/* .input = */
("I must produce the invoice in the requested format\n"
R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"),
@@ -361,7 +368,8 @@ static void test_example_native(testing & t) {
t.log(line);
}
- common_peg_parse_context ctx(tc.input);
+ std::string effective_input = tc.reasoning_prefill + tc.input;
+ common_peg_parse_context ctx(effective_input);
auto result = parser.parse(ctx);
t.assert_true("success", result.success());
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index 3a6297e148..8e2117c4e7 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -822,8 +822,7 @@ struct make_peg_parser {
}
common_chat_msg parse(const std::string & msg, bool is_partial) const {
- common_chat_parser_params parser_params;
- parser_params.format = params_.format;
+ common_chat_parser_params parser_params(params_);
parser_params.debug = detailed_debug_;
return common_chat_peg_parse(arena_, msg, is_partial, parser_params);
}
diff --git a/tools/server/README.md b/tools/server/README.md
index da16ddc756..363f3fa5ea 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -907,7 +907,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
"chat_format": "GPT-OSS",
"reasoning_format": "none",
"reasoning_in_content": false,
- "thinking_forced_open": false,
+ "reasoning_prefill": "",
"samplers": [
"penalties",
"dry",
@@ -972,7 +972,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
"chat_format": "GPT-OSS",
"reasoning_format": "none",
"reasoning_in_content": false,
- "thinking_forced_open": false,
+ "reasoning_prefill": "",
"samplers": [
"penalties",
"dry",
@@ -1193,7 +1193,7 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
`reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
-`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
+`reasoning_prefill`: The reasoning markers that were prefilled in the prompt by the template. Prepended to model output before parsing to handle dynamic thinking/non-thinking modes.
`parse_tool_calls`: Whether to parse the generated tool call.
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index bd203228cc..1b74f50fcd 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1093,7 +1093,7 @@ json oaicompat_chat_params_parse(
}
llama_params["grammar_triggers"] = grammar_triggers;
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
- llama_params["thinking_forced_open"] = chat_params.thinking_forced_open;
+ llama_params["reasoning_prefill"] = chat_params.reasoning_prefill;
for (const auto & stop : chat_params.additional_stops) {
llama_params["stop"].push_back(stop);
}
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index b3d510977b..a47ab5cbb0 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -72,7 +72,7 @@ json task_params::to_json(bool only_metrics) const {
{"chat_format", common_chat_format_name(chat_parser_params.format)},
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
- {"thinking_forced_open", chat_parser_params.thinking_forced_open},
+ {"reasoning_prefill", chat_parser_params.reasoning_prefill},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@@ -135,7 +135,7 @@ json task_params::to_json(bool only_metrics) const {
{"chat_format", common_chat_format_name(chat_parser_params.format)},
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
- {"thinking_forced_open", chat_parser_params.thinking_forced_open},
+ {"reasoning_prefill", chat_parser_params.reasoning_prefill},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@@ -402,7 +402,7 @@ task_params server_task::params_from_json_cmpl(
}
params.chat_parser_params.reasoning_format = reasoning_format;
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
- params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
+ params.chat_parser_params.reasoning_prefill = json_value(data, "reasoning_prefill", std::string());
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
if (data.contains("chat_parser")) {
params.chat_parser_params.parser.load(data.at("chat_parser").get());
diff --git a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
index ce91de7410..63e303959e 100644
--- a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
+++ b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts
@@ -51,7 +51,7 @@ describe('ParameterSyncService', () => {
chat_format: '',
reasoning_format: '',
reasoning_in_content: false,
- thinking_forced_open: false,
+ reasoning_prefill: '',
'speculative.n_max': 0,
'speculative.n_min': 0,
'speculative.p_min': 0.0,
@@ -116,7 +116,7 @@ describe('ParameterSyncService', () => {
chat_format: '',
reasoning_format: '',
reasoning_in_content: false,
- thinking_forced_open: false,
+ reasoning_prefill: '',
'speculative.n_max': 0,
'speculative.n_min': 0,
'speculative.p_min': 0.0,
diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts
index c908258427..ea32079364 100644
--- a/tools/server/webui/src/lib/types/api.d.ts
+++ b/tools/server/webui/src/lib/types/api.d.ts
@@ -164,7 +164,7 @@ export interface ApiLlamaCppServerProps {
chat_format: string;
reasoning_format: string;
reasoning_in_content: boolean;
- thinking_forced_open: boolean;
+ reasoning_prefill: string;
samplers: string[];
backend_sampling: boolean;
'speculative.n_max': number;
@@ -332,7 +332,7 @@ export interface ApiSlotData {
chat_format: string;
reasoning_format: string;
reasoning_in_content: boolean;
- thinking_forced_open: boolean;
+ reasoning_prefill: string;
samplers: string[];
backend_sampling: boolean;
'speculative.n_max': number;