Reasoning prefill

This commit is contained in:
Piotr Wilkin 2026-03-11 12:28:22 +01:00
parent 559646472d
commit 060d4e4cfd
15 changed files with 141 additions and 101 deletions

View File

@ -49,6 +49,42 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
data.preserved_tokens = autoparser.preserved_tokens;
data.parser = parser.save();
// Extract reasoning prefill from the end of the rendered prompt.
// If the template added reasoning markers (e.g. <think> or <think></think>) at the end,
// store them so they can be prepended to model output before parsing.
if (inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE &&
autoparser.reasoning.mode != reasoning_mode::NONE &&
!autoparser.reasoning.end.empty()) {
const auto & r_start = autoparser.reasoning.start;
const auto & r_end = autoparser.reasoning.end;
// Trim trailing whitespace from the prompt for suffix matching
auto prompt_trimmed = data.prompt;
while (!prompt_trimmed.empty() &&
(prompt_trimmed.back() == ' ' || prompt_trimmed.back() == '\n' ||
prompt_trimmed.back() == '\r' || prompt_trimmed.back() == '\t')) {
prompt_trimmed.pop_back();
}
if (!r_start.empty()) {
// Check for start+end at end of prompt (e.g. <think></think>)
if (string_ends_with(prompt_trimmed, r_end)) {
auto before_end = prompt_trimmed.substr(0, prompt_trimmed.size() - r_end.size());
while (!before_end.empty() &&
(before_end.back() == ' ' || before_end.back() == '\n' ||
before_end.back() == '\r' || before_end.back() == '\t')) {
before_end.pop_back();
}
if (string_ends_with(before_end, r_start)) {
// Prompt ends with start + whitespace + end: extract from start to end of trimmed prompt
data.reasoning_prefill = prompt_trimmed.substr(before_end.size() - r_start.size());
}
}
// Check for just start at end of prompt (e.g. <think>)
if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start)) {
data.reasoning_prefill = r_start;
}
}
}
// Build grammar if tools are present
bool has_tools =
autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
@ -96,9 +132,8 @@ common_peg_arena autoparser::build_parser(const templates_params & inputs) const
parser_build_context ctx(p, inputs);
bool extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
bool enable_thinking = inputs.enable_thinking;
ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE;
ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE;
ctx.content = &content;
// Build reasoning parser
@ -130,24 +165,15 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
return p.eps();
}
bool thinking_forced_open = (mode == reasoning_mode::FORCED_OPEN);
bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED);
if (thinking_forced_open || thinking_forced_closed) {
// Thinking is forced open OR forced closed with enable_thinking=true
// In both cases, expect only the closing tag (opening was in template)
// However, since we might have incorrectly detected the open/close pattern,
// we admit an optional starting marker
return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end;
}
if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) {
// Standard tag-based reasoning OR tools-only mode (reasoning appears with tools)
// Both use the same tag-based pattern if markers are available
if (!start.empty() && !end.empty()) {
return p.optional(start + p.reasoning(p.until(end)) + end);
if (!end.empty()) {
if (!start.empty()) {
// Standard tag-based: optional(<think>reasoning</think>)
return p.optional(start + p.reasoning(p.until(end)) + end);
}
// Delimiter-style (empty start): optional(reasoning[DELIMITER])
return p.optional(p.reasoning(p.until(end)) + end);
}
} else if (mode == reasoning_mode::DELIMITER) {
return p.optional(p.reasoning(p.until(end)) + end);
}
return p.eps();

View File

@ -77,11 +77,7 @@ struct templates_params {
// Reasoning handling mode (derived from R1-R3 comparisons)
enum class reasoning_mode {
NONE, // No reasoning markers detected
TAG_BASED, // Standard tag-based: <think>...</think>
DELIMITER, // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter)
FORCED_OPEN, // Template ends with open reasoning tag (empty start, non-empty end)
FORCED_CLOSED, // Template ends with open reasoning tag on enabled thinking but
// with both opened and closed tag for disabled thinking
TAG_BASED, // Tag-based: <think>...</think> (start can be empty for delimiter-style)
TOOLS_ONLY // Only reason on tool calls, not on normal content
};
@ -91,12 +87,6 @@ inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode)
return os << "NONE";
case reasoning_mode::TAG_BASED:
return os << "TAG_BASED";
case reasoning_mode::DELIMITER:
return os << "DELIMITER";
case reasoning_mode::FORCED_OPEN:
return os << "FORCED_OPEN";
case reasoning_mode::FORCED_CLOSED:
return os << "FORCED_CLOSED";
case reasoning_mode::TOOLS_ONLY:
return os << "TOOLS_ONLY";
default:

View File

@ -32,7 +32,7 @@ static std::vector<std::function<void(const common_chat_template & tmpl, autopar
if (tmpl.src.find("content.split('</think>')") != std::string::npos &&
tmpl.src.find("reasoning_content") == std::string::npos &&
analysis.reasoning.mode == reasoning_mode::NONE) {
analysis.reasoning.mode = reasoning_mode::FORCED_OPEN;
analysis.reasoning.mode = reasoning_mode::TAG_BASED;
analysis.reasoning.start = "<think>";
analysis.reasoning.end = "</think>";
analysis.preserved_tokens.push_back("<think>");
@ -295,15 +295,11 @@ void analyze_reasoning::compare_reasoning_presence() {
}
if (result.result.success()) {
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
if (parser_wrapped.parse_anywhere_and_extract(diff.right).result.success()) { // both tags in the diff = no forced close
mode = reasoning_mode::TAG_BASED;
} else {
mode = reasoning_mode::FORCED_CLOSED;
}
mode = reasoning_mode::TAG_BASED;
start = trim_whitespace(result.tags["pre"]);
end = result.tags["post"];
} else if (!result.tags["post"].empty()) {
mode = reasoning_mode::DELIMITER;
mode = reasoning_mode::TAG_BASED;
end = result.tags["post"];
}
}
@ -338,17 +334,17 @@ void analyze_reasoning::compare_thinking_enabled() {
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
if (start.empty()) {
start = right_trimmed;
mode = reasoning_mode::FORCED_OPEN;
mode = reasoning_mode::TAG_BASED;
}
}
}
if (start.empty() && !end.empty()) {
mode = reasoning_mode::DELIMITER;
if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) {
mode = reasoning_mode::TAG_BASED;
}
// Check for FORCED_CLOSED: when enable_thinking=false produces both start and end markers,
// but enable_thinking=true produces only the start marker
// Check for start+end pattern: when enable_thinking=false produces both start and end markers,
// but enable_thinking=true produces only the start marker. Both cases are TAG_BASED.
if (!comparison->output_A.empty() && !comparison->output_B.empty()) {
auto parser_start = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
return p.literal(start) + p.space() + p.literal(end) + p.rest();
@ -358,12 +354,12 @@ void analyze_reasoning::compare_thinking_enabled() {
});
if (!start.empty() && parser_start_end.parse_anywhere_and_extract(comparison->output_A).result.success() &&
parser_start.parse_anywhere_and_extract(comparison->output_B).result.success()) {
mode = reasoning_mode::FORCED_CLOSED;
mode = reasoning_mode::TAG_BASED;
} else if (!end.empty()) { // we extract the starting marker now since we didn't get it earlier
auto result = parser_start_end.parse_anywhere_and_extract(comparison->output_A);
if (result.result.success()) {
start = result.tags["pre"];
mode = reasoning_mode::FORCED_CLOSED;
mode = reasoning_mode::TAG_BASED;
}
}
}
@ -373,7 +369,7 @@ void analyze_reasoning::compare_thinking_enabled() {
auto seg_A = segmentize_markers(trim_trailing_whitespace(diff.left));
auto seg_B = segmentize_markers(trim_trailing_whitespace(diff.right));
if (seg_A.size() == 1 && seg_B.size() == 1) {
mode = reasoning_mode::FORCED_CLOSED;
mode = reasoning_mode::TAG_BASED;
start = seg_B[0].value;
end = seg_A[0].value;
}

View File

@ -229,6 +229,20 @@ void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena,
result.tool_calls.push_back(pending_tool_call.value());
pending_tool_call.reset();
}
// Discard whitespace-only reasoning content (e.g. from <think></think> prefill)
if (!result.reasoning_content.empty()) {
bool all_whitespace = true;
for (char c : result.reasoning_content) {
if (c != ' ' && c != '\n' && c != '\r' && c != '\t') {
all_whitespace = false;
break;
}
}
if (all_whitespace) {
result.reasoning_content.clear();
}
}
}
void common_chat_peg_mapper::map(const common_peg_ast_node & node) {

View File

@ -1718,14 +1718,20 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
LOG_DBG("No parser definition detected, assuming pure content parser.");
}
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
// Prepend reasoning prefill (e.g. <think> or <think></think> from template prompt)
// so the parser can detect reasoning markers that were part of the template output.
const std::string effective_input = params.reasoning_prefill.empty()
? input
: params.reasoning_prefill + input;
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str());
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {
flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
}
common_peg_parse_context ctx(input, flags);
common_peg_parse_context ctx(effective_input, flags);
auto result = parser.parse(ctx);
if (result.fail()) {

View File

@ -211,7 +211,7 @@ struct common_chat_params {
std::string prompt;
std::string grammar;
bool grammar_lazy = false;
bool thinking_forced_open = false;
std::string reasoning_prefill;
bool supports_thinking = false;
std::string thinking_start_tag; // e.g., "<think>"
std::string thinking_end_tag; // e.g., "</think>"
@ -228,14 +228,14 @@ struct common_chat_parser_params {
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
// Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
bool reasoning_in_content = false;
bool thinking_forced_open = false;
std::string reasoning_prefill;
bool parse_tool_calls = true;
bool debug = false; // Enable debug output for PEG parser
common_peg_arena parser = {};
common_chat_parser_params() = default;
common_chat_parser_params(const common_chat_params & chat_params) {
format = chat_params.format;
thinking_forced_open = chat_params.thinking_forced_open;
reasoning_prefill = chat_params.reasoning_prefill;
}
};

View File

@ -47,12 +47,11 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h
| Value | Description |
|-----------------|-----------------------------------------------------------------------------------|
| `NONE` | No reasoning markers detected |
| `TAG_BASED` | Standard tag-based: `<think>...</think>` |
| `DELIMITER` | Delimiter-based: reasoning ends at a delimiter (e.g., `[BEGIN FINAL RESPONSE]`) |
| `FORCED_OPEN` | Template ends with open reasoning tag when `enable_thinking=true` |
| `FORCED_CLOSED` | `enable_thinking=false` emits both tags; `enable_thinking=true` emits only start |
| `TAG_BASED` | Tag-based: `<think>...</think>` (start can be empty for delimiter-style formats) |
| `TOOLS_ONLY` | Reasoning only appears in tool call responses, not plain content |
**Reasoning Prefill**: When a template adds reasoning markers (e.g., `<think>` or `<think></think>`) at the end of the prompt, these are extracted as `reasoning_prefill` and prepended to the model output before parsing. This allows the parser to always use an optional TAG_BASED pattern while correctly handling templates that force thinking mode open or closed. Whitespace-only reasoning content (from `<think></think>` prefill) is automatically discarded.
**`content_mode`**: How the template wraps assistant content.
| Value | Description |
@ -263,14 +262,15 @@ Text is segmentized into markers and non-marker fragments using `segmentize_mark
- Uses PEG parsers to find surrounding markers:
- If both pre/post markers found in `diff.right` → `TAG_BASED` (both tags visible in diff = no forced close)
- If both found but post marker only in the full output B → `FORCED_CLOSED`
- If only post marker found → `DELIMITER`
- If only post marker found → `TAG_BASED` (delimiter-style, empty start)
- Sets `reasoning.start` and `reasoning.end`
**R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt.
- Detects `FORCED_OPEN`: `enable_thinking=true` adds a non-empty marker at the end of the prompt (where model will start generating) — sets `reasoning.start`, mode = `FORCED_OPEN`
- Detects `FORCED_CLOSED`: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker
- Detects template-added reasoning markers: `enable_thinking=true` adds a non-empty marker at the end of the prompt — sets `reasoning.start`, mode = `TAG_BASED`
- Detects start+end pattern: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker — both classified as `TAG_BASED`
- Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers
- The reasoning prefill (markers added by the template) is later extracted in `generate_parser()` and prepended to model output before parsing
**R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls.
@ -358,9 +358,10 @@ Each analyzer struct (`analyze_reasoning`, `analyze_content`, `analyze_tools`) i
| Mode | Parser |
|-----------------------------------|---------------------------------------------------------------------|
| Not extracting reasoning | `eps()` |
| `FORCED_OPEN` or `FORCED_CLOSED` | `reasoning(until(end)) + end` — opening tag was in the prompt |
| `TAG_BASED` or `TOOLS_ONLY` | `optional(start + reasoning(until(end)) + end)` |
| `DELIMITER` | `optional(reasoning(until(end)) + end)` — no start marker |
| `TAG_BASED` or `TOOLS_ONLY` (non-empty start) | `optional(start + reasoning(until(end)) + end)` |
| `TAG_BASED` or `TOOLS_ONLY` (empty start) | `optional(reasoning(until(end)) + end)` — delimiter-style|
Note: Templates that add reasoning markers to the prompt (e.g., `<think>`) have these extracted as `reasoning_prefill` and prepended to model output before parsing. The parser always uses the optional TAG_BASED pattern.
#### Content Parser (`analyze_content::build_parser`)
@ -516,7 +517,7 @@ To support a new template format:
## Edge Cases and Quirks
1. **Forced Thinking**: When `enable_thinking=true` and the model prompt ends with an open reasoning tag (e.g., `<think>`), the parser enters forced thinking mode and immediately expects reasoning content without waiting for a start marker.
1. **Reasoning Prefill**: When the rendered prompt ends with reasoning markers — typically an open tag such as `<think>` with `enable_thinking=true`, or a closed pair such as `<think></think>` with `enable_thinking=false` — these are extracted as `reasoning_prefill` and prepended to the model output before parsing. Extraction only happens when `reasoning_format` is not `none` and a reasoning mode with a non-empty end marker was detected. The parser always uses the optional `TAG_BASED` pattern, so it handles both thinking and non-thinking outputs dynamically. Whitespace-only reasoning content (from a closed prefill like `<think></think>`) is discarded.
2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker.
3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built.
4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `<tag>` or `[marker]` tokens, ensuring clean extraction.

View File

@ -1295,8 +1295,8 @@ static void test_nemotron_reasoning_detection(testing & t) {
t.assert_equal("reasoning_end should be '</think>\\n'", "</think>\n", analysis.reasoning.end);
// Check reasoning mode detection
// Nemotron uses forced closed reasoning with add_generation_prompt
t.assert_equal("reasoning should be FORCED_CLOSED", reasoning_mode::FORCED_CLOSED, analysis.reasoning.mode);
// Nemotron uses tag-based reasoning (formerly FORCED_CLOSED; prefill handles the template's forced markers)
t.assert_equal("reasoning should be TAG_BASED", reasoning_mode::TAG_BASED, analysis.reasoning.mode);
// Make sure reasoning markers don't spill over to content markers
t.assert_equal("content start should be empty", "", analysis.content.start);

View File

@ -145,7 +145,7 @@ static void test_example_native(testing & t) {
common_reasoning_format reasoning_format;
json json_schema;
bool parallel_tool_calls;
bool thinking_forced_open;
std::string reasoning_prefill;
std::string input;
// Expect
@ -157,14 +157,8 @@ static void test_example_native(testing & t) {
auto build_parser = [](const test_case & tc) {
return build_chat_peg_parser([&](common_chat_peg_builder & p) {
auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE);
auto reasoning = p.eps();
if (tc.thinking_forced_open) {
// If thinking is forced open, expect a closing tag
reasoning = p.reasoning(p.until("</think>")) + "</think>" + p.space();
} else {
// Otherwise, optionally accept thinking wrapped in tags
reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
}
// Always use optional TAG_BASED pattern; reasoning_prefill is prepended to input
auto reasoning = p.optional("<think>" + p.reasoning(p.until("</think>")) + "</think>" + p.space());
// tool calling parser
if (tc.tools.is_array() && !tc.tools.empty()) {
@ -190,78 +184,91 @@ static void test_example_native(testing & t) {
std::vector<test_case> test_cases = std::vector<test_case>{
{
/* .name = */ "content with thinking_forced_open = false",
/* .name = */ "content with reasoning (no prefill)",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .thinking_forced_open = */ false,
/* .reasoning_prefill = */ "",
/* .input = */ ("<think>The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "The user said hello, I must say hello back",
/* .expect_content = */ "Hello",
/* .expect_tool_calls = */ {},
},
{
/* .name = */ "content with thinking_forced_open = false and no reasoning",
/* .name = */ "content without reasoning (no prefill)",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .thinking_forced_open = */ false,
/* .reasoning_prefill = */ "",
/* .input = */ ("Hello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "Hello",
/* .expect_tool_calls = */ {},
},
{
/* .name = */ "content with thinking_forced_open = false and reasoning_format = none",
/* .name = */ "content with reasoning_format = none (tags appear in content)",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .thinking_forced_open = */ true,
/* .reasoning_prefill = */ "",
/* .input = */ ("<think>The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "<think>The user said hello, I must say hello back</think>\nHello",
/* .expect_tool_calls = */ {},
},
{
/* .name = */ "content with thinking_forced_open = true",
/* .name = */ "content with reasoning prefill",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .thinking_forced_open = */ true,
/* .reasoning_prefill = */ "<think>",
/* .input = */ ("The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "The user said hello, I must say hello back",
/* .expect_content = */ "Hello",
/* .expect_tool_calls = */ {},
},
{
/* .name = */ "content with thinking_forced_open = true and reasoning_format = none",
/* .name = */ "content with reasoning prefill and reasoning_format = none",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .thinking_forced_open = */ true,
/* .reasoning_prefill = */ "",
/* .input = */ ("The user said hello, I must say hello back</think>\nHello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "The user said hello, I must say hello back</think>\nHello",
/* .expect_tool_calls = */ {},
},
{
/* .name = */ "tools with tool_choice = auto and no parallel_tool_calls",
/* .name = */ "content with closed reasoning prefill (empty reasoning discarded)",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .reasoning_prefill = */ "<think></think>",
/* .input = */ ("Hello"),
/* .expect_reasoning = */ "",
/* .expect_content = */ "Hello",
/* .expect_tool_calls = */ {},
},
{
/* .name = */ "tools with reasoning prefill",
/* .tools = */ create_tools(),
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ false,
/* .thinking_forced_open = */ true,
/* .reasoning_prefill = */ "<think>",
/* .input = */
("I must get the weather in New York</think>\n"
"<tool_call>["
@ -277,13 +284,13 @@ static void test_example_native(testing & t) {
} },
},
{
/* .name = */ "tools with tool_choice = auto and parallel_tool_calls",
/* .name = */ "parallel tools with reasoning prefill",
/* .tools = */ create_tools(),
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
/* .json_schema = */ {},
/* .parallel_tool_calls = */ true,
/* .thinking_forced_open = */ true,
/* .reasoning_prefill = */ "<think>",
/* .input = */
("I must get the weather in New York and San Francisco and a 3 day forecast of each.</think>\nLet me "
"search that for you."
@ -321,7 +328,7 @@ static void test_example_native(testing & t) {
} },
},
{
/* .name = */ "response_format with thinking_forced_open = true",
/* .name = */ "response_format with reasoning prefill",
/* .tools = */ {},
/* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE,
/* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
@ -333,7 +340,7 @@ static void test_example_native(testing & t) {
{ "due_date", { { "type", "string" } } } } },
{ "required", { "invoice_number", "amount", "due_date" } } },
/* .parallel_tool_calls = */ false,
/* .thinking_forced_open = */ true,
/* .reasoning_prefill = */ "<think>",
/* .input = */
("I must produce the invoice in the requested format</think>\n"
R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"),
@ -361,7 +368,8 @@ static void test_example_native(testing & t) {
t.log(line);
}
common_peg_parse_context ctx(tc.input);
std::string effective_input = tc.reasoning_prefill + tc.input;
common_peg_parse_context ctx(effective_input);
auto result = parser.parse(ctx);
t.assert_true("success", result.success());

View File

@ -822,8 +822,7 @@ struct make_peg_parser {
}
common_chat_msg parse(const std::string & msg, bool is_partial) const {
common_chat_parser_params parser_params;
parser_params.format = params_.format;
common_chat_parser_params parser_params(params_);
parser_params.debug = detailed_debug_;
return common_chat_peg_parse(arena_, msg, is_partial, parser_params);
}

View File

@ -907,7 +907,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
"chat_format": "GPT-OSS",
"reasoning_format": "none",
"reasoning_in_content": false,
"thinking_forced_open": false,
"reasoning_prefill": "",
"samplers": [
"penalties",
"dry",
@ -972,7 +972,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat
"chat_format": "GPT-OSS",
"reasoning_format": "none",
"reasoning_in_content": false,
"thinking_forced_open": false,
"reasoning_prefill": "",
"samplers": [
"penalties",
"dry",
@ -1193,7 +1193,7 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
`reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
`reasoning_prefill`: The reasoning markers that were prefilled in the prompt by the template. Prepended to model output before parsing to handle dynamic thinking/non-thinking modes.
`parse_tool_calls`: Whether to parse the generated tool call.

View File

@ -1093,7 +1093,7 @@ json oaicompat_chat_params_parse(
}
llama_params["grammar_triggers"] = grammar_triggers;
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
llama_params["thinking_forced_open"] = chat_params.thinking_forced_open;
llama_params["reasoning_prefill"] = chat_params.reasoning_prefill;
for (const auto & stop : chat_params.additional_stops) {
llama_params["stop"].push_back(stop);
}

View File

@ -72,7 +72,7 @@ json task_params::to_json(bool only_metrics) const {
{"chat_format", common_chat_format_name(chat_parser_params.format)},
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
{"thinking_forced_open", chat_parser_params.thinking_forced_open},
{"reasoning_prefill", chat_parser_params.reasoning_prefill},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@ -135,7 +135,7 @@ json task_params::to_json(bool only_metrics) const {
{"chat_format", common_chat_format_name(chat_parser_params.format)},
{"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)},
{"reasoning_in_content", chat_parser_params.reasoning_in_content},
{"thinking_forced_open", chat_parser_params.thinking_forced_open},
{"reasoning_prefill", chat_parser_params.reasoning_prefill},
{"samplers", samplers},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
@ -402,7 +402,7 @@ task_params server_task::params_from_json_cmpl(
}
params.chat_parser_params.reasoning_format = reasoning_format;
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.chat_parser_params.reasoning_prefill = json_value(data, "reasoning_prefill", std::string());
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
if (data.contains("chat_parser")) {
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());

View File

@ -51,7 +51,7 @@ describe('ParameterSyncService', () => {
chat_format: '',
reasoning_format: '',
reasoning_in_content: false,
thinking_forced_open: false,
reasoning_prefill: '',
'speculative.n_max': 0,
'speculative.n_min': 0,
'speculative.p_min': 0.0,
@ -116,7 +116,7 @@ describe('ParameterSyncService', () => {
chat_format: '',
reasoning_format: '',
reasoning_in_content: false,
thinking_forced_open: false,
reasoning_prefill: '',
'speculative.n_max': 0,
'speculative.n_min': 0,
'speculative.p_min': 0.0,

View File

@ -164,7 +164,7 @@ export interface ApiLlamaCppServerProps {
chat_format: string;
reasoning_format: string;
reasoning_in_content: boolean;
thinking_forced_open: boolean;
reasoning_prefill: string;
samplers: string[];
backend_sampling: boolean;
'speculative.n_max': number;
@ -332,7 +332,7 @@ export interface ApiSlotData {
chat_format: string;
reasoning_format: string;
reasoning_in_content: boolean;
thinking_forced_open: boolean;
reasoning_prefill: string;
samplers: string[];
backend_sampling: boolean;
'speculative.n_max': number;