From 060d4e4cfdafc49ec7519da5e5de669fefd432ba Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Wed, 11 Mar 2026 12:28:22 +0100 Subject: [PATCH] Reasoning prefill --- common/chat-auto-parser-generator.cpp | 62 +++++++++++++------ common/chat-auto-parser.h | 12 +--- common/chat-diff-analyzer.cpp | 26 ++++---- common/chat-peg-parser.cpp | 14 +++++ common/chat.cpp | 10 ++- common/chat.h | 6 +- docs/autoparser.md | 23 +++---- tests/test-chat-auto-parser.cpp | 4 +- tests/test-chat-peg-parser.cpp | 60 ++++++++++-------- tests/test-chat.cpp | 3 +- tools/server/README.md | 6 +- tools/server/server-common.cpp | 2 +- tools/server/server-task.cpp | 6 +- .../services/parameter-sync.service.spec.ts | 4 +- tools/server/webui/src/lib/types/api.d.ts | 4 +- 15 files changed, 141 insertions(+), 101 deletions(-) diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp index f19819494c..6bd4b2d208 100644 --- a/common/chat-auto-parser-generator.cpp +++ b/common/chat-auto-parser-generator.cpp @@ -49,6 +49,42 @@ common_chat_params peg_generator::generate_parser(const common_chat_template & data.preserved_tokens = autoparser.preserved_tokens; data.parser = parser.save(); + // Extract reasoning prefill from the end of the rendered prompt. + // If the template added reasoning markers (e.g. or ) at the end, + // store them so they can be prepended to model output before parsing. 
+ if (inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE && + autoparser.reasoning.mode != reasoning_mode::NONE && + !autoparser.reasoning.end.empty()) { + const auto & r_start = autoparser.reasoning.start; + const auto & r_end = autoparser.reasoning.end; + // Trim trailing whitespace from the prompt for suffix matching + auto prompt_trimmed = data.prompt; + while (!prompt_trimmed.empty() && + (prompt_trimmed.back() == ' ' || prompt_trimmed.back() == '\n' || + prompt_trimmed.back() == '\r' || prompt_trimmed.back() == '\t')) { + prompt_trimmed.pop_back(); + } + if (!r_start.empty()) { + // Check for start+end at end of prompt (e.g. ) + if (string_ends_with(prompt_trimmed, r_end)) { + auto before_end = prompt_trimmed.substr(0, prompt_trimmed.size() - r_end.size()); + while (!before_end.empty() && + (before_end.back() == ' ' || before_end.back() == '\n' || + before_end.back() == '\r' || before_end.back() == '\t')) { + before_end.pop_back(); + } + if (string_ends_with(before_end, r_start)) { + // Prompt ends with start + whitespace + end: extract from start to end of trimmed prompt + data.reasoning_prefill = prompt_trimmed.substr(before_end.size() - r_start.size()); + } + } + // Check for just start at end of prompt (e.g. 
) + if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start)) { + data.reasoning_prefill = r_start; + } + } + } + // Build grammar if tools are present bool has_tools = autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty(); @@ -96,9 +132,8 @@ common_peg_arena autoparser::build_parser(const templates_params & inputs) const parser_build_context ctx(p, inputs); bool extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; - bool enable_thinking = inputs.enable_thinking; - ctx.extracting_reasoning = extract_reasoning && enable_thinking && reasoning.mode != reasoning_mode::NONE; + ctx.extracting_reasoning = extract_reasoning && reasoning.mode != reasoning_mode::NONE; ctx.content = &content; // Build reasoning parser @@ -130,24 +165,15 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co return p.eps(); } - bool thinking_forced_open = (mode == reasoning_mode::FORCED_OPEN); - bool thinking_forced_closed = (mode == reasoning_mode::FORCED_CLOSED); - - if (thinking_forced_open || thinking_forced_closed) { - // Thinking is forced open OR forced closed with enable_thinking=true - // In both cases, expect only the closing tag (opening was in template) - // However, since we might have incorrectly detected the open/close pattern, - // we admit an optional starting marker - return p.optional(p.literal(start)) + p.reasoning(p.until(end)) + end; - } if (mode == reasoning_mode::TAG_BASED || mode == reasoning_mode::TOOLS_ONLY) { - // Standard tag-based reasoning OR tools-only mode (reasoning appears with tools) - // Both use the same tag-based pattern if markers are available - if (!start.empty() && !end.empty()) { - return p.optional(start + p.reasoning(p.until(end)) + end); + if (!end.empty()) { + if (!start.empty()) { + // Standard tag-based: optional(reasoning) + return p.optional(start + p.reasoning(p.until(end)) + end); + } + // Delimiter-style (empty 
start): optional(reasoning[DELIMITER]) + return p.optional(p.reasoning(p.until(end)) + end); } - } else if (mode == reasoning_mode::DELIMITER) { - return p.optional(p.reasoning(p.until(end)) + end); } return p.eps(); diff --git a/common/chat-auto-parser.h b/common/chat-auto-parser.h index 52c6488f4b..55713f4ef4 100644 --- a/common/chat-auto-parser.h +++ b/common/chat-auto-parser.h @@ -77,11 +77,7 @@ struct templates_params { // Reasoning handling mode (derived from R1-R3 comparisons) enum class reasoning_mode { NONE, // No reasoning markers detected - TAG_BASED, // Standard tag-based: ... - DELIMITER, // Delimiter-based: [BEGIN FINAL RESPONSE] (reasoning ends at delimiter) - FORCED_OPEN, // Template ends with open reasoning tag (empty start, non-empty end) - FORCED_CLOSED, // Template ends with open reasoning tag on enabled thinking but - // with both opened and closed tag for disabled thinking + TAG_BASED, // Tag-based: ... (start can be empty for delimiter-style) TOOLS_ONLY // Only reason on tool calls, not on normal content }; @@ -91,12 +87,6 @@ inline std::ostream & operator<<(std::ostream & os, const reasoning_mode & mode) return os << "NONE"; case reasoning_mode::TAG_BASED: return os << "TAG_BASED"; - case reasoning_mode::DELIMITER: - return os << "DELIMITER"; - case reasoning_mode::FORCED_OPEN: - return os << "FORCED_OPEN"; - case reasoning_mode::FORCED_CLOSED: - return os << "FORCED_CLOSED"; case reasoning_mode::TOOLS_ONLY: return os << "TOOLS_ONLY"; default: diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp index 05b3b6b6a8..57bc234fca 100644 --- a/common/chat-diff-analyzer.cpp +++ b/common/chat-diff-analyzer.cpp @@ -32,7 +32,7 @@ static std::vector')") != std::string::npos && tmpl.src.find("reasoning_content") == std::string::npos && analysis.reasoning.mode == reasoning_mode::NONE) { - analysis.reasoning.mode = reasoning_mode::FORCED_OPEN; + analysis.reasoning.mode = reasoning_mode::TAG_BASED; analysis.reasoning.start = ""; 
analysis.reasoning.end = ""; analysis.preserved_tokens.push_back(""); @@ -295,15 +295,11 @@ void analyze_reasoning::compare_reasoning_presence() { } if (result.result.success()) { if (!result.tags["pre"].empty() && !result.tags["post"].empty()) { - if (parser_wrapped.parse_anywhere_and_extract(diff.right).result.success()) { // both tags in the diff = no forced close - mode = reasoning_mode::TAG_BASED; - } else { - mode = reasoning_mode::FORCED_CLOSED; - } + mode = reasoning_mode::TAG_BASED; start = trim_whitespace(result.tags["pre"]); end = result.tags["post"]; } else if (!result.tags["post"].empty()) { - mode = reasoning_mode::DELIMITER; + mode = reasoning_mode::TAG_BASED; end = result.tags["post"]; } } @@ -338,17 +334,17 @@ void analyze_reasoning::compare_thinking_enabled() { if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) { if (start.empty()) { start = right_trimmed; - mode = reasoning_mode::FORCED_OPEN; + mode = reasoning_mode::TAG_BASED; } } } - if (start.empty() && !end.empty()) { - mode = reasoning_mode::DELIMITER; + if (mode == reasoning_mode::NONE && start.empty() && !end.empty()) { + mode = reasoning_mode::TAG_BASED; } - // Check for FORCED_CLOSED: when enable_thinking=false produces both start and end markers, - // but enable_thinking=true produces only the start marker + // Check for start+end pattern: when enable_thinking=false produces both start and end markers, + // but enable_thinking=true produces only the start marker. Both cases are TAG_BASED. 
if (!comparison->output_A.empty() && !comparison->output_B.empty()) { auto parser_start = build_tagged_peg_parser([&](common_peg_parser_builder &p) { return p.literal(start) + p.space() + p.literal(end) + p.rest(); @@ -358,12 +354,12 @@ void analyze_reasoning::compare_thinking_enabled() { }); if (!start.empty() && parser_start_end.parse_anywhere_and_extract(comparison->output_A).result.success() && parser_start.parse_anywhere_and_extract(comparison->output_B).result.success()) { - mode = reasoning_mode::FORCED_CLOSED; + mode = reasoning_mode::TAG_BASED; } else if (!end.empty()) { // we extract the starting marker now since we didn't get it earlier auto result = parser_start_end.parse_anywhere_and_extract(comparison->output_A); if (result.result.success()) { start = result.tags["pre"]; - mode = reasoning_mode::FORCED_CLOSED; + mode = reasoning_mode::TAG_BASED; } } } @@ -373,7 +369,7 @@ void analyze_reasoning::compare_thinking_enabled() { auto seg_A = segmentize_markers(trim_trailing_whitespace(diff.left)); auto seg_B = segmentize_markers(trim_trailing_whitespace(diff.right)); if (seg_A.size() == 1 && seg_B.size() == 1) { - mode = reasoning_mode::FORCED_CLOSED; + mode = reasoning_mode::TAG_BASED; start = seg_B[0].value; end = seg_A[0].value; } diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp index 4c5bb6218d..5f7d422b41 100644 --- a/common/chat-peg-parser.cpp +++ b/common/chat-peg-parser.cpp @@ -229,6 +229,20 @@ void common_chat_peg_mapper::from_ast(const common_peg_ast_arena & arena, result.tool_calls.push_back(pending_tool_call.value()); pending_tool_call.reset(); } + + // Discard whitespace-only reasoning content (e.g. 
from prefill) + if (!result.reasoning_content.empty()) { + bool all_whitespace = true; + for (char c : result.reasoning_content) { + if (c != ' ' && c != '\n' && c != '\r' && c != '\t') { + all_whitespace = false; + break; + } + } + if (all_whitespace) { + result.reasoning_content.clear(); + } + } } void common_chat_peg_mapper::map(const common_peg_ast_node & node) { diff --git a/common/chat.cpp b/common/chat.cpp index cfd5df30a7..4f49fcf8a6 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1718,14 +1718,20 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars LOG_DBG("No parser definition detected, assuming pure content parser."); } - LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str()); + // Prepend reasoning prefill (e.g. or from template prompt) + // so the parser can detect reasoning markers that were part of the template output. + const std::string effective_input = params.reasoning_prefill.empty() + ? input + : params.reasoning_prefill + input; + + LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), effective_input.c_str()); common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT; if (params.debug) { flags |= COMMON_PEG_PARSE_FLAG_DEBUG; } - common_peg_parse_context ctx(input, flags); + common_peg_parse_context ctx(effective_input, flags); auto result = parser.parse(ctx); if (result.fail()) { diff --git a/common/chat.h b/common/chat.h index 930987cf77..cb83da5fcc 100644 --- a/common/chat.h +++ b/common/chat.h @@ -211,7 +211,7 @@ struct common_chat_params { std::string prompt; std::string grammar; bool grammar_lazy = false; - bool thinking_forced_open = false; + std::string reasoning_prefill; bool supports_thinking = false; std::string thinking_start_tag; // e.g., "" std::string thinking_end_tag; // e.g., "" @@ -228,14 +228,14 @@ struct common_chat_parser_params { common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // 
TODO: refactor this to "bool parse_reasoning" // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode) bool reasoning_in_content = false; - bool thinking_forced_open = false; + std::string reasoning_prefill; bool parse_tool_calls = true; bool debug = false; // Enable debug output for PEG parser common_peg_arena parser = {}; common_chat_parser_params() = default; common_chat_parser_params(const common_chat_params & chat_params) { format = chat_params.format; - thinking_forced_open = chat_params.thinking_forced_open; + reasoning_prefill = chat_params.reasoning_prefill; } }; diff --git a/docs/autoparser.md b/docs/autoparser.md index 686b2c249b..08ffa6a3de 100644 --- a/docs/autoparser.md +++ b/docs/autoparser.md @@ -47,12 +47,11 @@ All structs are defined in [common/chat-auto-parser.h](common/chat-auto-parser.h | Value | Description | |-----------------|-----------------------------------------------------------------------------------| | `NONE` | No reasoning markers detected | -| `TAG_BASED` | Standard tag-based: `<think>...</think>` | -| `DELIMITER` | Delimiter-based: reasoning ends at a delimiter (e.g., `[BEGIN FINAL RESPONSE]`) | -| `FORCED_OPEN` | Template ends with open reasoning tag when `enable_thinking=true` | -| `FORCED_CLOSED` | `enable_thinking=false` emits both tags; `enable_thinking=true` emits only start | +| `TAG_BASED` | Tag-based: `<think>...</think>` (start can be empty for delimiter-style formats) | | `TOOLS_ONLY` | Reasoning only appears in tool call responses, not plain content | +**Reasoning Prefill**: When a template adds reasoning markers (e.g., `<think>` or `<think></think>`) at the end of the prompt, these are extracted as `reasoning_prefill` and prepended to the model output before parsing. This allows the parser to always use an optional TAG_BASED pattern while correctly handling templates that force thinking mode open or closed. Whitespace-only reasoning content (from `<think></think>` prefill) is automatically discarded. 
+ **`content_mode`**: How the template wraps assistant content. | Value | Description | @@ -263,14 +262,15 @@ Text is segmentized into markers and non-marker fragments using `segmentize_mark - Uses PEG parsers to find surrounding markers: - If both pre/post markers found in `diff.right` → `TAG_BASED` (both tags visible in diff = no forced close) - If both found but post marker only in the full output B → `FORCED_CLOSED` - - If only post marker found → `DELIMITER` + - If only post marker found → `TAG_BASED` (delimiter-style, empty start) - Sets `reasoning.start` and `reasoning.end` **R2 — `compare_thinking_enabled()`**: Compares `enable_thinking=false` vs `true` with a generation prompt. -- Detects `FORCED_OPEN`: `enable_thinking=true` adds a non-empty marker at the end of the prompt (where model will start generating) — sets `reasoning.start`, mode = `FORCED_OPEN` -- Detects `FORCED_CLOSED`: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker +- Detects template-added reasoning markers: `enable_thinking=true` adds a non-empty marker at the end of the prompt — sets `reasoning.start`, mode = `TAG_BASED` +- Detects start+end pattern: `enable_thinking=false` produces both start+end markers; `enable_thinking=true` produces only start marker — both classified as `TAG_BASED` - Handles the reverse case: if both start and end are still empty, looks for a single-segment diff on each side to extract both markers +- The reasoning prefill (markers added by the template) is later extracted in `generate_parser()` and prepended to model output before parsing **R3 — `compare_reasoning_scope()`**: Compares assistant message with reasoning+text-content vs reasoning+tool-calls. 
@@ -358,9 +358,10 @@ Each analyzer struct (`analyze_reasoning`, `analyze_content`, `analyze_tools`) i | Mode | Parser | |-----------------------------------|---------------------------------------------------------------------| | Not extracting reasoning | `eps()` | -| `FORCED_OPEN` or `FORCED_CLOSED` | `reasoning(until(end)) + end` — opening tag was in the prompt | -| `TAG_BASED` or `TOOLS_ONLY` | `optional(start + reasoning(until(end)) + end)` | -| `DELIMITER` | `optional(reasoning(until(end)) + end)` — no start marker | +| `TAG_BASED` or `TOOLS_ONLY` (non-empty start) | `optional(start + reasoning(until(end)) + end)` | +| `TAG_BASED` or `TOOLS_ONLY` (empty start) | `optional(reasoning(until(end)) + end)` — delimiter-style | + +Note: Templates that add reasoning markers to the prompt (e.g., `<think>`) have these extracted as `reasoning_prefill` and prepended to model output before parsing. The parser always uses the optional TAG_BASED pattern. #### Content Parser (`analyze_content::build_parser`) @@ -516,7 +517,7 @@ To support a new template format: ## Edge Cases and Quirks -1. **Forced Thinking**: When `enable_thinking=true` and the model prompt ends with an open reasoning tag (e.g., `<think>`), the parser enters forced thinking mode and immediately expects reasoning content without waiting for a start marker. +1. **Reasoning Prefill**: When the rendered prompt ends with reasoning markers (e.g., `<think>` when thinking is enabled, or `<think></think>` when it is disabled), these are extracted as `reasoning_prefill` and prepended to model output before parsing. The parser always uses optional TAG_BASED reasoning, so it handles both thinking and non-thinking outputs dynamically. Whitespace-only reasoning content (from closed prefill like `<think></think>`) is discarded. 2. **Per-Call vs Per-Section Markers**: Some templates wrap each tool call individually (`per_call_start/end`); others wrap the entire section (`section_start/end`). 
T2 (`check_per_call_markers()`) disambiguates by checking if the second call in a two-call output starts with the section marker. 3. **Python Dict Format**: The Seed template family uses single-quoted JSON (`'key': 'value'`). The `uses_python_dicts` flag causes the PEG builder to register a flexible `json-string` rule accepting both quote styles before any JSON rules are built. 4. **Tag Boundary Fixing**: `calculate_diff_split()` iteratively adjusts prefix/suffix boundaries to avoid splitting `` or `[marker]` tokens, ensuring clean extraction. diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp index eaa57872a8..491522324a 100644 --- a/tests/test-chat-auto-parser.cpp +++ b/tests/test-chat-auto-parser.cpp @@ -1295,8 +1295,8 @@ static void test_nemotron_reasoning_detection(testing & t) { t.assert_equal("reasoning_end should be '\\n'", "\n", analysis.reasoning.end); // Check reasoning mode detection - // Nemotron uses forced closed reasoning with add_generation_prompt - t.assert_equal("reasoning should be FORCED_CLOSED", reasoning_mode::FORCED_CLOSED, analysis.reasoning.mode); + // Nemotron uses tag-based reasoning (formerly FORCED_CLOSED; prefill handles the template's forced markers) + t.assert_equal("reasoning should be TAG_BASED", reasoning_mode::TAG_BASED, analysis.reasoning.mode); // Make sure reasoning markers don't spill over to content markers t.assert_equal("content start should be empty", "", analysis.content.start); diff --git a/tests/test-chat-peg-parser.cpp b/tests/test-chat-peg-parser.cpp index dc8724be34..cbde951d53 100644 --- a/tests/test-chat-peg-parser.cpp +++ b/tests/test-chat-peg-parser.cpp @@ -145,7 +145,7 @@ static void test_example_native(testing & t) { common_reasoning_format reasoning_format; json json_schema; bool parallel_tool_calls; - bool thinking_forced_open; + std::string reasoning_prefill; std::string input; // Expect @@ -157,14 +157,8 @@ static void test_example_native(testing & t) { auto build_parser = 
[](const test_case & tc) { return build_chat_peg_parser([&](common_chat_peg_builder & p) { auto reasoning_in_content = (tc.reasoning_format == COMMON_REASONING_FORMAT_NONE); - auto reasoning = p.eps(); - if (tc.thinking_forced_open) { - // If thinking is forced open, expect a closing tag - reasoning = p.reasoning(p.until("")) + "" + p.space(); - } else { - // Otherwise, optionally accept thinking wrapped in tags - reasoning = p.optional("" + p.reasoning(p.until("")) + "" + p.space()); - } + // Always use optional TAG_BASED pattern; reasoning_prefill is prepended to input + auto reasoning = p.optional("" + p.reasoning(p.until("")) + "" + p.space()); // tool calling parser if (tc.tools.is_array() && !tc.tools.empty()) { @@ -190,78 +184,91 @@ static void test_example_native(testing & t) { std::vector test_cases = std::vector{ { - /* .name = */ "content with thinking_forced_open = false", + /* .name = */ "content with reasoning (no prefill)", /* .tools = */ {}, /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, /* .json_schema = */ {}, /* .parallel_tool_calls = */ false, - /* .thinking_forced_open = */ false, + /* .reasoning_prefill = */ "", /* .input = */ ("The user said hello, I must say hello back\nHello"), /* .expect_reasoning = */ "The user said hello, I must say hello back", /* .expect_content = */ "Hello", /* .expect_tool_calls = */ {}, }, { - /* .name = */ "content with thinking_forced_open = false and no reasoning", + /* .name = */ "content without reasoning (no prefill)", /* .tools = */ {}, /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, /* .json_schema = */ {}, /* .parallel_tool_calls = */ false, - /* .thinking_forced_open = */ false, + /* .reasoning_prefill = */ "", /* .input = */ ("Hello"), /* .expect_reasoning = */ "", /* .expect_content = */ "Hello", /* .expect_tool_calls = */ {}, }, { - /* .name = */ "content with thinking_forced_open = 
false and reasoning_format = none", + /* .name = */ "content with reasoning_format = none (tags appear in content)", /* .tools = */ {}, /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, /* .json_schema = */ {}, /* .parallel_tool_calls = */ false, - /* .thinking_forced_open = */ true, + /* .reasoning_prefill = */ "", /* .input = */ ("The user said hello, I must say hello back\nHello"), /* .expect_reasoning = */ "", /* .expect_content = */ "The user said hello, I must say hello back\nHello", /* .expect_tool_calls = */ {}, }, { - /* .name = */ "content with thinking_forced_open = true", + /* .name = */ "content with reasoning prefill", /* .tools = */ {}, /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, /* .json_schema = */ {}, /* .parallel_tool_calls = */ false, - /* .thinking_forced_open = */ true, + /* .reasoning_prefill = */ "", /* .input = */ ("The user said hello, I must say hello back\nHello"), /* .expect_reasoning = */ "The user said hello, I must say hello back", /* .expect_content = */ "Hello", /* .expect_tool_calls = */ {}, }, { - /* .name = */ "content with thinking_forced_open = true and reasoning_format = none", + /* .name = */ "content with reasoning prefill and reasoning_format = none", /* .tools = */ {}, /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, /* .json_schema = */ {}, /* .parallel_tool_calls = */ false, - /* .thinking_forced_open = */ true, + /* .reasoning_prefill = */ "", /* .input = */ ("The user said hello, I must say hello back\nHello"), /* .expect_reasoning = */ "", /* .expect_content = */ "The user said hello, I must say hello back\nHello", /* .expect_tool_calls = */ {}, }, { - /* .name = */ "tools with tool_choice = auto and no parallel_tool_calls", + /* .name = */ "content with closed reasoning prefill (empty reasoning discarded)", + /* .tools = */ {}, + /* 
.tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, + /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, + /* .json_schema = */ {}, + /* .parallel_tool_calls = */ false, + /* .reasoning_prefill = */ "", + /* .input = */ ("Hello"), + /* .expect_reasoning = */ "", + /* .expect_content = */ "Hello", + /* .expect_tool_calls = */ {}, + }, + { + /* .name = */ "tools with reasoning prefill", /* .tools = */ create_tools(), /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO, /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, /* .json_schema = */ {}, /* .parallel_tool_calls = */ false, - /* .thinking_forced_open = */ true, + /* .reasoning_prefill = */ "", /* .input = */ ("I must get the weather in New York\n" "[" @@ -277,13 +284,13 @@ static void test_example_native(testing & t) { } }, }, { - /* .name = */ "tools with tool_choice = auto and parallel_tool_calls", + /* .name = */ "parallel tools with reasoning prefill", /* .tools = */ create_tools(), /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_AUTO, /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, /* .json_schema = */ {}, /* .parallel_tool_calls = */ true, - /* .thinking_forced_open = */ true, + /* .reasoning_prefill = */ "", /* .input = */ ("I must get the weather in New York and San Francisco and a 3 day forecast of each.\nLet me " "search that for you." 
@@ -321,7 +328,7 @@ static void test_example_native(testing & t) { } }, }, { - /* .name = */ "response_format with thinking_forced_open = true", + /* .name = */ "response_format with reasoning prefill", /* .tools = */ {}, /* .tool_choice = */ COMMON_CHAT_TOOL_CHOICE_NONE, /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, @@ -333,7 +340,7 @@ static void test_example_native(testing & t) { { "due_date", { { "type", "string" } } } } }, { "required", { "invoice_number", "amount", "due_date" } } }, /* .parallel_tool_calls = */ false, - /* .thinking_forced_open = */ true, + /* .reasoning_prefill = */ "", /* .input = */ ("I must produce the invoice in the requested format\n" R"({"invoice_number": "INV-2025-001", "amount": 1250.50, "due_date": "2025-12-31"})"), @@ -361,7 +368,8 @@ static void test_example_native(testing & t) { t.log(line); } - common_peg_parse_context ctx(tc.input); + std::string effective_input = tc.reasoning_prefill + tc.input; + common_peg_parse_context ctx(effective_input); auto result = parser.parse(ctx); t.assert_true("success", result.success()); diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 3a6297e148..8e2117c4e7 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -822,8 +822,7 @@ struct make_peg_parser { } common_chat_msg parse(const std::string & msg, bool is_partial) const { - common_chat_parser_params parser_params; - parser_params.format = params_.format; + common_chat_parser_params parser_params(params_); parser_params.debug = detailed_debug_; return common_chat_peg_parse(arena_, msg, is_partial, parser_params); } diff --git a/tools/server/README.md b/tools/server/README.md index da16ddc756..363f3fa5ea 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -907,7 +907,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat "chat_format": "GPT-OSS", "reasoning_format": "none", "reasoning_in_content": false, - "thinking_forced_open": false, + "reasoning_prefill": "", 
"samplers": [ "penalties", "dry", @@ -972,7 +972,7 @@ If query param `?fail_on_no_slot=1` is set, this endpoint will respond with stat "chat_format": "GPT-OSS", "reasoning_format": "none", "reasoning_in_content": false, - "thinking_forced_open": false, + "reasoning_prefill": "", "samplers": [ "penalties", "dry", @@ -1193,7 +1193,7 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type": `reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text. -`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models. +`reasoning_prefill`: The reasoning markers that were prefilled in the prompt by the template. Prepended to model output before parsing to handle dynamic thinking/non-thinking modes. `parse_tool_calls`: Whether to parse the generated tool call. diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index bd203228cc..1b74f50fcd 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1093,7 +1093,7 @@ json oaicompat_chat_params_parse( } llama_params["grammar_triggers"] = grammar_triggers; llama_params["preserved_tokens"] = chat_params.preserved_tokens; - llama_params["thinking_forced_open"] = chat_params.thinking_forced_open; + llama_params["reasoning_prefill"] = chat_params.reasoning_prefill; for (const auto & stop : chat_params.additional_stops) { llama_params["stop"].push_back(stop); } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index b3d510977b..a47ab5cbb0 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -72,7 +72,7 @@ json task_params::to_json(bool only_metrics) const { {"chat_format", common_chat_format_name(chat_parser_params.format)}, {"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)}, {"reasoning_in_content", chat_parser_params.reasoning_in_content}, - {"thinking_forced_open", 
chat_parser_params.thinking_forced_open}, + {"reasoning_prefill", chat_parser_params.reasoning_prefill}, {"samplers", samplers}, {"speculative.n_max", speculative.n_max}, {"speculative.n_min", speculative.n_min}, @@ -135,7 +135,7 @@ json task_params::to_json(bool only_metrics) const { {"chat_format", common_chat_format_name(chat_parser_params.format)}, {"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)}, {"reasoning_in_content", chat_parser_params.reasoning_in_content}, - {"thinking_forced_open", chat_parser_params.thinking_forced_open}, + {"reasoning_prefill", chat_parser_params.reasoning_prefill}, {"samplers", samplers}, {"speculative.n_max", speculative.n_max}, {"speculative.n_min", speculative.n_min}, @@ -402,7 +402,7 @@ task_params server_task::params_from_json_cmpl( } params.chat_parser_params.reasoning_format = reasoning_format; params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); - params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false); + params.chat_parser_params.reasoning_prefill = json_value(data, "reasoning_prefill", std::string()); params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false); if (data.contains("chat_parser")) { params.chat_parser_params.parser.load(data.at("chat_parser").get()); diff --git a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts index ce91de7410..63e303959e 100644 --- a/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts +++ b/tools/server/webui/src/lib/services/parameter-sync.service.spec.ts @@ -51,7 +51,7 @@ describe('ParameterSyncService', () => { chat_format: '', reasoning_format: '', reasoning_in_content: false, - thinking_forced_open: false, + reasoning_prefill: '', 'speculative.n_max': 0, 'speculative.n_min': 0, 'speculative.p_min': 0.0, 
@@ -116,7 +116,7 @@ describe('ParameterSyncService', () => { chat_format: '', reasoning_format: '', reasoning_in_content: false, - thinking_forced_open: false, + reasoning_prefill: '', 'speculative.n_max': 0, 'speculative.n_min': 0, 'speculative.p_min': 0.0, diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts index c908258427..ea32079364 100644 --- a/tools/server/webui/src/lib/types/api.d.ts +++ b/tools/server/webui/src/lib/types/api.d.ts @@ -164,7 +164,7 @@ export interface ApiLlamaCppServerProps { chat_format: string; reasoning_format: string; reasoning_in_content: boolean; - thinking_forced_open: boolean; + reasoning_prefill: string; samplers: string[]; backend_sampling: boolean; 'speculative.n_max': number; @@ -332,7 +332,7 @@ export interface ApiSlotData { chat_format: string; reasoning_format: string; reasoning_in_content: boolean; - thinking_forced_open: boolean; + reasoning_prefill: string; samplers: string[]; backend_sampling: boolean; 'speculative.n_max': number;