diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp index 6bd4b2d208..f36819209c 100644 --- a/common/chat-auto-parser-generator.cpp +++ b/common/chat-auto-parser-generator.cpp @@ -39,15 +39,11 @@ common_chat_params peg_generator::generate_parser(const common_chat_template & common_chat_params peg_generator::generate_parser(const common_chat_template & tmpl, const struct templates_params & inputs, const autoparser & autoparser) { - // Build the parser using the analysis results - auto parser = autoparser.build_parser(inputs); - // Create the result structure common_chat_params data; data.prompt = common_chat_template_direct_apply(tmpl, inputs); data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; data.preserved_tokens = autoparser.preserved_tokens; - data.parser = parser.save(); // Extract reasoning prefill from the end of the rendered prompt. // If the template added reasoning markers (e.g. or ) at the end, @@ -57,34 +53,53 @@ common_chat_params peg_generator::generate_parser(const common_chat_template & !autoparser.reasoning.end.empty()) { const auto & r_start = autoparser.reasoning.start; const auto & r_end = autoparser.reasoning.end; - // Trim trailing whitespace from the prompt for suffix matching - auto prompt_trimmed = data.prompt; - while (!prompt_trimmed.empty() && - (prompt_trimmed.back() == ' ' || prompt_trimmed.back() == '\n' || - prompt_trimmed.back() == '\r' || prompt_trimmed.back() == '\t')) { - prompt_trimmed.pop_back(); - } - if (!r_start.empty()) { + + // Helper to trim trailing whitespace from a string + auto rtrim = [](std::string s) { + while (!s.empty() && (s.back() == ' ' || s.back() == '\n' || + s.back() == '\r' || s.back() == '\t')) { + s.pop_back(); + } + return s; + }; + + // Trim both the prompt and markers for suffix matching (markers may have trailing \n) + auto prompt_trimmed = rtrim(data.prompt); + auto r_end_trimmed = rtrim(r_end); + auto r_start_trimmed = rtrim(r_start); + + if (!r_start_trimmed.empty()) { // Check for start+end at end of prompt (e.g. ) - if (string_ends_with(prompt_trimmed, r_end)) { - auto before_end = prompt_trimmed.substr(0, prompt_trimmed.size() - r_end.size()); - while (!before_end.empty() && - (before_end.back() == ' ' || before_end.back() == '\n' || - before_end.back() == '\r' || before_end.back() == '\t')) { - before_end.pop_back(); - } - if (string_ends_with(before_end, r_start)) { - // Prompt ends with start + whitespace + end: extract from start to end of trimmed prompt - data.reasoning_prefill = prompt_trimmed.substr(before_end.size() - r_start.size()); + if (string_ends_with(prompt_trimmed, r_end_trimmed)) { + auto before_end = rtrim(prompt_trimmed.substr(0, prompt_trimmed.size() - r_end_trimmed.size())); + if (string_ends_with(before_end, r_start_trimmed)) { + // Prompt ends with start + end markers (reasoning closed). + // Use the canonical markers from the analyzer to ensure whitespace + // (e.g. trailing \n in \n) is preserved, even if the template + // rendered them without intermediate whitespace. + data.reasoning_prefill = r_start + r_end; } } - // Check for just start at end of prompt (e.g. ) - if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start)) { - data.reasoning_prefill = r_start; + // Check for just start at end of prompt (e.g. \n) + if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start_trimmed)) { + // Extract from the original prompt to preserve trailing whitespace + auto start_pos = prompt_trimmed.size() - r_start_trimmed.size(); + data.reasoning_prefill = data.prompt.substr(start_pos); } } } + fprintf(stderr, "DEBUG reasoning_prefill: '%s' (start='%s', end='%s', mode=%d, reasoning_format=%d)\n", + data.reasoning_prefill.c_str(), + autoparser.reasoning.start.c_str(), + autoparser.reasoning.end.c_str(), + (int) autoparser.reasoning.mode, + (int) inputs.reasoning_format); + + // Build the parser using the analysis results. + common_peg_arena parser = autoparser.build_parser(inputs); + data.parser = parser.save(); + // Build grammar if tools are present bool has_tools = autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty(); diff --git a/common/chat.cpp b/common/chat.cpp index 4f49fcf8a6..fec9717808 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1614,12 +1614,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ if (auto_params.supports_thinking) { auto_params.thinking_start_tag = autoparser.reasoning.start; auto_params.thinking_end_tag = autoparser.reasoning.end; - // FORCED_OPEN and FORCED_CLOSED both put in the generation prompt - // (FORCED_CLOSED forces empty when thinking is disabled, - // but forces open when thinking is enabled) - auto_params.thinking_forced_open = - autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_OPEN || - autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_CLOSED; } return auto_params; } catch (const std::exception & e) { diff --git a/common/common.h b/common/common.h index ee7a2d805e..c9ab673d66 100644 --- a/common/common.h +++ b/common/common.h @@ -236,10 +236,14 @@ struct common_params_sampling { std::vector logit_bias; // logit biases to apply std::vector logit_bias_eog; // pre-calculated logit biases for EOG tokens + // Grammar prefill: reasoning markers already present in the prompt suffix. + // Fed to the grammar sampler (to advance past pre-existing tokens) and used + // to determine the reasoning budget sampler's initial state. + std::string grammar_prefill; + // reasoning budget sampler parameters // these are populated by the server/CLI based on chat template params int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget - bool reasoning_budget_activate_immediately = false; std::vector reasoning_budget_start; // start tag token sequence std::vector reasoning_budget_end; // end tag token sequence std::vector reasoning_budget_forced; // forced sequence (message + end tag) diff --git a/common/sampling.cpp b/common/sampling.cpp index f849d4f61a..ebed0f19e6 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -251,15 +251,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st } } + // Feed grammar prefill tokens to the grammar sampler so it advances past + // reasoning markers that the template already placed in the prompt. + std::vector prefill_tokens; + if (!params.grammar_prefill.empty() && vocab) { + prefill_tokens = common_tokenize(vocab, params.grammar_prefill, false, true); + if (grmr) { + for (const auto & token : prefill_tokens) { + llama_sampler_accept(grmr, token); + } + } + } + // reasoning budget sampler — added first so it can force tokens before other samplers if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) { + // Determine initial state from the grammar prefill: if the prefill tokens + // match the start sequence, reasoning is already open → start COUNTING. + bool activate_immediately = false; + if (!prefill_tokens.empty() && !params.reasoning_budget_start.empty() && + prefill_tokens.size() >= params.reasoning_budget_start.size()) { + activate_immediately = std::equal( + params.reasoning_budget_start.begin(), + params.reasoning_budget_start.end(), + prefill_tokens.begin()); + // But if the prefill also contains the end sequence after the start, + // reasoning was opened and closed — start IDLE instead. + if (activate_immediately && + prefill_tokens.size() >= params.reasoning_budget_start.size() + params.reasoning_budget_end.size()) { + auto end_begin = prefill_tokens.begin() + (ptrdiff_t) params.reasoning_budget_start.size(); + // Check if remaining tokens after start match the end sequence + // (possibly with whitespace tokens in between, but for simplicity check suffix) + auto end_start = prefill_tokens.end() - (ptrdiff_t) params.reasoning_budget_end.size(); + if (end_start >= end_begin && + std::equal(params.reasoning_budget_end.begin(), params.reasoning_budget_end.end(), end_start)) { + activate_immediately = false; + } + } + } + samplers.push_back(common_reasoning_budget_init( vocab, params.reasoning_budget_start, params.reasoning_budget_end, params.reasoning_budget_forced, params.reasoning_budget_tokens, - params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE)); + activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE)); } if (params.has_logit_bias()) { diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 8e2117c4e7..c764af6bf8 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -995,6 +995,16 @@ static void test_peg_parser(common_chat_templates * tmpls, grammar_triggered = true; } + // For non-lazy grammars, prepend reasoning prefill to grammar input, just like + // PEG parsing does. The grammar includes the full reasoning pattern (e.g. optional + // ...), but the model output may start mid-reasoning if the template + // already placed the opening tag in the prompt. + // For lazy grammars, the grammar only activates from the trigger position, so the + // reasoning prefill is irrelevant — reasoning is handled by the PEG parser. + if (!parser.params_.reasoning_prefill.empty() && earliest_trigger_pos == std::string::npos) { + constrained = parser.params_.reasoning_prefill + constrained; + } + // Test the constrained portion against the grammar if (grammar_triggered && !tc.is_partial) { auto result = match_string_detailed(constrained, grammar.get()); diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 4c2ae7a033..c08b2cf715 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -105,7 +105,7 @@ struct cli_context { llama_get_model(ctx_server.get_llama_context())); task.params.sampling.reasoning_budget_tokens = reasoning_budget; - task.params.sampling.reasoning_budget_activate_immediately = chat_params.thinking_forced_open; + task.params.sampling.grammar_prefill = chat_params.reasoning_prefill; if (!chat_params.thinking_start_tag.empty()) { task.params.sampling.reasoning_budget_start = diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 1b74f50fcd..a536d6c751 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -1113,7 +1113,6 @@ json oaicompat_chat_params_parse( llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag; llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag; llama_params["reasoning_budget_message"] = opt.reasoning_budget_message; - llama_params["reasoning_budget_activate_immediately"] = chat_params.thinking_forced_open; } } diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index a47ab5cbb0..174a3779c9 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -403,6 +403,7 @@ task_params server_task::params_from_json_cmpl( params.chat_parser_params.reasoning_format = reasoning_format; params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); params.chat_parser_params.reasoning_prefill = json_value(data, "reasoning_prefill", std::string()); + params.sampling.grammar_prefill = params.chat_parser_params.reasoning_prefill; params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false); if (data.contains("chat_parser")) { params.chat_parser_params.parser.load(data.at("chat_parser").get()); @@ -469,10 +470,7 @@ task_params server_task::params_from_json_cmpl( const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string()); const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string()); const auto message = json_value(data, "reasoning_budget_message", std::string()); - const bool activate_imm = json_value(data, "reasoning_budget_activate_immediately", false); - params.sampling.reasoning_budget_tokens = budget; - params.sampling.reasoning_budget_activate_immediately = activate_imm; if (!start_tag.empty()) { params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true); @@ -482,8 +480,8 @@ task_params server_task::params_from_json_cmpl( params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true); } - SRV_DBG("reasoning budget: tokens=%d, activate_immediately=%s, start=%zu toks, end=%zu toks, forced=%zu toks\n", - budget, activate_imm ? "true" : "false", + SRV_DBG("reasoning budget: tokens=%d, grammar_prefill='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n", + budget, params.sampling.grammar_prefill.c_str(), params.sampling.reasoning_budget_start.size(), params.sampling.reasoning_budget_end.size(), params.sampling.reasoning_budget_forced.size());