diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp
index 6bd4b2d208..f36819209c 100644
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@@ -39,15 +39,11 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
common_chat_params peg_generator::generate_parser(const common_chat_template & tmpl,
const struct templates_params & inputs,
const autoparser & autoparser) {
- // Build the parser using the analysis results
- auto parser = autoparser.build_parser(inputs);
-
// Create the result structure
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = autoparser.preserved_tokens;
- data.parser = parser.save();
// Extract reasoning prefill from the end of the rendered prompt.
// If the template added reasoning markers (e.g. or ) at the end,
@@ -57,34 +53,53 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
!autoparser.reasoning.end.empty()) {
const auto & r_start = autoparser.reasoning.start;
const auto & r_end = autoparser.reasoning.end;
- // Trim trailing whitespace from the prompt for suffix matching
- auto prompt_trimmed = data.prompt;
- while (!prompt_trimmed.empty() &&
- (prompt_trimmed.back() == ' ' || prompt_trimmed.back() == '\n' ||
- prompt_trimmed.back() == '\r' || prompt_trimmed.back() == '\t')) {
- prompt_trimmed.pop_back();
- }
- if (!r_start.empty()) {
+
+ // Helper to trim trailing whitespace from a string
+ auto rtrim = [](std::string s) {
+ while (!s.empty() && (s.back() == ' ' || s.back() == '\n' ||
+ s.back() == '\r' || s.back() == '\t')) {
+ s.pop_back();
+ }
+ return s;
+ };
+
+ // Trim both the prompt and markers for suffix matching (markers may have trailing \n)
+ auto prompt_trimmed = rtrim(data.prompt);
+ auto r_end_trimmed = rtrim(r_end);
+ auto r_start_trimmed = rtrim(r_start);
+
+ if (!r_start_trimmed.empty()) {
// Check for start+end at end of prompt (e.g. <think></think>)
- if (string_ends_with(prompt_trimmed, r_end)) {
- auto before_end = prompt_trimmed.substr(0, prompt_trimmed.size() - r_end.size());
- while (!before_end.empty() &&
- (before_end.back() == ' ' || before_end.back() == '\n' ||
- before_end.back() == '\r' || before_end.back() == '\t')) {
- before_end.pop_back();
- }
- if (string_ends_with(before_end, r_start)) {
- // Prompt ends with start + whitespace + end: extract from start to end of trimmed prompt
- data.reasoning_prefill = prompt_trimmed.substr(before_end.size() - r_start.size());
+ if (string_ends_with(prompt_trimmed, r_end_trimmed)) {
+ auto before_end = rtrim(prompt_trimmed.substr(0, prompt_trimmed.size() - r_end_trimmed.size()));
+ if (string_ends_with(before_end, r_start_trimmed)) {
+ // Prompt ends with start + end markers (reasoning closed).
+ // Use the canonical markers from the analyzer to ensure whitespace
+ // (e.g. trailing \n in </think>\n) is preserved, even if the template
+ // rendered them without intermediate whitespace.
+ data.reasoning_prefill = r_start + r_end;
}
}
- // Check for just start at end of prompt (e.g. <think>)
- if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start)) {
- data.reasoning_prefill = r_start;
+ // Check for just start at end of prompt (e.g. <think>\n)
+ if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start_trimmed)) {
+ // Extract from the original prompt to preserve trailing whitespace
+ auto start_pos = prompt_trimmed.size() - r_start_trimmed.size();
+ data.reasoning_prefill = data.prompt.substr(start_pos);
}
}
}
+
+ // Build the parser using the analysis results.
+ common_peg_arena parser = autoparser.build_parser(inputs);
+ data.parser = parser.save();
+
// Build grammar if tools are present
bool has_tools =
autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
diff --git a/common/chat.cpp b/common/chat.cpp
index 4f49fcf8a6..fec9717808 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1614,12 +1614,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
if (auto_params.supports_thinking) {
auto_params.thinking_start_tag = autoparser.reasoning.start;
auto_params.thinking_end_tag = autoparser.reasoning.end;
- // FORCED_OPEN and FORCED_CLOSED both put <think> in the generation prompt
- // (FORCED_CLOSED forces empty when thinking is disabled,
- // but forces open when thinking is enabled)
- auto_params.thinking_forced_open =
- autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_OPEN ||
- autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_CLOSED;
}
return auto_params;
} catch (const std::exception & e) {
diff --git a/common/common.h b/common/common.h
index ee7a2d805e..c9ab673d66 100644
--- a/common/common.h
+++ b/common/common.h
@@ -236,10 +236,14 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
+ // Grammar prefill: reasoning markers already present in the prompt suffix.
+ // Fed to the grammar sampler (to advance past pre-existing tokens) and used
+ // to determine the reasoning budget sampler's initial state.
+ std::string grammar_prefill;
+
// reasoning budget sampler parameters
// these are populated by the server/CLI based on chat template params
int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget
- bool reasoning_budget_activate_immediately = false;
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
diff --git a/common/sampling.cpp b/common/sampling.cpp
index f849d4f61a..ebed0f19e6 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -251,15 +251,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
+ // Feed grammar prefill tokens to the grammar sampler so it advances past
+ // reasoning markers that the template already placed in the prompt.
+ std::vector<llama_token> prefill_tokens;
+ if (!params.grammar_prefill.empty() && vocab) {
+ prefill_tokens = common_tokenize(vocab, params.grammar_prefill, false, true);
+ if (grmr) {
+ for (const auto & token : prefill_tokens) {
+ llama_sampler_accept(grmr, token);
+ }
+ }
+ }
+
// reasoning budget sampler — added first so it can force tokens before other samplers
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
+ // Determine initial state from the grammar prefill: if the prefill tokens
+ // match the start sequence, reasoning is already open → start COUNTING.
+ bool activate_immediately = false;
+ if (!prefill_tokens.empty() && !params.reasoning_budget_start.empty() &&
+ prefill_tokens.size() >= params.reasoning_budget_start.size()) {
+ activate_immediately = std::equal(
+ params.reasoning_budget_start.begin(),
+ params.reasoning_budget_start.end(),
+ prefill_tokens.begin());
+ // But if the prefill also contains the end sequence after the start,
+ // reasoning was opened and closed — start IDLE instead.
+ if (activate_immediately &&
+ prefill_tokens.size() >= params.reasoning_budget_start.size() + params.reasoning_budget_end.size()) {
+ auto end_begin = prefill_tokens.begin() + (ptrdiff_t) params.reasoning_budget_start.size();
+ // Check if remaining tokens after start match the end sequence
+ // (possibly with whitespace tokens in between, but for simplicity check suffix)
+ auto end_start = prefill_tokens.end() - (ptrdiff_t) params.reasoning_budget_end.size();
+ if (end_start >= end_begin &&
+ std::equal(params.reasoning_budget_end.begin(), params.reasoning_budget_end.end(), end_start)) {
+ activate_immediately = false;
+ }
+ }
+ }
+
samplers.push_back(common_reasoning_budget_init(
vocab,
params.reasoning_budget_start,
params.reasoning_budget_end,
params.reasoning_budget_forced,
params.reasoning_budget_tokens,
- params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
+ activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
}
if (params.has_logit_bias()) {
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index 8e2117c4e7..c764af6bf8 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -995,6 +995,16 @@ static void test_peg_parser(common_chat_templates * tmpls,
grammar_triggered = true;
}
+ // For non-lazy grammars, prepend reasoning prefill to grammar input, just like
+ // PEG parsing does. The grammar includes the full reasoning pattern (e.g. optional
+ // <think>...</think>), but the model output may start mid-reasoning if the template
+ // already placed the opening tag in the prompt.
+ // For lazy grammars, the grammar only activates from the trigger position, so the
+ // reasoning prefill is irrelevant — reasoning is handled by the PEG parser.
+ if (!parser.params_.reasoning_prefill.empty() && earliest_trigger_pos == std::string::npos) {
+ constrained = parser.params_.reasoning_prefill + constrained;
+ }
+
// Test the constrained portion against the grammar
if (grammar_triggered && !tc.is_partial) {
auto result = match_string_detailed(constrained, grammar.get());
diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp
index 4c2ae7a033..c08b2cf715 100644
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -105,7 +105,7 @@ struct cli_context {
llama_get_model(ctx_server.get_llama_context()));
task.params.sampling.reasoning_budget_tokens = reasoning_budget;
- task.params.sampling.reasoning_budget_activate_immediately = chat_params.thinking_forced_open;
+ task.params.sampling.grammar_prefill = chat_params.reasoning_prefill;
if (!chat_params.thinking_start_tag.empty()) {
task.params.sampling.reasoning_budget_start =
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 1b74f50fcd..a536d6c751 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -1113,7 +1113,6 @@ json oaicompat_chat_params_parse(
llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
- llama_params["reasoning_budget_activate_immediately"] = chat_params.thinking_forced_open;
}
}
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index a47ab5cbb0..174a3779c9 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -403,6 +403,7 @@ task_params server_task::params_from_json_cmpl(
params.chat_parser_params.reasoning_format = reasoning_format;
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.chat_parser_params.reasoning_prefill = json_value(data, "reasoning_prefill", std::string());
+ params.sampling.grammar_prefill = params.chat_parser_params.reasoning_prefill;
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
if (data.contains("chat_parser")) {
params.chat_parser_params.parser.load(data.at("chat_parser").get());
@@ -469,10 +470,7 @@ task_params server_task::params_from_json_cmpl(
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
const auto message = json_value(data, "reasoning_budget_message", std::string());
- const bool activate_imm = json_value(data, "reasoning_budget_activate_immediately", false);
-
params.sampling.reasoning_budget_tokens = budget;
- params.sampling.reasoning_budget_activate_immediately = activate_imm;
if (!start_tag.empty()) {
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
@@ -482,8 +480,8 @@ task_params server_task::params_from_json_cmpl(
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
}
- SRV_DBG("reasoning budget: tokens=%d, activate_immediately=%s, start=%zu toks, end=%zu toks, forced=%zu toks\n",
- budget, activate_imm ? "true" : "false",
+ SRV_DBG("reasoning budget: tokens=%d, grammar_prefill='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n",
+ budget, params.sampling.grammar_prefill.c_str(),
params.sampling.reasoning_budget_start.size(),
params.sampling.reasoning_budget_end.size(),
params.sampling.reasoning_budget_forced.size());