This commit is contained in:
Piotr Wilkin 2026-03-11 15:42:54 +01:00
parent 060d4e4cfd
commit 2249a09f12
8 changed files with 96 additions and 40 deletions

View File

@ -39,15 +39,11 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
common_chat_params peg_generator::generate_parser(const common_chat_template & tmpl,
const struct templates_params & inputs,
const autoparser & autoparser) {
// Build the parser using the analysis results
auto parser = autoparser.build_parser(inputs);
// Create the result structure
common_chat_params data;
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
data.preserved_tokens = autoparser.preserved_tokens;
data.parser = parser.save();
// Extract reasoning prefill from the end of the rendered prompt.
// If the template added reasoning markers (e.g. <think> or <think></think>) at the end,
@ -57,34 +53,53 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
!autoparser.reasoning.end.empty()) {
const auto & r_start = autoparser.reasoning.start;
const auto & r_end = autoparser.reasoning.end;
// Trim trailing whitespace from the prompt for suffix matching
auto prompt_trimmed = data.prompt;
while (!prompt_trimmed.empty() &&
(prompt_trimmed.back() == ' ' || prompt_trimmed.back() == '\n' ||
prompt_trimmed.back() == '\r' || prompt_trimmed.back() == '\t')) {
prompt_trimmed.pop_back();
}
if (!r_start.empty()) {
// Returns a copy of the input with trailing spaces, tabs, CRs and LFs stripped.
auto rtrim = [](std::string s) {
    const auto last_kept = s.find_last_not_of(" \n\r\t");
    // npos + 1 wraps to 0, so an empty or all-whitespace string is cleared entirely.
    s.erase(last_kept + 1);
    return s;
};
// Trim both the prompt and markers for suffix matching (markers may have trailing \n)
auto prompt_trimmed = rtrim(data.prompt);
auto r_end_trimmed = rtrim(r_end);
auto r_start_trimmed = rtrim(r_start);
if (!r_start_trimmed.empty()) {
// Check for start+end at end of prompt (e.g. <think></think>)
if (string_ends_with(prompt_trimmed, r_end)) {
auto before_end = prompt_trimmed.substr(0, prompt_trimmed.size() - r_end.size());
while (!before_end.empty() &&
(before_end.back() == ' ' || before_end.back() == '\n' ||
before_end.back() == '\r' || before_end.back() == '\t')) {
before_end.pop_back();
}
if (string_ends_with(before_end, r_start)) {
// Prompt ends with start + whitespace + end: extract from start to end of trimmed prompt
data.reasoning_prefill = prompt_trimmed.substr(before_end.size() - r_start.size());
if (string_ends_with(prompt_trimmed, r_end_trimmed)) {
auto before_end = rtrim(prompt_trimmed.substr(0, prompt_trimmed.size() - r_end_trimmed.size()));
if (string_ends_with(before_end, r_start_trimmed)) {
// Prompt ends with start + end markers (reasoning closed).
// Use the canonical markers from the analyzer to ensure whitespace
// (e.g. trailing \n in </think>\n) is preserved, even if the template
// rendered them without intermediate whitespace.
data.reasoning_prefill = r_start + r_end;
}
}
// Check for just start at end of prompt (e.g. <think>)
if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start)) {
data.reasoning_prefill = r_start;
// Check for just start at end of prompt (e.g. <think>\n)
if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start_trimmed)) {
// Extract from the original prompt to preserve trailing whitespace
auto start_pos = prompt_trimmed.size() - r_start_trimmed.size();
data.reasoning_prefill = data.prompt.substr(start_pos);
}
}
}
fprintf(stderr, "DEBUG reasoning_prefill: '%s' (start='%s', end='%s', mode=%d, reasoning_format=%d)\n",
data.reasoning_prefill.c_str(),
autoparser.reasoning.start.c_str(),
autoparser.reasoning.end.c_str(),
(int) autoparser.reasoning.mode,
(int) inputs.reasoning_format);
// Build the parser using the analysis results.
common_peg_arena parser = autoparser.build_parser(inputs);
data.parser = parser.save();
// Build grammar if tools are present
bool has_tools =
autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();

View File

@ -1614,12 +1614,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
if (auto_params.supports_thinking) {
auto_params.thinking_start_tag = autoparser.reasoning.start;
auto_params.thinking_end_tag = autoparser.reasoning.end;
// FORCED_OPEN and FORCED_CLOSED both put <think> in the generation prompt
// (FORCED_CLOSED forces empty <think></think> when thinking is disabled,
// but forces <think> open when thinking is enabled)
auto_params.thinking_forced_open =
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_OPEN ||
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_CLOSED;
}
return auto_params;
} catch (const std::exception & e) {

View File

@ -236,10 +236,14 @@ struct common_params_sampling {
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
// Grammar prefill: reasoning markers already present in the prompt suffix.
// Fed to the grammar sampler (to advance past pre-existing tokens) and used
// to determine the reasoning budget sampler's initial state.
std::string grammar_prefill;
// reasoning budget sampler parameters
// these are populated by the server/CLI based on chat template params
int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget
bool reasoning_budget_activate_immediately = false;
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)

View File

@ -251,15 +251,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
}
}
// Feed grammar prefill tokens to the grammar sampler so it advances past
// reasoning markers that the template already placed in the prompt.
std::vector<llama_token> prefill_tokens;
if (!params.grammar_prefill.empty() && vocab) {
prefill_tokens = common_tokenize(vocab, params.grammar_prefill, false, true);
if (grmr) {
for (const auto & token : prefill_tokens) {
llama_sampler_accept(grmr, token);
}
}
}
// reasoning budget sampler — added first so it can force tokens before other samplers
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
// Determine initial state from the grammar prefill: if the prefill tokens
// match the start sequence, reasoning is already open → start COUNTING.
bool activate_immediately = false;
if (!prefill_tokens.empty() && !params.reasoning_budget_start.empty() &&
prefill_tokens.size() >= params.reasoning_budget_start.size()) {
activate_immediately = std::equal(
params.reasoning_budget_start.begin(),
params.reasoning_budget_start.end(),
prefill_tokens.begin());
// But if the prefill also contains the end sequence after the start,
// reasoning was opened and closed — start IDLE instead.
if (activate_immediately &&
prefill_tokens.size() >= params.reasoning_budget_start.size() + params.reasoning_budget_end.size()) {
auto end_begin = prefill_tokens.begin() + (ptrdiff_t) params.reasoning_budget_start.size();
// The end sequence need not follow the start immediately (whitespace tokens may sit
// in between), so for simplicity only check whether the prefill ends with it.
auto end_start = prefill_tokens.end() - (ptrdiff_t) params.reasoning_budget_end.size();
if (end_start >= end_begin &&
std::equal(params.reasoning_budget_end.begin(), params.reasoning_budget_end.end(), end_start)) {
activate_immediately = false;
}
}
}
samplers.push_back(common_reasoning_budget_init(
vocab,
params.reasoning_budget_start,
params.reasoning_budget_end,
params.reasoning_budget_forced,
params.reasoning_budget_tokens,
params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
}
if (params.has_logit_bias()) {

View File

@ -995,6 +995,16 @@ static void test_peg_parser(common_chat_templates * tmpls,
grammar_triggered = true;
}
// For non-lazy grammars, prepend reasoning prefill to grammar input, just like
// PEG parsing does. The grammar includes the full reasoning pattern (e.g. optional
// <think>...</think>), but the model output may start mid-reasoning if the template
// already placed the opening tag in the prompt.
// For lazy grammars, the grammar only activates from the trigger position, so the
// reasoning prefill is irrelevant — reasoning is handled by the PEG parser.
if (!parser.params_.reasoning_prefill.empty() && earliest_trigger_pos == std::string::npos) {
constrained = parser.params_.reasoning_prefill + constrained;
}
// Test the constrained portion against the grammar
if (grammar_triggered && !tc.is_partial) {
auto result = match_string_detailed(constrained, grammar.get());

View File

@ -105,7 +105,7 @@ struct cli_context {
llama_get_model(ctx_server.get_llama_context()));
task.params.sampling.reasoning_budget_tokens = reasoning_budget;
task.params.sampling.reasoning_budget_activate_immediately = chat_params.thinking_forced_open;
task.params.sampling.grammar_prefill = chat_params.reasoning_prefill;
if (!chat_params.thinking_start_tag.empty()) {
task.params.sampling.reasoning_budget_start =

View File

@ -1113,7 +1113,6 @@ json oaicompat_chat_params_parse(
llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
llama_params["reasoning_budget_activate_immediately"] = chat_params.thinking_forced_open;
}
}

View File

@ -403,6 +403,7 @@ task_params server_task::params_from_json_cmpl(
params.chat_parser_params.reasoning_format = reasoning_format;
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.chat_parser_params.reasoning_prefill = json_value(data, "reasoning_prefill", std::string());
params.sampling.grammar_prefill = params.chat_parser_params.reasoning_prefill;
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
if (data.contains("chat_parser")) {
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
@ -469,10 +470,7 @@ task_params server_task::params_from_json_cmpl(
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
const auto message = json_value(data, "reasoning_budget_message", std::string());
const bool activate_imm = json_value(data, "reasoning_budget_activate_immediately", false);
params.sampling.reasoning_budget_tokens = budget;
params.sampling.reasoning_budget_activate_immediately = activate_imm;
if (!start_tag.empty()) {
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
@ -482,8 +480,8 @@ task_params server_task::params_from_json_cmpl(
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
}
SRV_DBG("reasoning budget: tokens=%d, activate_immediately=%s, start=%zu toks, end=%zu toks, forced=%zu toks\n",
budget, activate_imm ? "true" : "false",
SRV_DBG("reasoning budget: tokens=%d, grammar_prefill='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n",
budget, params.sampling.grammar_prefill.c_str(),
params.sampling.reasoning_budget_start.size(),
params.sampling.reasoning_budget_end.size(),
params.sampling.reasoning_budget_forced.size());