wip
This commit is contained in:
parent
060d4e4cfd
commit
2249a09f12
|
|
@ -39,15 +39,11 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
|
|||
common_chat_params peg_generator::generate_parser(const common_chat_template & tmpl,
|
||||
const struct templates_params & inputs,
|
||||
const autoparser & autoparser) {
|
||||
// Build the parser using the analysis results
|
||||
auto parser = autoparser.build_parser(inputs);
|
||||
|
||||
// Create the result structure
|
||||
common_chat_params data;
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.preserved_tokens = autoparser.preserved_tokens;
|
||||
data.parser = parser.save();
|
||||
|
||||
// Extract reasoning prefill from the end of the rendered prompt.
|
||||
// If the template added reasoning markers (e.g. <think> or <think></think>) at the end,
|
||||
|
|
@ -57,34 +53,53 @@ common_chat_params peg_generator::generate_parser(const common_chat_template &
|
|||
!autoparser.reasoning.end.empty()) {
|
||||
const auto & r_start = autoparser.reasoning.start;
|
||||
const auto & r_end = autoparser.reasoning.end;
|
||||
// Trim trailing whitespace from the prompt for suffix matching
|
||||
auto prompt_trimmed = data.prompt;
|
||||
while (!prompt_trimmed.empty() &&
|
||||
(prompt_trimmed.back() == ' ' || prompt_trimmed.back() == '\n' ||
|
||||
prompt_trimmed.back() == '\r' || prompt_trimmed.back() == '\t')) {
|
||||
prompt_trimmed.pop_back();
|
||||
}
|
||||
if (!r_start.empty()) {
|
||||
|
||||
// Helper to trim trailing whitespace from a string.
// Takes `s` by value and returns the trimmed copy, so the argument passed by
// the caller is left unmodified. Only trailing ' ', '\n', '\r' and '\t'
// characters are removed; leading and interior whitespace is kept. An empty
// or all-whitespace input yields an empty string.
|
||||
auto rtrim = [](std::string s) {
|
||||
while (!s.empty() && (s.back() == ' ' || s.back() == '\n' ||
|
||||
s.back() == '\r' || s.back() == '\t')) {
|
||||
s.pop_back();
|
||||
}
|
||||
return s;
|
||||
};
|
||||
|
||||
// Trim both the prompt and markers for suffix matching (markers may have trailing \n)
|
||||
auto prompt_trimmed = rtrim(data.prompt);
|
||||
auto r_end_trimmed = rtrim(r_end);
|
||||
auto r_start_trimmed = rtrim(r_start);
|
||||
|
||||
if (!r_start_trimmed.empty()) {
|
||||
// Check for start+end at end of prompt (e.g. <think></think>)
|
||||
if (string_ends_with(prompt_trimmed, r_end)) {
|
||||
auto before_end = prompt_trimmed.substr(0, prompt_trimmed.size() - r_end.size());
|
||||
while (!before_end.empty() &&
|
||||
(before_end.back() == ' ' || before_end.back() == '\n' ||
|
||||
before_end.back() == '\r' || before_end.back() == '\t')) {
|
||||
before_end.pop_back();
|
||||
}
|
||||
if (string_ends_with(before_end, r_start)) {
|
||||
// Prompt ends with start + whitespace + end: extract from start to end of trimmed prompt
|
||||
data.reasoning_prefill = prompt_trimmed.substr(before_end.size() - r_start.size());
|
||||
if (string_ends_with(prompt_trimmed, r_end_trimmed)) {
|
||||
auto before_end = rtrim(prompt_trimmed.substr(0, prompt_trimmed.size() - r_end_trimmed.size()));
|
||||
if (string_ends_with(before_end, r_start_trimmed)) {
|
||||
// Prompt ends with start + end markers (reasoning closed).
|
||||
// Use the canonical markers from the analyzer to ensure whitespace
|
||||
// (e.g. trailing \n in </think>\n) is preserved, even if the template
|
||||
// rendered them without intermediate whitespace.
|
||||
data.reasoning_prefill = r_start + r_end;
|
||||
}
|
||||
}
|
||||
// Check for just start at end of prompt (e.g. <think>)
|
||||
if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start)) {
|
||||
data.reasoning_prefill = r_start;
|
||||
// Check for just start at end of prompt (e.g. <think>\n)
|
||||
if (data.reasoning_prefill.empty() && string_ends_with(prompt_trimmed, r_start_trimmed)) {
|
||||
// Extract from the original prompt to preserve trailing whitespace
|
||||
auto start_pos = prompt_trimmed.size() - r_start_trimmed.size();
|
||||
data.reasoning_prefill = data.prompt.substr(start_pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "DEBUG reasoning_prefill: '%s' (start='%s', end='%s', mode=%d, reasoning_format=%d)\n",
|
||||
data.reasoning_prefill.c_str(),
|
||||
autoparser.reasoning.start.c_str(),
|
||||
autoparser.reasoning.end.c_str(),
|
||||
(int) autoparser.reasoning.mode,
|
||||
(int) inputs.reasoning_format);
|
||||
|
||||
// Build the parser using the analysis results.
|
||||
common_peg_arena parser = autoparser.build_parser(inputs);
|
||||
data.parser = parser.save();
|
||||
|
||||
// Build grammar if tools are present
|
||||
bool has_tools =
|
||||
autoparser.tools.format.mode != tool_format::NONE && inputs.tools.is_array() && !inputs.tools.empty();
|
||||
|
|
|
|||
|
|
@ -1614,12 +1614,6 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
|||
if (auto_params.supports_thinking) {
|
||||
auto_params.thinking_start_tag = autoparser.reasoning.start;
|
||||
auto_params.thinking_end_tag = autoparser.reasoning.end;
|
||||
// FORCED_OPEN and FORCED_CLOSED both put <think> in the generation prompt
|
||||
// (FORCED_CLOSED forces empty <think></think> when thinking is disabled,
|
||||
// but forces <think> open when thinking is enabled)
|
||||
auto_params.thinking_forced_open =
|
||||
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_OPEN ||
|
||||
autoparser.reasoning.mode == autoparser::reasoning_mode::FORCED_CLOSED;
|
||||
}
|
||||
return auto_params;
|
||||
} catch (const std::exception & e) {
|
||||
|
|
|
|||
|
|
@ -236,10 +236,14 @@ struct common_params_sampling {
|
|||
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
||||
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
|
||||
|
||||
// Grammar prefill: reasoning markers already present in the prompt suffix.
|
||||
// Fed to the grammar sampler (to advance past pre-existing tokens) and used
|
||||
// to determine the reasoning budget sampler's initial state.
|
||||
std::string grammar_prefill;
|
||||
|
||||
// reasoning budget sampler parameters
|
||||
// these are populated by the server/CLI based on chat template params
|
||||
int32_t reasoning_budget_tokens = -1; // -1 = disabled, >= 0 = token budget
|
||||
bool reasoning_budget_activate_immediately = false;
|
||||
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
|
||||
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
|
||||
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
|
||||
|
|
|
|||
|
|
@ -251,15 +251,51 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
|||
}
|
||||
}
|
||||
|
||||
// Feed grammar prefill tokens to the grammar sampler so it advances past
|
||||
// reasoning markers that the template already placed in the prompt.
|
||||
std::vector<llama_token> prefill_tokens;
|
||||
if (!params.grammar_prefill.empty() && vocab) {
|
||||
prefill_tokens = common_tokenize(vocab, params.grammar_prefill, false, true);
|
||||
if (grmr) {
|
||||
for (const auto & token : prefill_tokens) {
|
||||
llama_sampler_accept(grmr, token);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reasoning budget sampler — added first so it can force tokens before other samplers
|
||||
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
|
||||
// Determine initial state from the grammar prefill: if the prefill tokens
|
||||
// match the start sequence, reasoning is already open → start COUNTING.
|
||||
bool activate_immediately = false;
|
||||
if (!prefill_tokens.empty() && !params.reasoning_budget_start.empty() &&
|
||||
prefill_tokens.size() >= params.reasoning_budget_start.size()) {
|
||||
activate_immediately = std::equal(
|
||||
params.reasoning_budget_start.begin(),
|
||||
params.reasoning_budget_start.end(),
|
||||
prefill_tokens.begin());
|
||||
// But if the prefill also contains the end sequence after the start,
|
||||
// reasoning was opened and closed — start IDLE instead.
|
||||
if (activate_immediately &&
|
||||
prefill_tokens.size() >= params.reasoning_budget_start.size() + params.reasoning_budget_end.size()) {
|
||||
auto end_begin = prefill_tokens.begin() + (ptrdiff_t) params.reasoning_budget_start.size();
|
||||
// Check if remaining tokens after start match the end sequence
|
||||
// (possibly with whitespace tokens in between, but for simplicity check suffix)
|
||||
auto end_start = prefill_tokens.end() - (ptrdiff_t) params.reasoning_budget_end.size();
|
||||
if (end_start >= end_begin &&
|
||||
std::equal(params.reasoning_budget_end.begin(), params.reasoning_budget_end.end(), end_start)) {
|
||||
activate_immediately = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
samplers.push_back(common_reasoning_budget_init(
|
||||
vocab,
|
||||
params.reasoning_budget_start,
|
||||
params.reasoning_budget_end,
|
||||
params.reasoning_budget_forced,
|
||||
params.reasoning_budget_tokens,
|
||||
params.reasoning_budget_activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
|
||||
activate_immediately ? REASONING_BUDGET_COUNTING : REASONING_BUDGET_IDLE));
|
||||
}
|
||||
|
||||
if (params.has_logit_bias()) {
|
||||
|
|
|
|||
|
|
@ -995,6 +995,16 @@ static void test_peg_parser(common_chat_templates * tmpls,
|
|||
grammar_triggered = true;
|
||||
}
|
||||
|
||||
// For non-lazy grammars, prepend reasoning prefill to grammar input, just like
|
||||
// PEG parsing does. The grammar includes the full reasoning pattern (e.g. optional
|
||||
// <think>...</think>), but the model output may start mid-reasoning if the template
|
||||
// already placed the opening tag in the prompt.
|
||||
// For lazy grammars, the grammar only activates from the trigger position, so the
|
||||
// reasoning prefill is irrelevant — reasoning is handled by the PEG parser.
|
||||
if (!parser.params_.reasoning_prefill.empty() && earliest_trigger_pos == std::string::npos) {
|
||||
constrained = parser.params_.reasoning_prefill + constrained;
|
||||
}
|
||||
|
||||
// Test the constrained portion against the grammar
|
||||
if (grammar_triggered && !tc.is_partial) {
|
||||
auto result = match_string_detailed(constrained, grammar.get());
|
||||
|
|
|
|||
|
|
@ -105,7 +105,7 @@ struct cli_context {
|
|||
llama_get_model(ctx_server.get_llama_context()));
|
||||
|
||||
task.params.sampling.reasoning_budget_tokens = reasoning_budget;
|
||||
task.params.sampling.reasoning_budget_activate_immediately = chat_params.thinking_forced_open;
|
||||
task.params.sampling.grammar_prefill = chat_params.reasoning_prefill;
|
||||
|
||||
if (!chat_params.thinking_start_tag.empty()) {
|
||||
task.params.sampling.reasoning_budget_start =
|
||||
|
|
|
|||
|
|
@ -1113,7 +1113,6 @@ json oaicompat_chat_params_parse(
|
|||
llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
|
||||
llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
|
||||
llama_params["reasoning_budget_message"] = opt.reasoning_budget_message;
|
||||
llama_params["reasoning_budget_activate_immediately"] = chat_params.thinking_forced_open;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -403,6 +403,7 @@ task_params server_task::params_from_json_cmpl(
|
|||
params.chat_parser_params.reasoning_format = reasoning_format;
|
||||
params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
|
||||
params.chat_parser_params.reasoning_prefill = json_value(data, "reasoning_prefill", std::string());
|
||||
params.sampling.grammar_prefill = params.chat_parser_params.reasoning_prefill;
|
||||
params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
|
||||
if (data.contains("chat_parser")) {
|
||||
params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
|
||||
|
|
@ -469,10 +470,7 @@ task_params server_task::params_from_json_cmpl(
|
|||
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
|
||||
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
|
||||
const auto message = json_value(data, "reasoning_budget_message", std::string());
|
||||
const bool activate_imm = json_value(data, "reasoning_budget_activate_immediately", false);
|
||||
|
||||
params.sampling.reasoning_budget_tokens = budget;
|
||||
params.sampling.reasoning_budget_activate_immediately = activate_imm;
|
||||
|
||||
if (!start_tag.empty()) {
|
||||
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
|
||||
|
|
@ -482,8 +480,8 @@ task_params server_task::params_from_json_cmpl(
|
|||
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
|
||||
}
|
||||
|
||||
SRV_DBG("reasoning budget: tokens=%d, activate_immediately=%s, start=%zu toks, end=%zu toks, forced=%zu toks\n",
|
||||
budget, activate_imm ? "true" : "false",
|
||||
SRV_DBG("reasoning budget: tokens=%d, grammar_prefill='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n",
|
||||
budget, params.sampling.grammar_prefill.c_str(),
|
||||
params.sampling.reasoning_budget_start.size(),
|
||||
params.sampling.reasoning_budget_end.size(),
|
||||
params.sampling.reasoning_budget_forced.size());
|
||||
|
|
|
|||
Loading…
Reference in New Issue