sampling : handle grammar prefill crash for Functionary v3.2
The Functionary v3.2 trigger pattern `>>>(?!all)` matches the `>>>` at the end of the generation_prompt during grammar prefill. This activates the grammar prematurely and crashes with 'Unexpected empty grammar stack', returning HTTP 400 before inference starts. Fix: catch the prefill exception, disable the grammar, and log a warning. The model then generates unconstrained, but the parser still extracts tool calls from well-formed output. This is safe because the crash only occurs with template overrides (--chat-template-file) where the generation_prompt contains the trigger text. The test verifies the precondition — that the trigger pattern matches the generation_prompt — confirming the prefill catch path is exercised. Test: cmake -B build -DLLAMA_BUILD_TESTS=ON -DLLAMA_BUILD_TOOLS=OFF; cmake --build build --target test-chat; ./build/bin/test-chat
This commit is contained in:
parent
c1b911654a
commit
fba6b87ab2
|
|
@ -1774,7 +1774,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
|
|||
return msg;
|
||||
}
|
||||
throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " +
|
||||
input.substr(result.end));
|
||||
effective_input.substr(result.end));
|
||||
}
|
||||
|
||||
common_chat_msg msg;
|
||||
|
|
|
|||
|
|
@ -277,9 +277,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
|||
LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
|
||||
}
|
||||
} catch (std::exception &e) {
|
||||
LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
|
||||
common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
|
||||
throw e;
|
||||
LOG_WRN("%s: grammar prefill failed, disabling grammar constraints: %s\n",
|
||||
__func__, e.what());
|
||||
llama_sampler_free(grmr);
|
||||
grmr = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1954,6 +1954,99 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
|||
}
|
||||
}
|
||||
|
||||
// Verify the throw path produces a readable error message, not std::out_of_range.
|
||||
// #20424 introduced effective_input = generation_prompt + input, but the throw
|
||||
// uses input.substr(result.end) where result.end is in effective_input space.
|
||||
{
|
||||
auto tmpls = common_chat_templates_ptr(
|
||||
common_chat_templates_init(nullptr, read_file("models/templates/GLM-4.7-Flash.jinja")));
|
||||
|
||||
static common_chat_tool weather_tool{
|
||||
"get_weather", "Get weather",
|
||||
R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})",
|
||||
};
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.tools = { weather_tool };
|
||||
inputs.enable_thinking = true;
|
||||
inputs.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.use_jinja = true;
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
msg.content = "get_weather";
|
||||
inputs.messages = { msg };
|
||||
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
common_peg_arena arena;
|
||||
arena.load(params.parser);
|
||||
common_chat_parser_params pp(params);
|
||||
|
||||
// generation_prompt is non-empty for thinking models, so result.end
|
||||
// will be offset by generation_prompt.size() into effective_input space.
|
||||
assert(!pp.generation_prompt.empty());
|
||||
|
||||
std::string bad_input =
|
||||
"Thinking.\n"
|
||||
"</think>"
|
||||
"<tool_call>get_weather"
|
||||
"<arg_key>city</arg_key><arg_value>Tokyo</arg_value>"
|
||||
"</tool_call>\n";
|
||||
|
||||
bool got_runtime_error = false;
|
||||
bool got_out_of_range = false;
|
||||
std::string error_msg;
|
||||
try {
|
||||
common_chat_peg_parse(arena, bad_input, /*is_partial=*/false, pp);
|
||||
} catch (const std::out_of_range & e) {
|
||||
got_out_of_range = true;
|
||||
error_msg = e.what();
|
||||
} catch (const std::runtime_error & e) {
|
||||
got_runtime_error = true;
|
||||
error_msg = e.what();
|
||||
}
|
||||
GGML_ASSERT(!got_out_of_range && "throw path crashed with out_of_range (input.substr in effective_input space)");
|
||||
GGML_ASSERT(got_runtime_error && "throw path should produce std::runtime_error with parse position");
|
||||
}
|
||||
|
||||
// Functionary v3.2: the trigger pattern >>>(?!all) matches >>> at the end
|
||||
// of the generation_prompt. This causes the grammar to activate during prefill
|
||||
// and crash. The fix in sampling.cpp catches the prefill exception and disables
|
||||
// grammar constraints rather than crashing with 400.
|
||||
{
|
||||
auto tmpls = common_chat_templates_ptr(
|
||||
common_chat_templates_init(nullptr, read_file("models/templates/meetkai-functionary-medium-v3.2.jinja")));
|
||||
|
||||
common_chat_templates_inputs inputs;
|
||||
inputs.tools = { special_function_tool };
|
||||
inputs.add_generation_prompt = true;
|
||||
inputs.use_jinja = true;
|
||||
inputs.messages = {{"user", "hi"}};
|
||||
|
||||
auto params = common_chat_templates_apply(tmpls.get(), inputs);
|
||||
GGML_ASSERT(params.grammar_lazy);
|
||||
GGML_ASSERT(!params.grammar.empty());
|
||||
|
||||
// generation_prompt ends with >>> which is the trigger text
|
||||
GGML_ASSERT(params.generation_prompt.size() >= 3);
|
||||
GGML_ASSERT(params.generation_prompt.substr(params.generation_prompt.size() - 3) == ">>>");
|
||||
|
||||
// The trigger pattern matches the generation_prompt -- this is the
|
||||
// precondition that causes the grammar to activate during prefill.
|
||||
// Without the catch in sampling.cpp, this crashes with 400.
|
||||
bool trigger_matches_prompt = false;
|
||||
for (const auto & trigger : params.grammar_triggers) {
|
||||
if (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) {
|
||||
llama_grammar_trigger_pattern pat { trigger.value, std::regex(trigger.value) };
|
||||
if (pat.find(params.generation_prompt) != std::string::npos) {
|
||||
trigger_matches_prompt = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(trigger_matches_prompt
|
||||
&& "trigger must match generation_prompt to exercise the prefill catch path");
|
||||
}
|
||||
|
||||
// Kimi-K2-Thinking tests - custom parser
|
||||
// Unique feature: tool call ID embeds function name as functions.<name>:<counter>
|
||||
{
|
||||
|
|
|
|||
Loading…
Reference in New Issue