From fba6b87ab2fac78df18b886e089a04d65e7ee57c Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Thu, 19 Mar 2026 18:37:22 -0700 Subject: [PATCH] sampling : handle grammar prefill crash for Functionary v3.2 The Functionary v3.2 trigger pattern >>>(?!all) matches >>> at the end of the generation_prompt during grammar prefill. This activates the grammar prematurely and crashes with 'Unexpected empty grammar stack', returning HTTP 400 before inference starts. Fix: catch the prefill exception, disable grammar, and warn. The model generates unconstrained but the parser still extracts tool calls from well-formed output. This is safe because the crash only occurs with template overrides (--chat-template-file) where the generation_prompt contains the trigger text. Test verifies the precondition: trigger pattern matches the generation_prompt, confirming the prefill catch path is exercised. Test: cmake -B build -DLLAMA_BUILD_TESTS=ON -DLLAMA_BUILD_TOOLS=OFF cmake --build build --target test-chat ./build/bin/test-chat --- common/chat.cpp | 2 +- common/sampling.cpp | 7 ++-- tests/test-chat.cpp | 93 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index e129581fd2..a79d564b34 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1774,7 +1774,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars return msg; } throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " + - input.substr(result.end)); + effective_input.substr(result.end)); } common_chat_msg msg; diff --git a/common/sampling.cpp b/common/sampling.cpp index 012e212660..5e3a761ba9 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -277,9 +277,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token); } } catch 
(std::exception &e) { - LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__, - common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str()); - throw e; + LOG_WRN("%s: grammar prefill failed, disabling grammar constraints: %s\n", + __func__, e.what()); + llama_sampler_free(grmr); + grmr = nullptr; } } } diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 58fef8e99c..07102db391 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1954,6 +1954,99 @@ static void test_template_output_peg_parsers(bool detailed_debug) { } } + // Verify the throw path produces a readable error message, not std::out_of_range. + // #20424 introduced effective_input = generation_prompt + input, but the throw + // uses input.substr(result.end) where result.end is in effective_input space. + { + auto tmpls = common_chat_templates_ptr( + common_chat_templates_init(nullptr, read_file("models/templates/GLM-4.7-Flash.jinja"))); + + static common_chat_tool weather_tool{ + "get_weather", "Get weather", + R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})", + }; + + common_chat_templates_inputs inputs; + inputs.tools = { weather_tool }; + inputs.enable_thinking = true; + inputs.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + inputs.add_generation_prompt = true; + inputs.use_jinja = true; + common_chat_msg msg; + msg.role = "user"; + msg.content = "get_weather"; + inputs.messages = { msg }; + + auto params = common_chat_templates_apply(tmpls.get(), inputs); + common_peg_arena arena; + arena.load(params.parser); + common_chat_parser_params pp(params); + + // generation_prompt is non-empty for thinking models, so result.end + // will be offset by generation_prompt.size() into effective_input space. 
+ assert(!pp.generation_prompt.empty()); + + std::string bad_input = + "<think>Thinking.</think>\n" + "<tool_call>" + "get_weather" + "<arg_key>city</arg_key><arg_value>Tokyo" + "\n"; + + bool got_runtime_error = false; + bool got_out_of_range = false; + std::string error_msg; + try { + common_chat_peg_parse(arena, bad_input, /*is_partial=*/false, pp); + } catch (const std::out_of_range & e) { + got_out_of_range = true; + error_msg = e.what(); + } catch (const std::runtime_error & e) { + got_runtime_error = true; + error_msg = e.what(); + } + GGML_ASSERT(!got_out_of_range && "throw path crashed with out_of_range (input.substr in effective_input space)"); + GGML_ASSERT(got_runtime_error && "throw path should produce std::runtime_error with parse position"); + } + + // Functionary v3.2: the trigger pattern >>>(?!all) matches >>> at the end + // of the generation_prompt. This causes the grammar to activate during prefill + // and crash. The fix in sampling.cpp catches the prefill exception and disables + // grammar constraints rather than crashing with 400. + { + auto tmpls = common_chat_templates_ptr( + common_chat_templates_init(nullptr, read_file("models/templates/meetkai-functionary-medium-v3.2.jinja"))); + + common_chat_templates_inputs inputs; + inputs.tools = { special_function_tool }; + inputs.add_generation_prompt = true; + inputs.use_jinja = true; + inputs.messages = {{"user", "hi"}}; + + auto params = common_chat_templates_apply(tmpls.get(), inputs); + GGML_ASSERT(params.grammar_lazy); + GGML_ASSERT(!params.grammar.empty()); + + // generation_prompt ends with >>> which is the trigger text + GGML_ASSERT(params.generation_prompt.size() >= 3); + GGML_ASSERT(params.generation_prompt.substr(params.generation_prompt.size() - 3) == ">>>"); + + // The trigger pattern matches the generation_prompt -- this is the + // precondition that causes the grammar to activate during prefill. + // Without the catch in sampling.cpp, this crashes with 400. 
+ bool trigger_matches_prompt = false; + for (const auto & trigger : params.grammar_triggers) { + if (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) { + std::regex trigger_re(trigger.value); + if (std::regex_search(params.generation_prompt, trigger_re)) { + trigger_matches_prompt = true; + } + } + } + GGML_ASSERT(trigger_matches_prompt + && "trigger must match generation_prompt to exercise the prefill catch path"); + } + // Kimi-K2-Thinking tests - custom parser // Unique feature: tool call ID embeds function name as functions.: {