From fba6b87ab2fac78df18b886e089a04d65e7ee57c Mon Sep 17 00:00:00 2001 From: James O'Leary <65884233+jpohhhh@users.noreply.github.com> Date: Thu, 19 Mar 2026 18:37:22 -0700 Subject: [PATCH] sampling : handle grammar prefill crash for Functionary v3.2 The Functionary v3.2 trigger pattern >>>(?!all) matches >>> at the end of the generation_prompt during grammar prefill. This activates the grammar prematurely and crashes with 'Unexpected empty grammar stack', returning HTTP 400 before inference starts. Fix: catch the prefill exception, disable grammar, and warn. The model generates unconstrained but the parser still extracts tool calls from well-formed output. This is safe because the crash only occurs with template overrides (--chat-template-file) where the generation_prompt contains the trigger text. Test verifies the precondition: trigger pattern matches the generation_prompt, confirming the prefill catch path is exercised. Test: cmake -B build -DLLAMA_BUILD_TESTS=ON -DLLAMA_BUILD_TOOLS=OFF cmake --build build --target test-chat ./build/bin/test-chat --- common/chat.cpp | 2 +- common/sampling.cpp | 7 ++-- tests/test-chat.cpp | 93 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 4 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index e129581fd2..a79d564b34 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1774,7 +1774,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars return msg; } throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " + - input.substr(result.end)); + effective_input.substr(result.end)); } common_chat_msg msg; diff --git a/common/sampling.cpp b/common/sampling.cpp index 012e212660..5e3a761ba9 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -277,9 +277,10 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token); } } catch 
(std::exception &e) { - LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__, - common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str()); - throw e; + LOG_WRN("%s: grammar prefill failed, disabling grammar constraints: %s\n", + __func__, e.what()); + llama_sampler_free(grmr); + grmr = nullptr; } } } diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 58fef8e99c..07102db391 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1954,6 +1954,99 @@ static void test_template_output_peg_parsers(bool detailed_debug) { } } + // Verify the throw path produces a readable error message, not std::out_of_range. + // #20424 introduced effective_input = generation_prompt + input, but the throw + // uses input.substr(result.end) where result.end is in effective_input space. + { + auto tmpls = common_chat_templates_ptr( + common_chat_templates_init(nullptr, read_file("models/templates/GLM-4.7-Flash.jinja"))); + + static common_chat_tool weather_tool{ + "get_weather", "Get weather", + R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})", + }; + + common_chat_templates_inputs inputs; + inputs.tools = { weather_tool }; + inputs.enable_thinking = true; + inputs.reasoning_format = COMMON_REASONING_FORMAT_AUTO; + inputs.add_generation_prompt = true; + inputs.use_jinja = true; + common_chat_msg msg; + msg.role = "user"; + msg.content = "get_weather"; + inputs.messages = { msg }; + + auto params = common_chat_templates_apply(tmpls.get(), inputs); + common_peg_arena arena; + arena.load(params.parser); + common_chat_parser_params pp(params); + + // generation_prompt is non-empty for thinking models, so result.end + // will be offset by generation_prompt.size() into effective_input space. 
+ assert(!pp.generation_prompt.empty()); + + std::string bad_input = + "<think>Thinking.</think>\n" + "<tool_call>" + "get_weather" + "<arg_key>city</arg_key><arg_value>Tokyo" + "\n"; + + bool got_runtime_error = false; + bool got_out_of_range = false; + std::string error_msg; + try { + common_chat_peg_parse(arena, bad_input, /*is_partial=*/false, pp); + } catch (const std::out_of_range & e) { + got_out_of_range = true; + error_msg = e.what(); + } catch (const std::runtime_error & e) { + got_runtime_error = true; + error_msg = e.what(); + } + GGML_ASSERT(!got_out_of_range && "throw path crashed with out_of_range (input.substr in effective_input space)"); + GGML_ASSERT(got_runtime_error && "throw path should produce std::runtime_error with parse position"); + } + + // Functionary v3.2: the trigger pattern >>>(?!all) matches >>> at the end + // of the generation_prompt. This causes the grammar to activate during prefill + // and crash. The fix in sampling.cpp catches the prefill exception and disables + // grammar constraints rather than crashing with 400. + { + auto tmpls = common_chat_templates_ptr( + common_chat_templates_init(nullptr, read_file("models/templates/meetkai-functionary-medium-v3.2.jinja"))); + + common_chat_templates_inputs inputs; + inputs.tools = { special_function_tool }; + inputs.add_generation_prompt = true; + inputs.use_jinja = true; + inputs.messages = {{"user", "hi"}}; + + auto params = common_chat_templates_apply(tmpls.get(), inputs); + GGML_ASSERT(params.grammar_lazy); + GGML_ASSERT(!params.grammar.empty()); + + // generation_prompt ends with >>> which is the trigger text + GGML_ASSERT(params.generation_prompt.size() >= 3); + GGML_ASSERT(params.generation_prompt.substr(params.generation_prompt.size() - 3) == ">>>"); + + // The trigger pattern matches the generation_prompt -- this is the + // precondition that causes the grammar to activate during prefill. + // Without the catch in sampling.cpp, this crashes with 400. 
+ bool trigger_matches_prompt = false; + for (const auto & trigger : params.grammar_triggers) { + if (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) { + std::regex trigger_re(trigger.value); + if (std::regex_search(params.generation_prompt, trigger_re)) { + trigger_matches_prompt = true; + } + } + } + GGML_ASSERT(trigger_matches_prompt + && "trigger must match generation_prompt to exercise the prefill catch path"); + } + // Kimi-K2-Thinking tests - custom parser // Unique feature: tool call ID embeds function name as functions.: {