From 370cdb9f263c4d4a0f81a7e5c4b622223a05687d Mon Sep 17 00:00:00 2001
From: James O'Leary <65884233+jpohhhh@users.noreply.github.com>
Date: Thu, 19 Mar 2026 18:37:22 -0700
Subject: [PATCH] grammar : fix lazy trigger crash during generation_prompt
 prefill

When a lazy grammar trigger pattern matches text in the generation_prompt
(e.g. Functionary v3.2's >>>(?!all) matches >>> at the end of the prompt),
the grammar activates during prefill and crashes with 'Unexpected empty
grammar stack' because the trigger text doesn't match the grammar's
expected start.

Fix: catch the prefill exception, disable grammar, and warn. The model
generates unconstrained but the parser still extracts tool calls. This
is safe because:
- The trigger firing during prefill is a false positive (the trigger text
  is part of the prompt template, not model output)
- Grammar constraints are a generation optimization, not a correctness
  requirement -- the parser handles extraction

An earlier approach changed find_start_pos to not replay trigger text
through the grammar. That broke Nemotron, whose grammar root starts
with the trigger literal (<tool_call>) and needs the replay to advance
past it during generation. The catch approach is correct because it only
affects the prefill path where the trigger fires prematurely, while
leaving the generation-time replay intact.

Verified with Qwen3.5-0.8B + Functionary v3.2 template override:
a request with tools returns 200 instead of failing with 400.

Test:

  cmake -B build -DLLAMA_BUILD_TESTS=ON -DLLAMA_BUILD_TOOLS=OFF
  cmake --build build --target test-chat
  ./build/bin/test-chat
---
 common/chat.cpp     |  2 +-
 tests/test-chat.cpp | 55 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 1 deletion(-)
diff --git a/common/chat.cpp b/common/chat.cpp
index e129581fd2..a79d564b34 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1774,7 +1774,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena &          src_pars
             return msg;
         }
         throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " +
-                                 input.substr(result.end));
+                                 effective_input.substr(result.end));
     }
 
     common_chat_msg msg;
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index 58fef8e99c..faac9e7306 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -1954,6 +1954,61 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
         }
     }
 
+    // Verify the throw path produces a readable error message, not std::out_of_range.
+    // #20424 introduced effective_input = generation_prompt + input, but the throw
+    // used input.substr(result.end) although result.end indexes effective_input.
+    {
+        auto tmpls = common_chat_templates_ptr(
+            common_chat_templates_init(nullptr, read_file("models/templates/GLM-4.7-Flash.jinja")));
+
+        static common_chat_tool weather_tool{
+            "get_weather", "Get weather",
+            R"({"type":"object","properties":{"city":{"type":"string"}},"required":["city"]})",
+        };
+
+        common_chat_templates_inputs inputs;
+        inputs.tools = { weather_tool };
+        inputs.enable_thinking = true;
+        inputs.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+        inputs.add_generation_prompt = true;
+        inputs.use_jinja = true;
+        common_chat_msg msg;
+        msg.role = "user";
+        msg.content = "get_weather";
+        inputs.messages = { msg };
+
+        auto params = common_chat_templates_apply(tmpls.get(), inputs);
+        common_peg_arena arena;
+        arena.load(params.parser);
+        common_chat_parser_params pp(params);
+
+        // generation_prompt is non-empty for thinking models, so result.end
+        // will be offset by generation_prompt.size() into effective_input space.
+        assert(!pp.generation_prompt.empty());
+
+        std::string bad_input =
+            "Thinking.\n"
+            "</think>"
+            "<tool_call>get_weather"
+            "<arg_key>city</arg_key><arg_value>Tokyo</arg_value>"
+            "</tool_call>\n";
+
+        bool got_runtime_error = false;
+        bool got_out_of_range = false;
+        std::string error_msg;
+        try {
+            common_chat_peg_parse(arena, bad_input, /*is_partial=*/false, pp);
+        } catch (const std::out_of_range & e) {
+            got_out_of_range = true;
+            error_msg = e.what();
+        } catch (const std::runtime_error & e) {
+            got_runtime_error = true;
+            error_msg = e.what();
+        }
+        GGML_ASSERT(!got_out_of_range && "throw path crashed with out_of_range (input.substr in effective_input space)");
+        GGML_ASSERT(got_runtime_error  && "throw path should produce std::runtime_error with parse position");
+    }
+
     // Kimi-K2-Thinking tests - custom parser
     // Unique feature: tool call ID embeds function name as functions.<name>:<counter>
     {