From c25aed1f5c6ccb030cf761634847e612f5a7b6a2 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin
Date: Sat, 21 Mar 2026 22:00:33 +0100
Subject: [PATCH] Compilation fixes

---
 common/sampling.cpp             |  3 ---
 include/llama.h                 |  7 +++++++
 src/llama-sampler.h             |  4 ----
 tools/server/server-context.cpp | 35 +++++++++++++++++------------------
 4 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index c3194ba3f0..db7d2912be 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -483,9 +483,6 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
     }
 }
 
-// forward declaration of internal function (defined in llama-sampler.cpp)
-void llama_sampler_grammar_set_trigger_suppressed(struct llama_sampler * smpl, bool suppressed);
-
 void common_sampler_set_grammar_trigger_suppressed(struct common_sampler * gsmpl, bool suppressed) {
     if (!gsmpl || !gsmpl->grmr) {
         return;
diff --git a/include/llama.h b/include/llama.h
index 6e72db7e3c..34ceeefdf1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1380,6 +1380,13 @@ extern "C" {
               const llama_token * trigger_tokens,
                           size_t num_trigger_tokens);
 
+    /// @details Suppress or un-suppress trigger detection on a grammar sampler.
+    /// When suppressed, the grammar still buffers tokens but does not check for triggers.
+    /// Useful for suppressing grammar activation during reasoning/thinking blocks.
+    /// No-op if the sampler is not a grammar sampler.
+    LLAMA_API void llama_sampler_grammar_set_trigger_suppressed(
+            struct llama_sampler * smpl,
+            bool suppressed);
     /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
                          int32_t   penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
diff --git a/src/llama-sampler.h b/src/llama-sampler.h
index 11ad399fb8..b9bfc20d25 100644
--- a/src/llama-sampler.h
+++ b/src/llama-sampler.h
@@ -33,10 +33,6 @@ struct llama_sampler_chain {
     mutable int32_t n_sample;
 };
 
-// set trigger_suppressed on a grammar sampler (e.g. to suppress triggers during reasoning)
-// the sampler must have been created by llama_sampler_init_grammar* or this is a no-op
-void llama_sampler_grammar_set_trigger_suppressed(struct llama_sampler * smpl, bool suppressed);
-
 struct llama_sampler * llama_sampler_init_dry_testing(
         int32_t context_size,
         float dry_multiplier,
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index ca4d15ec88..0e137447b7 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1241,26 +1241,25 @@ private:
         const auto & end_tag   = slot.task->params.thinking_end_tag;
         const auto & start_tag = slot.task->params.thinking_start_tag;
         if (slot.in_reasoning) {
-            // check if the end tag just appeared
-            if (slot.generated_text.size() >= end_tag.size()) {
-                auto tail = std::string_view(slot.generated_text).substr(
-                    slot.generated_text.size() - end_tag.size());
-                if (tail == end_tag) {
-                    slot.in_reasoning = false;
-                    common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), false);
-                    SLT_DBG(slot, "reasoning ended, grammar triggers un-suppressed\n%s", "");
-                }
+            // check if the end tag just appeared at the end of generated_text
+            if (slot.generated_text.size() >= end_tag.size()
+                    && slot.generated_text.compare(
+                        slot.generated_text.size() - end_tag.size(),
+                        end_tag.size(), end_tag) == 0) {
+                slot.in_reasoning = false;
+                common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), false);
+                SLT_DBG(slot, "reasoning ended, grammar triggers un-suppressed\n%s", "");
             }
         } else {
-            // check if the start tag just appeared
-            if (!start_tag.empty() && slot.generated_text.size() >= start_tag.size()) {
-                auto tail = std::string_view(slot.generated_text).substr(
-                    slot.generated_text.size() - start_tag.size());
-                if (tail == start_tag) {
-                    slot.in_reasoning = true;
-                    common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), true);
-                    SLT_DBG(slot, "reasoning started, grammar triggers suppressed\n%s", "");
-                }
+            // check if the start tag just appeared at the end of generated_text
+            if (!start_tag.empty()
+                    && slot.generated_text.size() >= start_tag.size()
+                    && slot.generated_text.compare(
+                        slot.generated_text.size() - start_tag.size(),
+                        start_tag.size(), start_tag) == 0) {
+                slot.in_reasoning = true;
+                common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), true);
+                SLT_DBG(slot, "reasoning started, grammar triggers suppressed\n%s", "");
             }
         }
     }