Compilation fixes

This commit is contained in:
Piotr Wilkin 2026-03-21 22:00:33 +01:00
parent 982cf3b6a2
commit c25aed1f5c
4 changed files with 24 additions and 25 deletions

View File

@@ -483,9 +483,6 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
}
}
// forward declaration of internal function (defined in llama-sampler.cpp)
void llama_sampler_grammar_set_trigger_suppressed(struct llama_sampler * smpl, bool suppressed);
void common_sampler_set_grammar_trigger_suppressed(struct common_sampler * gsmpl, bool suppressed) {
if (!gsmpl || !gsmpl->grmr) {
return;

View File

@@ -1380,6 +1380,13 @@ extern "C" {
const llama_token * trigger_tokens,
size_t num_trigger_tokens);
/// @details Suppress or un-suppress trigger detection on a grammar sampler.
/// When suppressed, the grammar still buffers tokens but does not check for triggers.
/// Useful for suppressing grammar activation during reasoning/thinking blocks.
/// No-op if the sampler is not a grammar sampler.
LLAMA_API void llama_sampler_grammar_set_trigger_suppressed(
struct llama_sampler * smpl,
bool suppressed);
/// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
LLAMA_API struct llama_sampler * llama_sampler_init_penalties(

View File

@@ -33,10 +33,6 @@ struct llama_sampler_chain {
mutable int32_t n_sample;
};
// set trigger_suppressed on a grammar sampler (e.g. to suppress triggers during reasoning)
// the sampler must have been created by llama_sampler_init_grammar* or this is a no-op
void llama_sampler_grammar_set_trigger_suppressed(struct llama_sampler * smpl, bool suppressed);
struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size,
float dry_multiplier,

View File

@@ -1241,26 +1241,25 @@ private:
const auto & end_tag = slot.task->params.thinking_end_tag;
const auto & start_tag = slot.task->params.thinking_start_tag;
if (slot.in_reasoning) {
// check if the end tag just appeared
if (slot.generated_text.size() >= end_tag.size()) {
auto tail = std::string_view(slot.generated_text).substr(
slot.generated_text.size() - end_tag.size());
if (tail == end_tag) {
slot.in_reasoning = false;
common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), false);
SLT_DBG(slot, "reasoning ended, grammar triggers un-suppressed\n%s", "");
}
// check if the end tag just appeared at the end of generated_text
if (slot.generated_text.size() >= end_tag.size()
&& slot.generated_text.compare(
slot.generated_text.size() - end_tag.size(),
end_tag.size(), end_tag) == 0) {
slot.in_reasoning = false;
common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), false);
SLT_DBG(slot, "reasoning ended, grammar triggers un-suppressed\n%s", "");
}
} else {
// check if the start tag just appeared
if (!start_tag.empty() && slot.generated_text.size() >= start_tag.size()) {
auto tail = std::string_view(slot.generated_text).substr(
slot.generated_text.size() - start_tag.size());
if (tail == start_tag) {
slot.in_reasoning = true;
common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), true);
SLT_DBG(slot, "reasoning started, grammar triggers suppressed\n%s", "");
}
// check if the start tag just appeared at the end of generated_text
if (!start_tag.empty()
&& slot.generated_text.size() >= start_tag.size()
&& slot.generated_text.compare(
slot.generated_text.size() - start_tag.size(),
start_tag.size(), start_tag) == 0) {
slot.in_reasoning = true;
common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), true);
SLT_DBG(slot, "reasoning started, grammar triggers suppressed\n%s", "");
}
}
}