From c25aed1f5c6ccb030cf761634847e612f5a7b6a2 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin
Date: Sat, 21 Mar 2026 22:00:33 +0100
Subject: [PATCH] Compilation fixes

---
 common/sampling.cpp             |  3 ---
 include/llama.h                 |  7 +++++++
 src/llama-sampler.h             |  4 ----
 tools/server/server-context.cpp | 35 +++++++++++++++++------------------
 4 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index c3194ba3f0..db7d2912be 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -483,9 +483,6 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
     }
 }
 
-// forward declaration of internal function (defined in llama-sampler.cpp)
-void llama_sampler_grammar_set_trigger_suppressed(struct llama_sampler * smpl, bool suppressed);
-
 void common_sampler_set_grammar_trigger_suppressed(struct common_sampler * gsmpl, bool suppressed) {
     if (!gsmpl || !gsmpl->grmr) {
         return;
diff --git a/include/llama.h b/include/llama.h
index 6e72db7e3c..34ceeefdf1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1380,6 +1380,13 @@ extern "C" {
               const llama_token * trigger_tokens,
                           size_t num_trigger_tokens);
 
+    /// @details Suppress or un-suppress trigger detection on a grammar sampler.
+    /// When suppressed, the grammar still buffers tokens but does not check for triggers.
+    /// Useful for suppressing grammar activation during reasoning/thinking blocks.
+    /// No-op if the sampler is not a grammar sampler.
+    LLAMA_API void llama_sampler_grammar_set_trigger_suppressed(
+            struct llama_sampler * smpl,
+            bool suppressed);
     /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
                          int32_t   penalty_last_n,   // last n tokens to penalize (0 = disable penalty, -1 = context size)
diff --git a/src/llama-sampler.h b/src/llama-sampler.h
index 11ad399fb8..b9bfc20d25 100644
--- a/src/llama-sampler.h
+++ b/src/llama-sampler.h
@@ -33,10 +33,6 @@ struct llama_sampler_chain {
     mutable int32_t n_sample;
 };
 
-// set trigger_suppressed on a grammar sampler (e.g. to suppress triggers during reasoning)
-// the sampler must have been created by llama_sampler_init_grammar* or this is a no-op
-void llama_sampler_grammar_set_trigger_suppressed(struct llama_sampler * smpl, bool suppressed);
-
 struct llama_sampler * llama_sampler_init_dry_testing(
         int32_t context_size,
         float dry_multiplier,
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index ca4d15ec88..0e137447b7 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1241,26 +1241,25 @@ private:
         const auto & end_tag   = slot.task->params.thinking_end_tag;
         const auto & start_tag = slot.task->params.thinking_start_tag;
         if (slot.in_reasoning) {
-            // check if the end tag just appeared
-            if (slot.generated_text.size() >= end_tag.size()) {
-                auto tail = std::string_view(slot.generated_text).substr(
-                    slot.generated_text.size() - end_tag.size());
-                if (tail == end_tag) {
-                    slot.in_reasoning = false;
-                    common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), false);
-                    SLT_DBG(slot, "reasoning ended, grammar triggers un-suppressed\n%s", "");
-                }
+            // check if the end tag just appeared at the end of generated_text
+            if (slot.generated_text.size() >= end_tag.size()
+                    && slot.generated_text.compare(
+                        slot.generated_text.size() - end_tag.size(),
+                        end_tag.size(), end_tag) == 0) {
+                slot.in_reasoning = false;
+                common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), false);
+                SLT_DBG(slot, "reasoning ended, grammar triggers un-suppressed\n%s", "");
             }
         } else {
-            // check if the start tag just appeared
-            if (!start_tag.empty() && slot.generated_text.size() >= start_tag.size()) {
-                auto tail = std::string_view(slot.generated_text).substr(
-                    slot.generated_text.size() - start_tag.size());
-                if (tail == start_tag) {
-                    slot.in_reasoning = true;
-                    common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), true);
-                    SLT_DBG(slot, "reasoning started, grammar triggers suppressed\n%s", "");
-                }
+            // check if the start tag just appeared at the end of generated_text
+            if (!start_tag.empty()
+                    && slot.generated_text.size() >= start_tag.size()
+                    && slot.generated_text.compare(
+                        slot.generated_text.size() - start_tag.size(),
+                        start_tag.size(), start_tag) == 0) {
+                slot.in_reasoning = true;
+                common_sampler_set_grammar_trigger_suppressed(slot.smpl.get(), true);
+                SLT_DBG(slot, "reasoning started, grammar triggers suppressed\n%s", "");
             }
         }
     }