From 87b2719eca55b30afff600fc7f61c6cce9452cbf Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Thu, 4 Dec 2025 08:13:49 +0100
Subject: [PATCH] sampling : stop short if backend sampler sampled a token

This commit modifies the graph-building logic to immediately continue
when a token has already been sampled by the backend sampler.

It also updates the test for backend temperature sampling to include
top-k and distribution samplers in the chain, to verify that they do
not produce any logits (they are not run).
---
 src/llama-graph.cpp            | 1 +
 tests/test-backend-sampler.cpp | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index a621c4ebf5..c0ff7d1791 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -2100,6 +2100,7 @@ void llm_graph_context::build_sampling() const {
         if (data.sampled != nullptr) {
             res->t_sampled[seq_id] = data.sampled;
             ggml_build_forward_expand(gf, data.sampled);
+            continue;
         }
 
         if (data.probs != nullptr) {
diff --git a/tests/test-backend-sampler.cpp b/tests/test-backend-sampler.cpp
index f56cce6350..eb3a0e248d 100644
--- a/tests/test-backend-sampler.cpp
+++ b/tests/test-backend-sampler.cpp
@@ -441,6 +441,8 @@ static void test_backend_temp_sampling(const char * model_path) {
     struct llama_sampler_chain_params backend_chain_params = llama_sampler_chain_default_params();
     struct llama_sampler * backend_sampler_chain = llama_sampler_chain_init(backend_chain_params);
     llama_sampler_chain_add(backend_sampler_chain, llama_sampler_init_temp(temp));
+    llama_sampler_chain_add(backend_sampler_chain, llama_sampler_init_top_k(40));
+    llama_sampler_chain_add(backend_sampler_chain, llama_sampler_init_dist(18));
 
     std::vector backend_sampler_configs = {
         { seq_id, backend_sampler_chain },