From a8dc54672caf30fae6bdbd7214d8b0bc03b9b999 Mon Sep 17 00:00:00 2001 From: samuel Date: Fri, 19 Dec 2025 21:57:15 -0300 Subject: [PATCH] common: simplify speculative sampling to greedy-only for performance Removes heavy penalty checks (repetition, frequency, presence, DRY) from `common_sampler_sample_speculative`. The specialized speculative sampler now uses a pure ArgMax (Greedy) approach. This significantly reduces CPU overhead during the drafting phase, which improves overall tokens per second. --- common/sampling.cpp | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index 3254f8d66c..c33d58ae5e 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -670,26 +670,13 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri /** * Specialized sampling for speculative drafting. * - * Prioritizes performance by using a direct ArgMax loop (Greedy) when no - * penalties (repetition, frequency, presence, DRY) are configured. - * Falls back to the full sampler chain if penalties are active to prevent - * generative loops or adhere to constraints. + * Prioritizes performance by using a direct ArgMax loop (Greedy). + * Penalties and complex sampling logic are bypassed to minimize + * drafting latency. */ llama_token common_sampler_sample_speculative(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) { const auto & params = gsmpl->params; - bool use_heavy_sampler = - (params.penalty_last_n > 0 && ( - params.penalty_repeat != 1.0f || - params.penalty_freq != 0.0f || - params.penalty_present != 0.0f - )) || - (params.dry_allowed_length > 0 && params.dry_multiplier != 0.0f); - - if (use_heavy_sampler) { - return common_sampler_sample(gsmpl, ctx, idx, false); - } - float * logits = llama_get_logits_ith(ctx, idx); const int n_vocab = llama_n_vocab(llama_model_get_vocab(llama_get_model(ctx)));