From a8dc54672caf30fae6bdbd7214d8b0bc03b9b999 Mon Sep 17 00:00:00 2001
From: samuel <samueloliveira32df@gmail.com>
Date: Fri, 19 Dec 2025 21:57:15 -0300
Subject: [PATCH] common: simplify speculative sampling to greedy-only for
 performance

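Removes the heavy penalty checks (repetition, frequency, presence, DRY) from
`common_sampler_sample_speculative`, together with the fallback to the full
sampler chain (`common_sampler_sample`) that those checks used to trigger.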

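The specialized speculative sampler now always uses a pure ArgMax (Greedy)
approach. As a consequence, configured penalties no longer influence tokens
sampled through this function; previously, any active penalty routed the call
through the full sampler chain. This significantly reduces CPU overhead during
the drafting phase, which improves overall tokens per second.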
---
 common/sampling.cpp | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/common/sampling.cpp b/common/sampling.cpp
index 3254f8d66c..c33d58ae5e 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -670,26 +670,13 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
 /**
  * Specialized sampling for speculative drafting.
  * 
- * Prioritizes performance by using a direct ArgMax loop (Greedy) when no 
- * penalties (repetition, frequency, presence, DRY) are configured.
- * Falls back to the full sampler chain if penalties are active to prevent 
- * generative loops or adhere to constraints.
+ * Prioritizes performance by using a direct ArgMax loop (Greedy).
+ * Penalties (repetition, frequency, presence, DRY) and the full sampler
+ * chain are always bypassed to minimize drafting latency.
  */
 llama_token common_sampler_sample_speculative(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
     const auto & params = gsmpl->params;
 
-    bool use_heavy_sampler = 
-        (params.penalty_last_n > 0 && (
-            params.penalty_repeat  != 1.0f || 
-            params.penalty_freq    != 0.0f || 
-            params.penalty_present != 0.0f
-        )) ||
-        (params.dry_allowed_length > 0 && params.dry_multiplier != 0.0f);
-
-    if (use_heavy_sampler) {
-        return common_sampler_sample(gsmpl, ctx, idx, false);
-    } 
-
     float * logits = llama_get_logits_ith(ctx, idx);
     const int n_vocab = llama_n_vocab(llama_model_get_vocab(llama_get_model(ctx)));