From cbb492a837b57560b0611c529f592c6e958edbed Mon Sep 17 00:00:00 2001
From: Sascha Rogmann
Date: Wed, 31 Dec 2025 00:55:39 +0100
Subject: [PATCH] server: moved self-call into speculative.cpp

---
 common/speculative.cpp          | 12 ++++++++++++
 common/speculative.h            |  9 ++++++---
 tools/server/server-context.cpp | 25 ++++++++-----------------
 3 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index e2c2a983a3..2581fc1126 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -187,6 +187,18 @@ llama_tokens common_speculative_gen_draft(
         struct common_speculative_params params,
         const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
         llama_token id_last) {
+    if (params.self_mode == 1) {
+        // Look in the current context for an n-gram and return the following tokens as the draft.
+        llama_tokens draft_self = common_speculative_gen_self_draft(prompt_tgt_main_model, id_last,
+                params.self_ngram_size, params.n_draft);
+        if (!draft_self.empty()) {
+            return draft_self;
+        }
+    }
+    if (spec == nullptr) {
+        return {};
+    }
+
     auto & batch   = spec->batch;
     auto & ctx_tgt = spec->ctx_tgt;
     auto & ctx_dft = spec->ctx_dft;
diff --git a/common/speculative.h b/common/speculative.h
index 9b236c0806..6407563c6f 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -6,10 +6,13 @@
 struct common_speculative;
 
 struct common_speculative_params {
-    int n_draft = 16;  // max drafted tokens
-    int n_reuse = 256;
+    int n_draft = 16;         // max drafted tokens
+    int n_reuse = 256;
 
-    float p_min = 0.75f; // min probability required to accept a token in the draft
+    float p_min = 0.75f;      // min probability required to accept a token in the draft
+
+    int self_mode = 0;        // 0: off, 1: self speculative lookup
+    int self_ngram_size = 12; // length of pattern to search for in self mode
 };
 
 struct common_speculative * common_speculative_init(
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 5b97d2c6d1..caf115e2ee 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1974,23 +1974,14 @@ private:
             GGML_ABORT("not supported by multimodal");
         }
 
-        llama_tokens draft = {};
-
-        if (slot.task->params.speculative.use_self) {
-            // we search at least 5 tokens in history to try a self-speculative draft
-            const int n_draft_min = std::max(5, slot.task->params.speculative.n_min);
-            const llama_tokens & tokens = slot.prompt.tokens.get_text_tokens();
-            llama_token id = slot.sampled;
-            draft = common_speculative_gen_self_draft(tokens, id, n_draft_min, n_draft_max);
-        }
-        if (draft.empty() && slot.can_speculate()) {
-            struct common_speculative_params params_spec;
-            params_spec.n_draft = n_draft_max;
-            params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max;
-            params_spec.p_min = slot.task->params.speculative.p_min;
-            const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
-            draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
-        }
+        struct common_speculative_params params_spec;
+        params_spec.n_draft = n_draft_max;
+        params_spec.n_reuse = slot.ctx_dft ? (llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max) : 0;
+        params_spec.p_min = slot.task->params.speculative.p_min;
+        params_spec.self_mode = slot.task->params.speculative.use_self;
+        params_spec.self_ngram_size = std::max(5, slot.task->params.speculative.n_min);
+        const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
+        llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
 
         // add the sampled token to the batch
         slot.i_batch_dft.push_back(batch.n_tokens);
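
Note: the hunks above route the self-drafting path through common_speculative_gen_draft(): when params.self_mode is enabled, the existing context is searched for an n-gram ending in the last sampled token and the tokens that followed it are returned as the draft, before any draft model is consulted. The body of common_speculative_gen_self_draft() is not part of this patch, so the standalone sketch below is only an assumption of how such a lookup could work; self_draft_sketch() is a hypothetical stand-in, not the actual helper.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_token  = std::int32_t;
using llama_tokens = std::vector<llama_token>;

// Hypothetical sketch of a self-speculative lookup: find the most recent
// earlier occurrence of the last `ngram_size` tokens (including `id_last`)
// in the processed context and draft up to `n_draft` of the tokens that
// followed that occurrence. Returns an empty draft when nothing matches.
static llama_tokens self_draft_sketch(
        const llama_tokens & prompt, llama_token id_last, int ngram_size, int n_draft) {
    llama_tokens hist = prompt;
    hist.push_back(id_last); // the search pattern ends with the freshly sampled token

    const int n = (int) hist.size();
    if (n < ngram_size + 1 || n_draft <= 0) {
        return {};
    }

    // pattern = last `ngram_size` tokens of the history
    const auto pat = hist.end() - ngram_size;

    // scan backwards so the most recent earlier match wins
    for (int i = n - 1; i >= ngram_size; --i) {
        if (std::equal(hist.begin() + i - ngram_size, hist.begin() + i, pat)) {
            const int n_copy = std::min(n_draft, n - i);
            return llama_tokens(hist.begin() + i, hist.begin() + i + n_copy);
        }
    }
    return {};
}

int main() {
    // the trailing tokens ... 3, 4 plus id_last = 1 also occur earlier in the
    // context, so the sketch drafts the two tokens that followed them: 5 6
    llama_tokens ctx = {1, 2, 3, 4, 1, 5, 6, 1, 2, 3, 4};
    for (llama_token t : self_draft_sketch(ctx, /*id_last =*/ 1, /*ngram_size =*/ 3, /*n_draft =*/ 2)) {
        printf("%d ", t);
    }
    printf("\n");
    return 0;
}

In the server hunk this corresponds to setting params_spec.self_ngram_size from the slot's speculative.n_min (at least 5) and letting common_speculative_gen_draft() fall back to the draft-model path when the lookup finds nothing.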