From cbb492a837b57560b0611c529f592c6e958edbed Mon Sep 17 00:00:00 2001
From: Sascha Rogmann
Date: Wed, 31 Dec 2025 00:55:39 +0100
Subject: [PATCH] server: moved self-call into speculative.cpp

---
 common/speculative.cpp          | 12 ++++++++++++
 common/speculative.h            |  9 ++++++---
 tools/server/server-context.cpp | 25 ++++++++-----------------
 3 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index e2c2a983a3..2581fc1126 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -187,6 +187,18 @@ llama_tokens common_speculative_gen_draft(
         struct common_speculative_params params,
         const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
         llama_token id_last) {
+    if (params.self_mode == 1) {
+        // Look in the current context for an n-gram and return the following tokens as the draft.
+        llama_tokens draft_self = common_speculative_gen_self_draft(prompt_tgt_main_model, id_last,
+                params.self_ngram_size, params.n_draft);
+        if (!draft_self.empty()) {
+            return draft_self;
+        }
+    }
+    if (spec == nullptr) {
+        return {};
+    }
+
     auto & batch   = spec->batch;
     auto & ctx_tgt = spec->ctx_tgt;
     auto & ctx_dft = spec->ctx_dft;
diff --git a/common/speculative.h b/common/speculative.h
index 9b236c0806..6407563c6f 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -6,10 +6,13 @@
 struct common_speculative;
 
 struct common_speculative_params {
-    int n_draft = 16;  // max drafted tokens
-    int n_reuse = 256;
+    int n_draft = 16;         // max drafted tokens
+    int n_reuse = 256;
 
-    float p_min = 0.75f; // min probability required to accept a token in the draft
+    float p_min = 0.75f;      // min probability required to accept a token in the draft
+
+    int self_mode = 0;        // 0: off, 1: self speculative lookup
+    int self_ngram_size = 12; // length of pattern to search for in self mode
 };
 
 struct common_speculative * common_speculative_init(
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 5b97d2c6d1..caf115e2ee 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1974,23 +1974,14 @@ private:
             GGML_ABORT("not supported by multimodal");
         }
 
-        llama_tokens draft = {};
-
-        if (slot.task->params.speculative.use_self) {
-            // we search at least 5 tokens in history to try a self-speculative draft
-            const int n_draft_min = std::max(5, slot.task->params.speculative.n_min);
-            const llama_tokens & tokens = slot.prompt.tokens.get_text_tokens();
-            llama_token id = slot.sampled;
-            draft = common_speculative_gen_self_draft(tokens, id, n_draft_min, n_draft_max);
-        }
-        if (draft.empty() && slot.can_speculate()) {
-            struct common_speculative_params params_spec;
-            params_spec.n_draft = n_draft_max;
-            params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max;
-            params_spec.p_min = slot.task->params.speculative.p_min;
-            const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
-            draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
-        }
+        struct common_speculative_params params_spec;
+        params_spec.n_draft = n_draft_max;
+        params_spec.n_reuse = slot.ctx_dft ? (llama_n_ctx(slot.ctx_dft) - slot.task->params.speculative.n_max) : 0;
+        params_spec.p_min = slot.task->params.speculative.p_min;
+        params_spec.self_mode = slot.task->params.speculative.use_self;
+        params_spec.self_ngram_size = std::max(5, slot.task->params.speculative.n_min);
+        const llama_tokens & cached_text_tokens = slot.prompt.tokens.get_text_tokens();
+        llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, cached_text_tokens, slot.sampled);
 
         // add the sampled token to the batch
         slot.i_batch_dft.push_back(batch.n_tokens);
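
Note: the hunks above route the self-drafting path through common_speculative_gen_draft(): when params.self_mode is enabled, the existing context is searched for an n-gram ending in the last sampled token and the tokens that followed it are returned as the draft, before any draft model is consulted. The body of common_speculative_gen_self_draft() is not part of this patch, so the standalone sketch below is only an assumption of how such a lookup could work; self_draft_sketch() is a hypothetical stand-in, not the actual helper.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

using llama_token  = std::int32_t;
using llama_tokens = std::vector<llama_token>;

// Hypothetical sketch of a self-speculative lookup: find the most recent
// earlier occurrence of the last `ngram_size` tokens (including `id_last`)
// in the processed context and draft up to `n_draft` of the tokens that
// followed that occurrence. Returns an empty draft when nothing matches.
static llama_tokens self_draft_sketch(
        const llama_tokens & prompt, llama_token id_last, int ngram_size, int n_draft) {
    llama_tokens hist = prompt;
    hist.push_back(id_last); // the search pattern ends with the freshly sampled token

    const int n = (int) hist.size();
    if (n < ngram_size + 1 || n_draft <= 0) {
        return {};
    }

    // pattern = last `ngram_size` tokens of the history
    const auto pat = hist.end() - ngram_size;

    // scan backwards so the most recent earlier match wins
    for (int i = n - 1; i >= ngram_size; --i) {
        if (std::equal(hist.begin() + i - ngram_size, hist.begin() + i, pat)) {
            const int n_copy = std::min(n_draft, n - i);
            return llama_tokens(hist.begin() + i, hist.begin() + i + n_copy);
        }
    }
    return {};
}

int main() {
    // the trailing tokens ... 3, 4 plus id_last = 1 also occur earlier in the
    // context, so the sketch drafts the two tokens that followed them: 5 6
    llama_tokens ctx = {1, 2, 3, 4, 1, 5, 6, 1, 2, 3, 4};
    for (llama_token t : self_draft_sketch(ctx, /*id_last =*/ 1, /*ngram_size =*/ 3, /*n_draft =*/ 2)) {
        printf("%d ", t);
    }
    printf("\n");
    return 0;
}

In the server hunk this corresponds to setting params_spec.self_ngram_size from the slot's speculative.n_min (at least 5) and letting common_speculative_gen_draft() fall back to the draft-model path when the lookup finds nothing.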