From dabaa2e77a9d94ee1466ce55f9ff47d5c1daae43 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 30 Jan 2026 18:21:48 +0200
Subject: [PATCH] spec : add ngram-mod (#19164)

* spec : add ngram-mod

* cont : simplify + keep track of occupancy

* cont : cleanup

* cont : move initialization to common/speculative

* cont : cleanup

* cont : cleanup

* cont : fix
---
 common/CMakeLists.txt           |   2 +
 common/arg.cpp                  |   4 +-
 common/common.h                 |   5 +
 common/ngram-map.cpp            |  35 +++---
 common/ngram-map.h              |   1 +
 common/ngram-mod.cpp            |  62 ++++++++++++
 common/ngram-mod.h              |  37 +++++++
 common/speculative.cpp          | 164 +++++++++++++++++++++++++++++++-
 common/speculative.h            |   4 +-
 tools/server/server-context.cpp |   9 +-
 10 files changed, 294 insertions(+), 29 deletions(-)
 create mode 100644 common/ngram-mod.cpp
 create mode 100644 common/ngram-mod.h

diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 3bc7bc6210..295ae9ea25 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -75,6 +75,8 @@ add_library(${TARGET} STATIC
     ngram-cache.h
     ngram-map.cpp
     ngram-map.h
+    ngram-mod.cpp
+    ngram-mod.h
     peg-parser.cpp
     peg-parser.h
     preset.cpp
diff --git a/common/arg.cpp b/common/arg.cpp
index 218418f070..5fbc9022c0 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3396,7 +3396,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
-        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v]",
+        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
         string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
             common_speculative_type_to_str(params.speculative.type).c_str()),
         [](common_params & params, const std::string & value) {
@@ -3410,6 +3410,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
             } else if (value == "ngram-map-k4v") {
                 params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
+            } else if (value == "ngram-mod") {
+                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
             } else {
                 throw std::invalid_argument("unknown speculative decoding type without draft model");
             }
diff --git a/common/common.h b/common/common.h
index fd3ab8cd18..398ebb0960 100644
--- a/common/common.h
+++ b/common/common.h
@@ -171,6 +171,7 @@ enum common_speculative_type {
     COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
+    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,     // self-speculative decoding with a modulo-hashed n-gram table
     COMMON_SPECULATIVE_TYPE_NGRAM_CACHE,   // self-speculative decoding with 3-level n-gram cache
     COMMON_SPECULATIVE_TYPE_COUNT          // number of types, unknown type
 };
@@ -252,6 +253,8 @@ struct common_params_model {
     std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
 };
 
+struct common_ngram_mod;
+
 struct common_params_speculative {
     common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
 
@@ -269,6 +272,8 @@ struct common_params_speculative {
     uint16_t ngram_check_rate = 1; // check rate for ngram lookup
     uint16_t ngram_min_hits   = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
 
+    std::shared_ptr<common_ngram_mod> ngram_mod;
+
     std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding  // NOLINT
     std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
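Note: the new mode is selected at the command line with --spec-type ngram-mod (the CLI values use dashes, while the internal names registered in speculative.cpp below use underscores, e.g. "ngram_mod"). A minimal sketch of the programmatic equivalent, assuming only the common.h definitions above; the helper name is hypothetical:

    #include "common.h"

    // Same effect as passing `--spec-type ngram-mod`: select the mode before
    // the speculative system is initialized. params.speculative.ngram_mod is
    // deliberately left empty here - common_speculative_init() creates the
    // shared table on first use (see common/speculative.cpp further down).
    static void use_ngram_mod(common_params & params) {
        params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
    }
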
diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp
index 930e7a3c10..84fd761367 100644
--- a/common/ngram-map.cpp
+++ b/common/ngram-map.cpp
@@ -7,6 +7,21 @@
 #include <sstream>
 #include <string>
 
+// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
+static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
+    std::ostringstream oss;
+    oss << '[';
+    for (size_t i = 0; i < length; ++i) {
+        if (i > 0) {
+            oss << ", ";
+        }
+        oss << inp[start + i];
+    }
+    oss << ']';
+    return oss.str();
+}
+
+
 // n-gram simple
 //
 
@@ -100,8 +115,6 @@ llama_tokens common_ngram_simple_draft(
 // maximum number of counted values of a ngram map value.
 #define COMMON_NGRAM_MAX_VALUE_COUNT 16380
 
-static std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length);
-
 void common_ngram_map_draft(common_ngram_map & map,
         const llama_tokens & inp, llama_token sampled,
         llama_tokens & draft) {
@@ -347,21 +360,3 @@ void common_ngram_map_accept(common_ngram_map & map, uint16_t n_accepted) {
             n_accepted, curr_value.n_accepted);
     curr_value.n_accepted = n_accepted;
 }
-
-// Helper functions.
-//
-
-// Print the values of a sublist of `llama_tokens & inp` to a string in the form [v0, v1, v2, ...].
-std::string common_tokens_to_str(const llama_tokens & inp, size_t start, size_t length) {
-    std::ostringstream oss;
-    oss << '[';
-    for (size_t i = 0; i < length; ++i) {
-        if (i > 0) {
-            oss << ", ";
-        }
-        oss << inp[start + i];
-    }
-    oss << ']';
-    return oss.str();
-}
-
diff --git a/common/ngram-map.h b/common/ngram-map.h
index bf91883f0c..b365034ac5 100644
--- a/common/ngram-map.h
+++ b/common/ngram-map.h
@@ -11,6 +11,7 @@
 //
 
 #include "llama.h"
+#include "common.h"
 
 #include <vector>
 
diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp
new file mode 100644
index 0000000000..76f7257f61
--- /dev/null
+++ b/common/ngram-mod.cpp
@@ -0,0 +1,62 @@
+#include "ngram-mod.h"
+
+#include <algorithm>
+
+//
+// common_ngram_mod
+//
+
+common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) {
+    entries.resize(size);
+
+    reset();
+}
+
+size_t common_ngram_mod::idx(const entry_t * tokens) const {
+    size_t res = 0;
+
+    for (size_t i = 0; i < n; ++i) {
+        res = res*6364136223846793005ULL + tokens[i]; // Knuth's MMIX LCG multiplier
+    }
+
+    res = res % entries.size();
+
+    return res;
+}
+
+void common_ngram_mod::add(const entry_t * tokens) {
+    const size_t i = idx(tokens);
+
+    if (entries[i] == EMPTY) {
+        used++;
+    }
+
+    entries[i] = tokens[n]; // store the token that followed this n-gram
+}
+
+common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const {
+    const size_t i = idx(tokens);
+
+    return entries[i];
+}
+
+void common_ngram_mod::reset() {
+    std::fill(entries.begin(), entries.end(), EMPTY);
+    used = 0;
+}
+
+size_t common_ngram_mod::get_n() const {
+    return n;
+}
+
+size_t common_ngram_mod::get_used() const {
+    return used;
+}
+
+size_t common_ngram_mod::size() const {
+    return entries.size();
+}
+
+size_t common_ngram_mod::size_bytes() const {
+    return entries.size() * sizeof(entries[0]);
+}
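Note: to make the table's behavior concrete, here is a self-contained sketch (illustrative only, not part of the patch). The table is a fixed-size hash map with no probing and no stored keys: add() simply overwrites on collision, so the newest continuation for a bucket wins, and get() cannot distinguish a genuine hit from a colliding n-gram.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const size_t n = 3, size = 8;           // tiny values for illustration
        std::vector<int32_t> entries(size, -1); // -1 == EMPTY

        auto idx = [&](const int32_t * t) {
            size_t res = 0;
            for (size_t i = 0; i < n; ++i) {
                res = res*6364136223846793005ULL + t[i]; // same mixing as idx() above
            }
            return res % size;
        };

        const int32_t seq[4] = {10, 20, 30, 40}; // n-gram {10,20,30} followed by 40
        entries[idx(seq)] = seq[n];              // add(): store the continuation
        std::printf("lookup: %d\n", entries[idx(seq)]); // get(): prints 40
        return 0;
    }
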
diff --git a/common/ngram-mod.h b/common/ngram-mod.h
new file mode 100644
index 0000000000..cf3c89c915
--- /dev/null
+++ b/common/ngram-mod.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+//
+// common_ngram_mod
+// ref: https://github.com/ggml-org/llama.cpp/pull/19164
+//
+
+// basic n-gram hasher
+struct common_ngram_mod {
+    using entry_t = int32_t;
+
+    static constexpr entry_t EMPTY = -1;
+
+    common_ngram_mod(uint16_t n, size_t size);
+
+    size_t  idx(const entry_t * tokens) const;
+    void    add(const entry_t * tokens);
+    entry_t get(const entry_t * tokens) const; // return -1 (EMPTY) if not found
+
+    void reset();
+
+    size_t get_n()    const;
+    size_t get_used() const;
+
+    size_t size()       const;
+    size_t size_bytes() const;
+
+private:
+    size_t n;    // ngram size to hash
+
+    size_t used; // number of occupied entries
+
+    std::vector<entry_t> entries;
+};
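Note: a worked footprint example for the header above. Entries are 4-byte int32_t and only the hashed continuation is stored, never the key tokens, so the footprint is independent of the n-gram length n. With the 4*1024*1024-slot table used by default in common_speculative_init() below, that is 4194304 * 4 bytes = 16 MiB. A sketch (n = 16 is just an illustrative value; the initialization code below warns when n < 16):

    #include <cstdio>

    #include "ngram-mod.h"

    int main() {
        common_ngram_mod mod(/*n =*/ 16, /*size =*/ 4*1024*1024);
        std::printf("slots = %zu, bytes = %zu\n", mod.size(), mod.size_bytes());
        // prints: slots = 4194304, bytes = 16777216
        return 0;
    }
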
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 3f314b5d57..a1a3b51c13 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -6,6 +6,7 @@
 #include "log.h"
 #include "ngram-cache.h"
 #include "ngram-map.h"
+#include "ngram-mod.h"
 #include "sampling.h"
 
 #include <cstring>
@@ -23,6 +24,7 @@ const std::vector<enum common_speculative_type> common_speculative_types = {
     COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V,
+    COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
     COMMON_SPECULATIVE_TYPE_NGRAM_CACHE
 };
 
@@ -33,6 +35,7 @@ const std::map<std::string, enum common_speculative_type> common_speculative_typ
     {"ngram_simple",  COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
     {"ngram_map_k",   COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
     {"ngram_map_k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
+    {"ngram_mod",     COMMON_SPECULATIVE_TYPE_NGRAM_MOD},
     {"ngram_cache",   COMMON_SPECULATIVE_TYPE_NGRAM_CACHE}
 };
 
@@ -110,6 +113,8 @@ static bool common_speculative_are_compatible(
 struct common_speculative_state {
     const enum common_speculative_type type;
 
+    // TODO: rename to n_call_draft, n_gen_drafts, n_acc_drafts, n_gen_tokens, n_acc_tokens
+    // TODO: add n_call_begin, n_call_accept
     size_t drafts_call_count      = 0; // number of times this implementation was called.
     size_t drafts_generated_count = 0; // number of times a draft or part was generated by this implementation.
     size_t drafts_accepted_count  = 0; // number of times a draft or part was accepted by the target model.
@@ -119,6 +124,8 @@ struct common_speculative_state {
     // TODO: track performance of most recent calls
     const bool gen_perf = true; // whether to generate performance stats.
 
+    // TODO: rename to t_draft_us
+    // TODO: add t_begin_us, t_accept_us
     int64_t gen_duration_us = 0; // total time spent in this implementation in microseconds.
 
     common_speculative_state(enum common_speculative_type type) : type(type) {}
@@ -509,6 +516,132 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
     }
 };
 
+struct common_speculative_state_ngram_mod : public common_speculative_state {
+    common_ngram_mod & mod;
+
+    // the last position in the prompt that was added to the ngram container
+    size_t i_last = 0;
+
+    // number of tokens returned by the last call to draft()
+    size_t n_draft_last = 0;
+
+    // consecutive accept() rounds with a low acceptance fraction (< 0.5)
+    int n_low = 0;
+
+    // enable trace logging if the LLAMA_TRACE environment variable is set
+    const bool verbose;
+
+    common_speculative_state_ngram_mod(enum common_speculative_type type, common_ngram_mod & mod)
+        : common_speculative_state(type), mod(mod), verbose(std::getenv("LLAMA_TRACE") != nullptr) {
+        static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t));
+    }
+
+    void begin(const llama_tokens & prompt) override {
+        i_last = 0;
+
+        n_draft_last = 0;
+
+        const size_t n = mod.get_n();
+
+        if (prompt.size() < n) {
+            return;
+        }
+
+        for (size_t i = 0; i < prompt.size() - n; ++i) {
+            mod.add(prompt.data() + i);
+        }
+
+        i_last = prompt.size() - n;
+
+        const double f = (double)mod.get_used() / (double)mod.size();
+        LOG_INF("%s: ngram_mod occupancy = %zu/%zu (%.2f)\n", __func__, mod.get_used(), mod.size(), f);
+
+        constexpr double f_thold = 0.25;
+        if (f > f_thold) {
+            LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold);
+
+            mod.reset();
+        }
+    }
+
+    void draft(
+            const common_params_speculative & params,
+            const llama_tokens & prompt_tgt,
+            llama_token id_last,
+            llama_tokens & result) override {
+        GGML_UNUSED(params);
+
+        n_draft_last = 0;
+
+        const size_t cur_len = prompt_tgt.size();
+        if (cur_len < mod.get_n()) {
+            return;
+        }
+
+        const size_t n = mod.get_n();
+
+        // add new ngrams in chunks of at least 32 new tokens
+        if (i_last + 32 < cur_len) {
+            for (size_t i = i_last; i < cur_len - n; ++i) {
+                mod.add(prompt_tgt.data() + i);
+            }
+
+            i_last = cur_len - n;
+        }
+
+        result.resize(n + params.n_max); // n-token window followed by up to n_max drafted tokens
+        for (size_t i = 0; i < n - 1; ++i) {
+            result[i] = prompt_tgt[cur_len - n + 1 + i];
+        }
+        result[n - 1] = id_last;
+
+        for (int i = 0; i < params.n_max; ++i) {
+            const llama_token token = mod.get(result.data() + i);
+            if (token == common_ngram_mod::EMPTY) {
+                if (i < params.n_min) {
+                    result.clear();
+                    return;
+                }
+
+                result.resize(n + i);
+                break;
+            }
+            result[n + i] = token;
+        }
+
+        // only return the tokens that were drafted (drop the n-token window)
+        for (size_t i = 0; n + i < result.size(); ++i) {
+            result[i] = result[n + i];
+        }
+        result.resize(result.size() - n);
+
+        // store the drafted length for later acceptance analysis
+        n_draft_last = result.size();
+    }
+
+    void accept(uint16_t n_accepted) override {
+        if (verbose) {
+            LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
+        }
+
+        // compute the acceptance fraction if we have a recorded draft length
+        if (n_draft_last > 0) {
+            const double f_acc = (double)n_accepted / (double)n_draft_last;
+            if (f_acc < 0.5) {
+                n_low++;
+                if (n_low >= 3) {
+                    LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, n_low);
+
+                    mod.reset();
+                    n_low = 0;
+                }
+            } else {
+                n_low = 0;
+            }
+        }
+    }
+};
+
 struct common_speculative_state_ngram_cache : public common_speculative_state {
     uint16_t n_draft;
     bool save_dynamic;
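Note: the buffer reuse in draft() above is compact but easy to misread: result first holds the n-token probe window, is extended in place by repeated lookups, and is then shifted so only the drafted tokens remain. An illustrative trace, assuming n = 4, params.n_min = 1, params.n_max = 3:

    // prompt_tgt tail = [a, b, c], id_last = d (just sampled by the target model)
    //
    // result is sized to n + n_max = 7 and seeded with the n-token window:
    //     [a, b, c, d, _, _, _]
    // each step hashes the n most recent tokens and appends the stored token:
    //     get([a, b, c, d]) -> e  =>  [a, b, c, d, e, _, _]
    //     get([b, c, d, e]) -> f  =>  [a, b, c, d, e, f, _]
    //     get([c, d, e, f]) -> g  =>  [a, b, c, d, e, f, g]
    // finally the window is dropped and [e, f, g] is returned as the draft;
    // an EMPTY lookup before n_min tokens clears result and drafts nothing.
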
@@ -650,6 +783,7 @@ std::string common_speculative_type_to_str(enum common_speculative_type type) {
         case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:  return "ngram_simple";
         case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:   return "ngram_map_k";
         case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram_map_k4v";
+        case COMMON_SPECULATIVE_TYPE_NGRAM_MOD:     return "ngram_mod";
         case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE:   return "ngram_cache";
         default:                                    return "unknown";
     }
@@ -666,8 +800,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
 // initialization of the speculative decoding system
 //
 common_speculative * common_speculative_init(
-        const common_params_speculative & params,
-        llama_context * ctx_tgt) {
+        common_params_speculative & params,
+        llama_context * ctx_tgt) {
     llama_context * ctx_dft = nullptr;
     if (params.model_dft) {
         ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
@@ -687,6 +821,7 @@ common_speculative * common_speculative_init(
     bool has_ngram_simple  = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE);
     bool has_ngram_map_k   = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
     bool has_ngram_map_k4v = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V);
+    bool has_ngram_mod     = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_MOD);
 
     // In a more complex implementation we could use the same implementation but with different parameters.
     // This was initially used in PR-18471 but removed to simplify the code.
@@ -701,6 +836,22 @@ common_speculative * common_speculative_init(
         // This implementation can guess tokens with high acceptance rate but is more expensive.
         configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
     }
+    if (has_ngram_mod) {
+        // shared instance for all speculative decoding contexts
+        if (!params.ngram_mod) {
+            params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
+
+            LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
+                    params.ngram_size_n, params.ngram_mod->size(),
+                    (float)(params.ngram_mod->size_bytes())/1024/1024);
+
+            if (params.ngram_size_n < 16) {
+                LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
+            }
+        }
+
+        configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MOD, params));
+    }
     if (has_ngram_cache) {
         configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params));
     }
@@ -758,6 +909,11 @@ common_speculative * common_speculative_init(
                 ));
                 break;
             }
+            case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
+                GGML_ASSERT(config.params.ngram_mod);
+                impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
+                break;
+            }
             case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
                 auto state = create_state_ngram_cache(
                         params.lookup_cache_static, params.lookup_cache_dynamic, config);
@@ -822,8 +978,7 @@ llama_tokens common_speculative_draft(
 
         if (!result.empty()) {
             LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
-                    common_speculative_type_to_str(impl.get()->type).c_str(),
-                    prompt_tgt.size(),
+                    common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
                     impl.get()->drafts_call_count, result.size());
 
             spec->curr_impl = impl.get(); // set current implementation for stats
@@ -869,6 +1024,7 @@ void common_speculative_print_stats(const common_speculative * spec) {
             str_perf = "";
         }
 
+        // TODO: report time for begin() and accept()
        LOG_INF("statistics %s: #calls = %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n",
                common_speculative_type_to_str(impl->type).c_str(),
                impl->drafts_call_count,
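Note: the two reset heuristics are spread across begin(), accept() and the initialization code above; here they are condensed into one place for clarity (illustrative sketch with the same constants as the patch; the struct and method names are hypothetical):

    #include <cstddef>

    struct ngram_mod_reset_policy {
        int n_low = 0; // consecutive low-acceptance accept() rounds

        // begin(): occupancy above 0.25 triggers a reset - with the default
        // 4*1024*1024-slot table that is ~1M occupied slots, keeping the rate
        // of silent hash collisions (and thus bad drafts) down.
        bool reset_on_occupancy(size_t used, size_t size) const {
            return (double)used/(double)size > 0.25;
        }

        // accept(): three consecutive rounds in which fewer than half of the
        // drafted tokens are accepted also reset the table, on the assumption
        // that its contents have become stale or misleading for this context.
        bool reset_on_acceptance(int n_accepted, size_t n_drafted) {
            if (n_drafted == 0) {
                return false;
            }
            n_low = ((double)n_accepted/(double)n_drafted < 0.5) ? n_low + 1 : 0;
            if (n_low >= 3) {
                n_low = 0;
                return true;
            }
            return false;
        }
    };
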
diff --git a/common/speculative.h b/common/speculative.h
index 9e1888e4be..76fe6bb7bc 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -15,8 +15,8 @@ enum common_speculative_type common_speculative_type_from_name(const std::string
 std::string common_speculative_type_to_str(enum common_speculative_type type);
 
 common_speculative * common_speculative_init(
-        const common_params_speculative & params,
-        llama_context * ctx_tgt);
+        common_params_speculative & params,
+        llama_context * ctx_tgt);
 
 void common_speculative_free(common_speculative * spec);
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 1ca4e3cc0e..6f26fc9a9b 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -705,6 +705,11 @@ private:
             params_base.n_cache_reuse = 0;
             SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
         }
+
+        if (params_base.speculative.type != COMMON_SPECULATIVE_TYPE_NONE) {
+            params_base.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
+            SRV_WRN("%s\n", "speculative decoding is not supported by multimodal, it will be disabled");
+        }
     }
 
     if (!llama_memory_can_shift(llama_get_memory(ctx))) {
@@ -754,9 +759,9 @@ private:
                 SRV_ERR("%s\n", "speculative decoding is not supported with multimodal");
                 return false;
             }
-            SRV_WRN("%s", "speculative decoding context initialized\n");
+            SLT_INF(slot, "%s", "speculative decoding context initialized\n");
         } else {
-            SRV_WRN("%s", "speculative decoding context not initialized\n");
+            SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
         }
     }
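Note: for orientation, a minimal caller-side sketch using only what this patch shows of the public API. The signature change above (dropping const from common_params_speculative &) exists precisely so that common_speculative_init() can create and stash the shared n-gram table in the params on first use; the exact draft/accept call signatures are not part of this diff and are only referenced in comments:

    #include "common.h"
    #include "speculative.h"

    void example(common_params_speculative & sparams, llama_context * ctx_tgt) {
        sparams.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;

        // may allocate and store sparams.ngram_mod (shared across contexts)
        common_speculative * spec = common_speculative_init(sparams, ctx_tgt);

        // per decoding step: common_speculative_draft() proposes tokens, the
        // target model verifies them, and the accept() path feeds the number
        // of accepted tokens back into the reset heuristics shown earlier

        common_speculative_free(spec);
    }
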