From e543f8895245e0b23fcd618b641f56eb89648610 Mon Sep 17 00:00:00 2001 From: Bernhard Froemel Date: Tue, 3 Feb 2026 11:31:26 +0000 Subject: [PATCH 1/4] initial ngram-mod proof of concept, score-based pruning --- common/ngram-mod.cpp | 89 +++++++++++++++++++++++++++++++++++++++++- common/ngram-mod.h | 35 +++++++++++++++++ common/speculative.cpp | 30 ++++++++++++-- 3 files changed, 149 insertions(+), 5 deletions(-) diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp index 76f7257f61..4dce2e844b 100644 --- a/common/ngram-mod.cpp +++ b/common/ngram-mod.cpp @@ -6,6 +6,7 @@ common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) { entries.resize(size); + scores.resize(size, SCORE_INIT); reset(); } @@ -27,8 +28,12 @@ void common_ngram_mod::add(const entry_t * tokens) { if (entries[i] == EMPTY) { used++; + scores[i] = SCORE_INS; + } else if (entries[i] != tokens[n]) { + // a different token hashes to the same bucket + ++collisions; } - + // keep existing score if entry already occupied entries[i] = tokens[n]; } @@ -40,7 +45,9 @@ common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const { void common_ngram_mod::reset() { std::fill(entries.begin(), entries.end(), EMPTY); + std::fill(scores.begin(), scores.end(), 0); used = 0; + collisions = 0; } size_t common_ngram_mod::get_n() const { @@ -56,5 +63,83 @@ size_t common_ngram_mod::size() const { } size_t common_ngram_mod::size_bytes() const { - return entries.size() * sizeof(entries[0]); + return entries.size() * sizeof(entries[0]) + scores.size() * sizeof(scores[0]); +} + +size_t common_ngram_mod::index(const entry_t * tokens) const { + return idx(tokens); +} + +void common_ngram_mod::inc_score(const entry_t * tokens) { + const size_t i = idx(tokens); + if (scores[i] < common_ngram_mod::SCORE_MAX) { + ++scores[i]; + } +} + +void common_ngram_mod::dec_score(const entry_t * tokens) { + const size_t i = idx(tokens); + if (scores[i] > common_ngram_mod::SCORE_MIN) { + --scores[i]; + } +} + +void common_ngram_mod::inc_score_by_index(size_t i) { + if (i < scores.size() && scores[i] < common_ngram_mod::SCORE_MAX) { + ++scores[i]; + } +} + +void common_ngram_mod::dec_score_by_index(size_t i) { + if (i < scores.size() && scores[i] > common_ngram_mod::SCORE_MIN) { + --scores[i]; + } +} + +void common_ngram_mod::prune_low_score() { + used = 0; + for (size_t i = 0; i < entries.size(); ++i) { + if (scores[i] < common_ngram_mod::SCORE_THR) { + entries[i] = EMPTY; + scores[i] = 0; + } else { + ++used; + } + } +} + +size_t common_ngram_mod::get_collisions() const { + return collisions; +} + +size_t common_ngram_mod::get_below_thr() const { + return count_below_thr; +} + +size_t common_ngram_mod::get_at_min() const { + return count_at_min; +} + +size_t common_ngram_mod::get_at_max() const { + return count_at_max; +} + +size_t common_ngram_mod::get_at_ins() const { + return count_at_ins; +} + +void common_ngram_mod::update_score_stats() { + // reset counters + count_below_thr = 0; + count_at_min = 0; + count_at_max = 0; + count_at_ins = 0; + + for (size_t i = 0; i < scores.size(); ++i) { + const int8_t s = scores[i]; + if (s < SCORE_THR) ++count_below_thr; + if (s == SCORE_MIN) ++count_at_min; + if (s == SCORE_MAX) ++count_at_max; + if (s == SCORE_INS) ++count_at_ins; + } } diff --git a/common/ngram-mod.h b/common/ngram-mod.h index 7af92e9dde..29227f8467 100644 --- a/common/ngram-mod.h +++ b/common/ngram-mod.h @@ -15,6 +15,12 @@ struct common_ngram_mod { static constexpr entry_t EMPTY = -1; + static constexpr int8_t SCORE_INIT = 0; + static constexpr int8_t SCORE_MIN = -5; + static constexpr int8_t SCORE_MAX = 20; + static constexpr int8_t SCORE_THR = 0; // keep equal or lower than SCORE_INIT + static constexpr int8_t SCORE_INS = 3; + common_ngram_mod(uint16_t n, size_t size); size_t idx(const entry_t * tokens) const; @@ -23,9 +29,27 @@ struct common_ngram_mod { void reset(); + // expose the hash index for external bookkeeping + size_t index(const entry_t * tokens) const; + + // score handling + void inc_score(const entry_t * tokens); + void dec_score(const entry_t * tokens); + void inc_score_by_index(size_t i); + void dec_score_by_index(size_t i); + void prune_low_score(); // remove entries below SCORE_THR + size_t get_n() const; size_t get_used() const; + void update_score_stats(); + + size_t get_collisions() const; + size_t get_below_thr() const; + size_t get_at_min() const; + size_t get_at_max() const; + size_t get_at_ins() const; + size_t size() const; size_t size_bytes() const; @@ -35,4 +59,15 @@ private: size_t used; std::vector entries; + // per-entry score, range SCORE_MIN .. SCORE_MAX + std::vector scores; + + // stats + // count of hash collisions + size_t collisions = 0; + // counts for score + size_t count_below_thr = 0; + size_t count_at_min = 0; + size_t count_at_max = 0; + size_t count_at_ins = 0; }; diff --git a/common/speculative.cpp b/common/speculative.cpp index 80cd31e35f..a14bef5341 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -527,6 +527,8 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { // consecutive accept rounds with low acceptance fraction (< 0.5) int n_low = 0; + // hash indices of ngrams consulted during the most recent draft + std::vector used_hashes; // enable trace logging if LLAMA_TRACE is set const bool verbose; @@ -558,7 +560,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { constexpr double f_thold = 0.25; if (f > f_thold) { - LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold); + LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting (collisions=%zu)\n", __func__, f, f_thold, mod.get_collisions()); mod.reset(); } @@ -572,6 +574,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { GGML_UNUSED(params); n_draft_last = 0; + used_hashes.clear(); const size_t cur_len = prompt_tgt.size(); if (cur_len < mod.get_n()) { @@ -607,6 +610,8 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { break; } result[n + i] = token; + // remember which hash entry produced this token + used_hashes.push_back(mod.index(result.data() + i)); } // only return the m tokens that were drafted @@ -627,18 +632,37 @@ struct common_speculative_state_ngram_mod : public common_speculative_state { // compute acceptance fraction if we have a recorded draft length if (n_draft_last > 0) { const double f_acc = (double)n_accepted / (double)n_draft_last; + + // update per-ngram scores based on acceptance outcome + for (size_t i = 0; i < n_draft_last; ++i) { + if (i < static_cast(n_accepted)) { + mod.inc_score_by_index(used_hashes[i]); + } else { + mod.dec_score_by_index(used_hashes[i]); + } + } + if (f_acc < 0.5) { n_low++; if (n_low >= 3) { - LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low); + LOG_WRN("%s: low acceptance streak (%d) - pruning ngram_mod (collisions=%zu)\n", __func__, n_low, mod.get_collisions()); + // Log detailed score metrics before pruning + mod.update_score_stats(); + LOG_WRN("%s: before prune scores - below_thr=%zu, at_min=%zu, at_max=%zu, at_ins=%zu\n", + __func__, + mod.get_below_thr(), + mod.get_at_min(), + mod.get_at_max(), + mod.get_at_ins()); - mod.reset(); + mod.prune_low_score(); n_low = 0; } } else { n_low = 0; } } + used_hashes.clear(); } }; From b6cb26dc4556c6bf4ef57643a1cbc15a05d4eedf Mon Sep 17 00:00:00 2001 From: Bernhard Froemel Date: Tue, 3 Feb 2026 11:54:11 +0000 Subject: [PATCH 2/4] cleanup --- common/ngram-mod.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp index 4dce2e844b..e1e020ba3c 100644 --- a/common/ngram-mod.cpp +++ b/common/ngram-mod.cpp @@ -45,7 +45,7 @@ common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const { void common_ngram_mod::reset() { std::fill(entries.begin(), entries.end(), EMPTY); - std::fill(scores.begin(), scores.end(), 0); + std::fill(scores.begin(), scores.end(), SCORE_INIT); used = 0; collisions = 0; } From dff25819209730d5be153087d3efd7f9370062a9 Mon Sep 17 00:00:00 2001 From: Bernhard Froemel Date: Tue, 3 Feb 2026 12:27:25 +0000 Subject: [PATCH 3/4] don't prune EMPTY entries --- common/ngram-mod.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/common/ngram-mod.cpp b/common/ngram-mod.cpp index e1e020ba3c..484727ae34 100644 --- a/common/ngram-mod.cpp +++ b/common/ngram-mod.cpp @@ -99,11 +99,13 @@ void common_ngram_mod::dec_score_by_index(size_t i) { void common_ngram_mod::prune_low_score() { used = 0; for (size_t i = 0; i < entries.size(); ++i) { - if (scores[i] < common_ngram_mod::SCORE_THR) { - entries[i] = EMPTY; - scores[i] = 0; - } else { - ++used; + if (entries[i] != EMPTY) { + if (scores[i] < common_ngram_mod::SCORE_THR) { + entries[i] = EMPTY; + scores[i] = 0; + } else { + ++used; + } } } } From 820ad0d191c39cc48cc165d6568082ed8d974788 Mon Sep 17 00:00:00 2001 From: Bernhard Froemel Date: Tue, 3 Feb 2026 14:57:47 +0100 Subject: [PATCH 4/4] cleanup --- common/ngram-mod.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/ngram-mod.h b/common/ngram-mod.h index 29227f8467..f18f005d22 100644 --- a/common/ngram-mod.h +++ b/common/ngram-mod.h @@ -18,7 +18,7 @@ struct common_ngram_mod { static constexpr int8_t SCORE_INIT = 0; static constexpr int8_t SCORE_MIN = -5; static constexpr int8_t SCORE_MAX = 20; - static constexpr int8_t SCORE_THR = 0; // keep equal or lower than SCORE_INIT + static constexpr int8_t SCORE_THR = 0; static constexpr int8_t SCORE_INS = 3; common_ngram_mod(uint16_t n, size_t size);