initial ngram-mod proof of concept, score-based pruning

This commit is contained in:
Bernhard Froemel 2026-02-03 11:31:26 +00:00
parent 1f1e57f2bf
commit e543f88952
3 changed files with 149 additions and 5 deletions
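
For orientation, a minimal caller-side sketch of the score lifecycle this commit introduces (token IDs, the table size, and the variable names win and h are placeholders assumed for illustration, not part of the diff):

common_ngram_mod::entry_t win[4] = {1, 2, 3, 4}; // a 3-gram plus the token that follows it
common_ngram_mod mod(/*n=*/3, /*size=*/1 << 16);
mod.add(win);                    // new entry starts at SCORE_INS
const size_t h = mod.index(win); // remember the bucket for later bookkeeping
mod.inc_score_by_index(h);       // drafted token accepted: reward (caps at SCORE_MAX)
mod.dec_score_by_index(h);       // drafted token rejected: penalize (floors at SCORE_MIN)
mod.prune_low_score();           // drop every entry scoring below SCORE_THR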

View File

@@ -6,6 +6,7 @@
common_ngram_mod::common_ngram_mod(uint16_t n, size_t size) : n(n), used(0) {
entries.resize(size);
scores.resize(size, SCORE_INIT);
reset();
}
@@ -27,8 +28,12 @@ void common_ngram_mod::add(const entry_t * tokens) {
if (entries[i] == EMPTY) {
used++;
scores[i] = SCORE_INS;
} else if (entries[i] != tokens[n]) {
// a different token hashes to the same bucket
++collisions;
}
// keep existing score if entry already occupied
entries[i] = tokens[n];
}
@@ -40,7 +45,9 @@ common_ngram_mod::entry_t common_ngram_mod::get(const entry_t * tokens) const {
void common_ngram_mod::reset() {
std::fill(entries.begin(), entries.end(), EMPTY);
std::fill(scores.begin(), scores.end(), 0);
used = 0;
collisions = 0;
}
size_t common_ngram_mod::get_n() const {
@@ -56,5 +63,83 @@ size_t common_ngram_mod::size() const {
}
size_t common_ngram_mod::size_bytes() const {
return entries.size() * sizeof(entries[0]);
return entries.size() * sizeof(entries[0]) + scores.size() * sizeof(scores[0]);
}
size_t common_ngram_mod::index(const entry_t * tokens) const {
return idx(tokens);
}
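// saturating score updates; the *_by_index variants reuse a hash index previously obtained from index()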
void common_ngram_mod::inc_score(const entry_t * tokens) {
const size_t i = idx(tokens);
if (scores[i] < common_ngram_mod::SCORE_MAX) {
++scores[i];
}
}
void common_ngram_mod::dec_score(const entry_t * tokens) {
const size_t i = idx(tokens);
if (scores[i] > common_ngram_mod::SCORE_MIN) {
--scores[i];
}
}
void common_ngram_mod::inc_score_by_index(size_t i) {
if (i < scores.size() && scores[i] < common_ngram_mod::SCORE_MAX) {
++scores[i];
}
}
void common_ngram_mod::dec_score_by_index(size_t i) {
if (i < scores.size() && scores[i] > common_ngram_mod::SCORE_MIN) {
--scores[i];
}
}
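// prune pass: clear every entry whose score fell below SCORE_THR and recompute the fill level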
void common_ngram_mod::prune_low_score() {
used = 0;
for (size_t i = 0; i < entries.size(); ++i) {
if (scores[i] < common_ngram_mod::SCORE_THR) {
entries[i] = EMPTY;
scores[i] = 0;
} else if (entries[i] != EMPTY) {
// empty slots also sit at score 0 and must not count towards occupancy
++used;
}
}
}
size_t common_ngram_mod::get_collisions() const {
return collisions;
}
size_t common_ngram_mod::get_below_thr() const {
return count_below_thr;
}
size_t common_ngram_mod::get_at_min() const {
return count_at_min;
}
size_t common_ngram_mod::get_at_max() const {
return count_at_max;
}
size_t common_ngram_mod::get_at_ins() const {
return count_at_ins;
}
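// recompute the score distribution counters in one pass; a score can fall into more than one bucket (e.g. SCORE_MIN is also below SCORE_THR)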
void common_ngram_mod::update_score_stats() {
// reset counters
count_below_thr = 0;
count_at_min = 0;
count_at_max = 0;
count_at_ins = 0;
for (size_t i = 0; i < scores.size(); ++i) {
const int8_t s = scores[i];
if (s < SCORE_THR) ++count_below_thr;
if (s == SCORE_MIN) ++count_at_min;
if (s == SCORE_MAX) ++count_at_max;
if (s == SCORE_INS) ++count_at_ins;
}
}

View File

@@ -15,6 +15,12 @@ struct common_ngram_mod {
static constexpr entry_t EMPTY = -1;
static constexpr int8_t SCORE_INIT = 0; // initial score of every slot
static constexpr int8_t SCORE_MIN = -5; // lower saturation bound for dec_score()
static constexpr int8_t SCORE_MAX = 20; // upper saturation bound for inc_score()
static constexpr int8_t SCORE_THR = 0; // prune entries scoring below this; must stay <= SCORE_INIT
static constexpr int8_t SCORE_INS = 3; // score assigned to a newly inserted entry (see add())
common_ngram_mod(uint16_t n, size_t size);
size_t idx(const entry_t * tokens) const;
@@ -23,9 +29,27 @@ struct common_ngram_mod {
void reset();
// expose the hash index for external bookkeeping
size_t index(const entry_t * tokens) const;
// score handling
void inc_score(const entry_t * tokens);
void dec_score(const entry_t * tokens);
void inc_score_by_index(size_t i);
void dec_score_by_index(size_t i);
void prune_low_score(); // remove entries below SCORE_THR
size_t get_n() const;
size_t get_used() const;
void update_score_stats();
size_t get_collisions() const;
size_t get_below_thr() const;
size_t get_at_min() const;
size_t get_at_max() const;
size_t get_at_ins() const;
size_t size() const;
size_t size_bytes() const;
@@ -35,4 +59,15 @@ private:
size_t used;
std::vector<entry_t> entries;
// per-entry score, range SCORE_MIN .. SCORE_MAX
std::vector<int8_t> scores;
// stats
// count of hash collisions
size_t collisions = 0;
// score distribution counters, filled by update_score_stats()
size_t count_below_thr = 0;
size_t count_at_min = 0;
size_t count_at_max = 0;
size_t count_at_ins = 0;
};
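
A worked example of the pruning arithmetic under the constants above (a hypothetical standalone snippet, not taken from the diff):

common_ngram_mod mod(/*n=*/2, /*size=*/64);
common_ngram_mod::entry_t w[3] = {10, 11, 12}; // 2-gram key followed by its successor token
mod.add(w);                  // new entry: score = SCORE_INS = 3
for (int k = 0; k < 4; ++k) {
    mod.dec_score(w);        // 3 -> 2 -> 1 -> 0 -> -1
}
mod.prune_low_score();       // -1 < SCORE_THR (0), so the entry is cleared
// mod.get(w) returns common_ngram_mod::EMPTY again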

View File

@@ -527,6 +527,8 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
// consecutive accept rounds with low acceptance fraction (< 0.5)
int n_low = 0;
// hash indices of ngrams consulted during the most recent draft
std::vector<size_t> used_hashes;
// enable trace logging if LLAMA_TRACE is set
const bool verbose;
@@ -558,7 +560,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
constexpr double f_thold = 0.25;
if (f > f_thold) {
LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting\n", __func__, f, f_thold);
LOG_WRN("%s: ngram_mod occupancy %.2f exceeds threshold (%.2f) - resetting (collisions=%zu)\n", __func__, f, f_thold, mod.get_collisions());
mod.reset();
}
@@ -572,6 +574,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
GGML_UNUSED(params);
n_draft_last = 0;
used_hashes.clear();
const size_t cur_len = prompt_tgt.size();
if (cur_len < mod.get_n()) {
@@ -607,6 +610,8 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
break;
}
result[n + i] = token;
// remember which hash entry produced this token
used_hashes.push_back(mod.index(result.data() + i));
}
// only return the m tokens that were drafted
@@ -627,18 +632,37 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
// compute acceptance fraction if we have a recorded draft length
if (n_draft_last > 0) {
const double f_acc = (double)n_accepted / (double)n_draft_last;
// update per-ngram scores based on acceptance outcome
// bound by both counters in case the recorded hashes and the draft length ever disagree
for (size_t i = 0; i < used_hashes.size() && i < (size_t) n_draft_last; ++i) {
if (i < static_cast<size_t>(n_accepted)) {
mod.inc_score_by_index(used_hashes[i]);
} else {
mod.dec_score_by_index(used_hashes[i]);
}
}
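// three consecutive low-acceptance rounds trigger score-based pruning instead of a full reset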
if (f_acc < 0.5) {
n_low++;
if (n_low >= 3) {
LOG_WRN("%s: low acceptance streak (%d) resetting ngram_mod\n", __func__, n_low);
LOG_WRN("%s: low acceptance streak (%d) - pruning ngram_mod (collisions=%zu)\n", __func__, n_low, mod.get_collisions());
// log detailed score metrics before pruning
mod.update_score_stats();
LOG_WRN("%s: before prune scores - below_thr=%zu, at_min=%zu, at_max=%zu, at_ins=%zu\n",
__func__,
mod.get_below_thr(),
mod.get_at_min(),
mod.get_at_max(),
mod.get_at_ins());
mod.reset();
mod.prune_low_score();
n_low = 0;
}
} else {
n_low = 0;
}
}
used_hashes.clear();
}
};