Merge ca7fff5159 into 8a98ba4582
This commit is contained in:
commit
7fd062ea62
|
|
@ -37,4 +37,5 @@ else()
|
||||||
add_subdirectory(export-lora)
|
add_subdirectory(export-lora)
|
||||||
endif()
|
endif()
|
||||||
add_subdirectory(fit-params)
|
add_subdirectory(fit-params)
|
||||||
|
add_subdirectory(expected-attention)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,9 @@
|
||||||
|
set(TARGET llama-expected-attention)

add_executable(${TARGET} expected-attention.cpp)

target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

# use ${TARGET} consistently (was a hard-coded repeat of the target name,
# which would silently break if the target is ever renamed above)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}/src)

if(LLAMA_TOOLS_INSTALL)
    install(TARGETS ${TARGET} RUNTIME)
endif()
|
||||||
|
|
@ -0,0 +1,496 @@
|
||||||
|
/* expected-attention.cpp */
|
||||||
|
|
||||||
|
/***********************************************************************************************
|
||||||
|
|
||||||
|
This program serves as a proof-of-concept for implementing _Expected Attention_ in llama.cpp.
|
||||||
|
We are just trying to see if this will be viable or not.
|
||||||
|
If we get it to work in this program, we will try to implement it into llama.cpp proper.
|
||||||
|
First we need to prove it can work.
|
||||||
|
|
||||||
|
|
||||||
|
NOTES
|
||||||
|
---
|
||||||
|
|
||||||
|
### Expected Attention: KV Cache Compression by Estimating Attention from Future Queries Distribution
|
||||||
|
|
||||||
|
> Memory consumption of the Key-Value (KV) cache represents a major bottleneck for efficient
|
||||||
|
> large language model (LLM) inference. While attention-score-based KV cache pruning shows
|
||||||
|
> promise, it faces critical practical limitations: attention scores from future tokens are
|
||||||
|
> unavailable during compression, and modern implementations like Flash Attention do not
|
||||||
|
> materialize the full attention matrix, making past scores inaccessible. To overcome these
|
||||||
|
> challenges, we introduce *Expected Attention*, **a training-free compression method** that
|
||||||
|
> estimates KV pairs importance by predicting how future queries will attend to them. Our
|
||||||
|
> approach leverages the distributional properties of LLM activations to compute expected
|
||||||
|
> attention scores in closed form for each KV pair. These scores enable principled ranking and
|
||||||
|
> pruning of KV pairs with minimal impact on the residual stream, achieving effective
|
||||||
|
> compression without performance degradation. Importantly, our method operates seamlessly
|
||||||
|
> across both prefilling and decoding phases, consistently outperforming state-of-the-art
|
||||||
|
> baselines in both scenarios. Finally, we release KVPress, a comprehensive library to enable
|
||||||
|
> researchers to implement and benchmark KV cache compression methods, already including more
|
||||||
|
> than 20 techniques.
|
||||||
|
|
||||||
|
refs:
|
||||||
|
- [arXiv:2510.00636v1](https://arxiv.org/abs/2510.00636v1)
|
||||||
|
- https://github.com/NVIDIA/kvpress
|
||||||
|
|
||||||
|
***********************************************************************************************/
|
||||||
|
|
||||||
|
#include <algorithm>
#include <charconv>
#include <cinttypes>
#include <cmath>
#include <cstring>
#include <limits>
#include <stdexcept>
#include <stdint.h>
#include <stdio.h>
#include <string_view>
#include <system_error>
#include <vector>

#include "ggml.h"
#include "llama.h"
|
||||||
|
|
||||||
|
// path to GGUF file to load from (compile-time constant for PoC - change me!)
static constexpr const char * MODEL_PATH = "/home/dylan/gguf/Qwen3-14B-Q4_K_X.gguf";

// NOTE(review): SCORE_EPS is not referenced anywhere in this file yet - it is
// reserved for the TODO score-computation step below
static constexpr float SCORE_EPS = 0.02f; // TODO: added to attention scores for numerical stability
|
||||||
|
|
||||||
|
// this struct holds the statistics we accumulate during graph execution via the callback.
//
// for every query head of every layer we keep a running mean and a running sum of squared
// deviations (Welford's online algorithm), one entry per head dimension, over all observed
// query activations.
struct expected_attn_stats {

    // callback index - track how many times the callback fires for both <ask, !ask>
    std::pair<size_t, size_t> idx = {0, 0};

    size_t n_runs   = 0; // number of distinct graph executions observed
    size_t n_tokens = 0; // number of tokens observed (incremented by n_ubatch for each run)

    std::vector<std::vector<size_t>> n_samples_per_head; // [layer][head]

    // we need to know these model hparams
    const int32_t n_embd    = 0; // native dimensionality of model
    const int32_t n_head    = 0; // number of query heads per layer
    const int32_t n_head_kv = 0; // number of KV heads per layer
    const int32_t n_layers  = 0; // number of layers in the model

    // these are computed at init based on the provided hparams
    const int32_t n_embd_head;    // dimensionality per query head
    const int32_t n_embd_head_kv; // dimensionality per KV head

    // observed_data[layer][head] = <mean, m2> where:
    //   - `mean` is the running mean for this query head
    //   - `m2`   is the running sum of squared differences from the mean
    //   - both vectors have length n_embd_head
    std::vector<std::vector<std::pair<std::vector<double>, std::vector<double>>>> observed_data;

    // scratch vectors reserved once and re-used for every call to print()
    mutable std::vector<double> all_means;
    mutable std::vector<double> all_vars;

    // initialize stats
    // NOTE(review): assumes n_head > 0 and n_head_kv > 0 (both are used as divisors)
    expected_attn_stats(
        const int32_t n_embd,
        const int32_t n_head,
        const int32_t n_head_kv,
        const int32_t n_layers
    ) : n_embd(n_embd),
        n_head(n_head),
        n_head_kv(n_head_kv),
        n_layers(n_layers),
        n_embd_head(n_embd / n_head),
        n_embd_head_kv(n_embd / n_head_kv)
    {
        fprintf(stdout,
            "expected_attn_stats: init: n_embd = %d, n_head = %d, n_head_kv = %d, "
            "n_layers = %d, n_embd_head = %d, n_embd_head_kv = %d\n",
            n_embd, n_head, n_head_kv, n_layers, n_embd_head, n_embd_head_kv
        );

        // resize outer vector for layers
        observed_data.resize(n_layers);
        n_samples_per_head.resize(n_layers);

        // for each layer, resize for number of query heads
        for (int32_t il = 0; il < n_layers; ++il) {
            observed_data[il].resize(n_head);
            n_samples_per_head[il].resize(n_head, 0);

            // for each head, initialize mean and m2 vectors
            for (int32_t ih = 0; ih < n_head; ++ih) {
                observed_data[il][ih].first.resize(n_embd_head, 0.0);  // mean
                observed_data[il][ih].second.resize(n_embd_head, 0.0); // m2
            }
        }

        all_means.reserve(n_head * n_embd_head);
        all_vars.reserve(n_head * n_embd_head);
    }

    // reset stats for all query heads in all layers (keeps the allocated storage)
    void reset() {
        idx.first  = 0;
        idx.second = 0;
        n_runs     = 0;
        n_tokens   = 0;

        for (int32_t il = 0; il < n_layers; ++il) {
            for (int32_t ih = 0; ih < n_head; ++ih) {
                auto & [mean, m2] = observed_data[il][ih];
                std::fill(mean.begin(), mean.end(), 0.0);
                std::fill(m2.begin(),   m2.end(),   0.0);
                n_samples_per_head[il][ih] = 0;
            }
        }
    }

    // compute the expected query distribution for all query heads in all layers based on the
    // currently accumulated statistics
    //
    // returns a pair of 3D vectors <mu_q, sigma_q>
    // the shape of each vector is [n_layers][n_head][n_embd_head]
    // (heads with fewer than 2 samples get an all-zero variance)
    const std::pair<
        std::vector<std::vector<std::vector<double>>>,
        std::vector<std::vector<std::vector<double>>>
    >
    compute() const {
        std::vector<std::vector<std::vector<double>>> mu_q(n_layers);
        std::vector<std::vector<std::vector<double>>> sigma_q(n_layers);

        for (int32_t il = 0; il < n_layers; ++il) {
            mu_q[il].resize(n_head);
            sigma_q[il].resize(n_head);

            for (int32_t ih = 0; ih < n_head; ++ih) {
                const auto & [mean, m2] = observed_data[il][ih];
                const size_t n = n_samples_per_head[il][ih];

                mu_q[il][ih] = mean;

                // compute (unbiased) variance from m2 (Welford's algorithm)
                sigma_q[il][ih].resize(n_embd_head, 0.0);
                if (n > 1) {
                    for (int32_t i = 0; i < n_embd_head; ++i) {
                        sigma_q[il][ih][i] = m2[i] / (n - 1);
                    }
                }
            }
        }
        return {mu_q, sigma_q};
    }

    // print captured query statistics: per layer, the mean +/- stddev of the per-dimension
    // means and variances aggregated across all query heads
    void print() const {
        fprintf(stdout, "%s: ------------------------------------------------------------\n", __func__);
        fprintf(stdout, "%s: captured query statistics\n", __func__);
        fprintf(stdout, "%s: ------------------------------------------------------------\n", __func__);
        // use %zu for size_t (was %ld - wrong on LLP64 platforms such as 64-bit Windows)
        fprintf(stdout, "%s: idx: <%zu, %zu>, n_runs: %zu, n_tokens: %zu\n",
            __func__, idx.first, idx.second, n_runs, n_tokens);
        fprintf(stdout, "%s: ------------------------------------------------------------\n", __func__);

        for (int32_t il = 0; il < n_layers; ++il) {
            // collect all means and variances for this layer
            all_means.clear();
            all_vars.clear();
            for (int32_t ih = 0; ih < n_head; ++ih) {
                const auto & [mean, m2] = observed_data[il][ih];
                const size_t n = n_samples_per_head[il][ih];

                for (int32_t i = 0; i < n_embd_head; ++i) {
                    all_means.push_back(mean[i]);

                    // variance is only defined with at least 2 samples
                    if (n > 1) {
                        double var = m2[i] / (n - 1);
                        all_vars.push_back(var);
                    }
                }
            }

            if (!all_means.empty()) {
                // compute mean and stddev of means
                double mean_of_means = 0.0;
                for (double val : all_means) {
                    mean_of_means += val;
                }
                mean_of_means /= all_means.size();

                double stddev_of_means = 0.0;
                for (double val : all_means) {
                    double diff = val - mean_of_means;
                    stddev_of_means += diff * diff;
                }
                stddev_of_means = std::sqrt(stddev_of_means / all_means.size());

                // compute mean and stddev of variances (stay 0.0 when no variances collected)
                double mean_of_vars   = 0.0;
                double stddev_of_vars = 0.0;
                if (!all_vars.empty()) {
                    for (double val : all_vars) {
                        mean_of_vars += val;
                    }
                    mean_of_vars /= all_vars.size();

                    for (double val : all_vars) {
                        double diff = val - mean_of_vars;
                        stddev_of_vars += diff * diff;
                    }
                    stddev_of_vars = std::sqrt(stddev_of_vars / all_vars.size());
                }

                fprintf(stdout, "%s: - layer %3d: mean: %8.4f \u00b1%3.1f, variance: %8.4f \u00b1%3.1f\n",
                    __func__, il, mean_of_means, stddev_of_means, mean_of_vars, stddev_of_vars);
            } else {
                fprintf(stdout, "%s: - layer %3d: [no data]\n", __func__, il);
            }
        }
    }

    // given a computed distribution, print per-layer min/max of mu and sigma
    void print_distribution(
        const std::pair<
            std::vector<std::vector<std::vector<double>>>,
            std::vector<std::vector<std::vector<double>>>
        > & dist
    ) const {
        // bind by const reference - the previous by-value structured binding deep-copied
        // the entire [n_layers][n_head][n_embd_head] distribution just to read it
        const auto & [mu_q, sigma_q] = dist;

        fprintf(stdout, "%s: ------------------------------------------------------------\n", __func__);
        fprintf(stdout, "%s: computed query distribution\n", __func__);
        fprintf(stdout, "%s: ------------------------------------------------------------\n", __func__);

        for (int32_t il = 0; il < n_layers; ++il) {
            if (!mu_q[il].empty() && !mu_q[il][0].empty()) {
                double min_mu    =  std::numeric_limits<double>::infinity();
                double max_mu    = -std::numeric_limits<double>::infinity();
                double min_sigma =  std::numeric_limits<double>::infinity();
                double max_sigma = -std::numeric_limits<double>::infinity();

                for (int32_t ih = 0; ih < n_head; ++ih) {
                    for (int32_t ie = 0; ie < n_embd_head; ++ie) {
                        min_mu    = std::min(min_mu,    mu_q[il][ih][ie]);
                        max_mu    = std::max(max_mu,    mu_q[il][ih][ie]);
                        min_sigma = std::min(min_sigma, sigma_q[il][ih][ie]);
                        max_sigma = std::max(max_sigma, sigma_q[il][ih][ie]);
                    }
                }

                fprintf(stdout, "%s: - layer %3d: mu [%8.3f, %8.3f], sigma [%8.3f, %8.3f]\n",
                    __func__, il, min_mu, max_mu, min_sigma, max_sigma);
            } else {
                fprintf(stdout, "%s: - layer %3d: [no data]\n", __func__, il);
            }
        }
    }

    // calculate the total number of samples observed across all query heads in all layers
    size_t n_samples() const {
        size_t total = 0;
        for (const auto & layer : n_samples_per_head) {
            for (size_t count : layer) {
                total += count;
            }
        }
        return total;
    }
};
|
||||||
|
|
||||||
|
// extract the trailing layer index from a tensor name such as "Qcur-12".
// returns the parsed index, or -1 when the name has no '-' or the suffix after the
// last '-' is not a plain non-negative decimal number.
static int32_t parse_layer_index(const char * name) {
    const std::string_view sv{name};

    const auto dash = sv.rfind('-');
    if (dash == std::string_view::npos) {
        return -1;
    }

    const std::string_view digits = sv.substr(dash + 1);

    int32_t il = -1;
    const char * first = digits.data();
    const char * last  = first + digits.size();

    // from_chars rejects empty input, signs, whitespace and overflow; requiring that the
    // whole suffix was consumed rejects trailing garbage like "3x"
    const auto res = std::from_chars(first, last, il);
    if (res.ec != std::errc{} || res.ptr != last) {
        return -1;
    }

    return il;
}
|
||||||
|
|
||||||
|
// check if this tensor name starts with "Qcur-" and is not a (view) or (permuted)
static bool tensor_name_match(const char * name) {
    // debug escape hatch: flip to true to observe every tensor
    constexpr bool match_all = false;
    if (match_all) {
        return true;
    }

    const bool is_qcur = strncmp(name, "Qcur-", 5) == 0;

    // a space indicates a suffix like " (view)" or " (permuted)"
    const bool has_suffix = strchr(name, ' ') != nullptr;

    return is_qcur && !has_suffix;
}
|
||||||
|
|
||||||
|
// print tensor name, shape, and type
|
||||||
|
static void print_tensor_info(const ggml_tensor * t) {
|
||||||
|
fprintf(stdout, "%s: name = %8s, shape = [ %6ld, %6ld, %6ld, %6ld ], type = %s\n",
|
||||||
|
__func__, t->name, t->ne[0], t->ne[1], t->ne[2], t->ne[3], ggml_type_name(t->type));
|
||||||
|
}
|
||||||
|
|
||||||
|
// expected attention eval callback function
|
||||||
|
static bool expected_attn_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
|
auto * stats = static_cast<expected_attn_stats*>(user_data);
|
||||||
|
if (ask) {
|
||||||
|
// the scheduler wants to know if we want to observe this tensor. if the shape is what
|
||||||
|
// we expect, and the tensor name matches, then yes, we do.
|
||||||
|
++stats->idx.first;
|
||||||
|
return (
|
||||||
|
// TODO: this check works for Qwen3 and likely many other models, but not all models
|
||||||
|
t->ne[0] == stats->n_embd_head &&
|
||||||
|
t->ne[1] == stats->n_head &&
|
||||||
|
tensor_name_match(t->name)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
// the scheduler is giving us a tensor to observe
|
||||||
|
print_tensor_info(t);
|
||||||
|
|
||||||
|
GGML_ASSERT(t->ne[0] == stats->n_embd_head && t->ne[1] == stats->n_head &&
|
||||||
|
"unexpected shape - this should not happen");
|
||||||
|
|
||||||
|
const int64_t n_tokens = t->ne[2];
|
||||||
|
const int32_t il = parse_layer_index(t->name);
|
||||||
|
|
||||||
|
// increment stat counters
|
||||||
|
++stats->idx.second;
|
||||||
|
if (il == 0) {
|
||||||
|
++stats->n_runs;
|
||||||
|
// only increment the n_tokens counter once per graph execution (not every layer)
|
||||||
|
// TODO: is there a more elegant way to check per-execution?
|
||||||
|
stats->n_tokens += n_tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate buffer and get the tensor data from the backend
|
||||||
|
const int64_t n_elements = stats->n_embd_head * stats->n_head * n_tokens;
|
||||||
|
GGML_ASSERT(n_elements == ggml_nelements(t));
|
||||||
|
std::vector<float> buffer(n_elements);
|
||||||
|
ggml_backend_tensor_get(t, buffer.data(), 0, ggml_nbytes(t));
|
||||||
|
|
||||||
|
//
|
||||||
|
// accumulate statistics from the tensor data using Welford's algorithm
|
||||||
|
//
|
||||||
|
|
||||||
|
// iterate over all tokens
|
||||||
|
for (int64_t it = 0; it < n_tokens; ++it) {
|
||||||
|
// for each query head
|
||||||
|
for (int64_t ih = 0; ih < stats->n_head; ++ih) {
|
||||||
|
++stats->n_samples_per_head[il][ih];
|
||||||
|
const size_t n = stats->n_samples_per_head[il][ih];
|
||||||
|
|
||||||
|
auto & mean = stats->observed_data[il][ih].first;
|
||||||
|
auto & m2 = stats->observed_data[il][ih].second;
|
||||||
|
|
||||||
|
// for each dimension in this head
|
||||||
|
for (int64_t ie = 0; ie < stats->n_embd_head; ++ie) {
|
||||||
|
const size_t idx = ie + ih * stats->n_embd_head + it * stats->n_embd_head * stats->n_head;
|
||||||
|
const double value = static_cast<double>(buffer[idx]);
|
||||||
|
|
||||||
|
// Welford's online algorithm
|
||||||
|
const double delta = value - mean[ie];
|
||||||
|
mean[ie] += delta / n;
|
||||||
|
const double delta2 = value - mean[ie];
|
||||||
|
m2[ie] += delta * delta2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true; // return false to cancel graph computation
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
|
||||||
|
// init llama_model
|
||||||
|
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
model_params.check_tensors = true;
|
||||||
|
model_params.n_gpu_layers = 999;
|
||||||
|
model_params.use_mmap = false;
|
||||||
|
model_params.use_mlock = false;
|
||||||
|
model_params.use_direct_io = false;
|
||||||
|
llama_model * model = llama_model_load_from_file(MODEL_PATH, model_params);
|
||||||
|
if (!model) {
|
||||||
|
throw std::runtime_error("failed to load model");
|
||||||
|
}
|
||||||
|
|
||||||
|
const int32_t n_embd = llama_model_n_embd(model);
|
||||||
|
const int32_t n_head = llama_model_n_head(model);
|
||||||
|
const int32_t n_head_kv = llama_model_n_head_kv(model);
|
||||||
|
const int32_t n_layers = llama_model_n_layer(model);
|
||||||
|
|
||||||
|
// callback statistics
|
||||||
|
expected_attn_stats cb_stats(n_embd, n_head, n_head_kv, n_layers);
|
||||||
|
|
||||||
|
// init llama_context
|
||||||
|
|
||||||
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
|
ctx_params.offload_kqv = true;
|
||||||
|
ctx_params.n_ubatch = 2560;
|
||||||
|
ctx_params.n_batch = 5120;
|
||||||
|
ctx_params.n_ctx = 5120;
|
||||||
|
ctx_params.kv_unified = true;
|
||||||
|
ctx_params.n_seq_max = 1;
|
||||||
|
ctx_params.n_threads = 8;
|
||||||
|
ctx_params.n_threads_batch = 8;
|
||||||
|
ctx_params.cb_eval = expected_attn_eval_cb;
|
||||||
|
ctx_params.cb_eval_user_data = &cb_stats;
|
||||||
|
ctx_params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED; // need to test flash attention both enabled and disabled
|
||||||
|
llama_context * ctx = llama_init_from_model(model, ctx_params);
|
||||||
|
if (!ctx) {
|
||||||
|
llama_model_free(model);
|
||||||
|
throw std::runtime_error("failed to create context");
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare dummy prompt processing input (TODO: eventually need to use real text)
|
||||||
|
llama_batch pp_batch = llama_batch_init(/* n_tokens */ ctx_params.n_batch, /* embd */ 0, /* n_seq_max */ ctx_params.n_seq_max);
|
||||||
|
pp_batch.n_tokens = ctx_params.n_batch;
|
||||||
|
for (int32_t i = 0; i < pp_batch.n_tokens; ++i) {
|
||||||
|
pp_batch.token[i] = (llama_token) i; // use position as token ID for now
|
||||||
|
pp_batch.pos[i] = (llama_pos) i;
|
||||||
|
pp_batch.n_seq_id[i] = 1;
|
||||||
|
pp_batch.seq_id[i][0] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// run dummy prompt processing
|
||||||
|
int32_t return_code = llama_decode(ctx, pp_batch);
|
||||||
|
if (return_code != GGML_STATUS_SUCCESS) {
|
||||||
|
llama_batch_free(pp_batch);
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_model_free(model);
|
||||||
|
throw std::runtime_error("dummy PP failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
// display accumulated statistics
|
||||||
|
cb_stats.print();
|
||||||
|
|
||||||
|
// compute query distribution
|
||||||
|
auto & dist = cb_stats.compute();
|
||||||
|
|
||||||
|
// print query distribution
|
||||||
|
cb_stats.print_distribution(dist);
|
||||||
|
|
||||||
|
//
|
||||||
|
// TODO: calculate importance scores for all KV entries based on `dist`
|
||||||
|
//
|
||||||
|
|
||||||
|
//
|
||||||
|
// TODO: evict the least important x% of KV entries
|
||||||
|
//
|
||||||
|
|
||||||
|
// cleanup
|
||||||
|
llama_batch_free(pp_batch); // llama_batch
|
||||||
|
llama_free(ctx); // llama_context
|
||||||
|
llama_model_free(model); // llama_model
|
||||||
|
return GGML_STATUS_SUCCESS;
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue