Merge c3b6685599 into 58062860af

commit dd273d5914
@@ -528,10 +528,11 @@ struct common_params {
     int32_t i_chunk = 0; // start processing from this chunk
     int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)

     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
     bool show_statistics = false; // show imatrix statistics per tensor
-    bool parse_special = false; // whether to parse special tokens during imatrix tokenization
+    bool activation_statistics = false; // generate data to calculate activation based statistics
+    bool parse_special = false; // whether to parse special tokens during imatrix tokenization

     // cvector-generator params
     int n_pca_batch = 100;
@@ -10,7 +10,7 @@ More information is available in <https://github.com/ggml-org/llama.cpp/pull/486
 -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \
 [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \
 [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \
-[--show-statistics] [...]
+[--output-format gguf|dat] [--show-statistics] [...]
 ```

 Here `-m | --model` with a model name and `-f | --file` with a file containing calibration data (such as e.g. `wiki.train.raw`) are mandatory.
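(Note: as an illustration of the options above, a run that forces the legacy output format might look like the following; `model.gguf` and `calibration.txt` are placeholder file names, not files shipped with the repository.)

```
./llama-imatrix -m model.gguf -f calibration.txt -o imatrix.dat --output-format dat --output-frequency 10
```

This processes `calibration.txt`, saves intermediate results every 10 chunks, and writes the final importance matrix in the legacy `dat` format.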
@@ -20,19 +20,19 @@ The parameters in square brackets are optional and have the following meaning:
 * `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
 * `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used.
 * `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `--output-format` specifies the output format of the generated imatrix file. Either "gguf", or "dat" (the legacy format). Defaults to "gguf".
+* `--output-format` specifies the output format of the generated imatrix file. Either `gguf`, or `dat` (the legacy format). Defaults to `gguf`.
 * `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
 * `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 * `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets.
 * `--parse-special` enables parsing of special tokens (e.g., `<|im_start|>` in some models). Useful for models with custom tokenizers.
 * `--chunk | --from-chunk` to skip the first `n` chunks of tokens from the input data. Useful for resuming or skipping initial low-quality data.
-* `--chunks` maximum number of chunks to process. Default is -1 for all available chunks.
+* `--chunks` maximum number of chunks to process. Default is `-1` for all available chunks.
 * `--no-ppl` disables the calculation of perplexity for the processed chunks. Useful if you want to speed up the processing and do not care about perplexity.
 * `--show-statistics` displays imatrix file's statistics.

 For faster computation, make sure to use GPU offloading via the `-ngl | --n-gpu-layers` argument.

-Recent versions of `llama-imatrix` store data in GGUF format by default. For the legacy format, use an extension other than `.gguf` when saving the output file. More information is available in <https://github.com/ggml-org/llama.cpp/pull/9400>.
+Versions **b5942** and newer of `llama-imatrix` store data in GGUF format by default. For the legacy format, use `--output-format dat` when saving the output file. More information is available in <https://github.com/ggml-org/llama.cpp/pull/9400>.

 ## Examples
@@ -74,25 +74,25 @@ Recent versions of `llama-imatrix` store data in GGUF format by default. For the
 ./llama-imatrix --in-file imatrix.gguf --show-statistics
 ```

-`--show-statistics` will display the following statistics:
+## Statistics

 #### Per tensor

-* Σ(Act²): sum of all squared activations (the importance scores)
-* Min & Max: minimum and maximum squared activations values
-* μ & σ: Squared activations' mean and standard deviation
-* % Active: proportion of elements whose average squared activation exceeds a small threshold (1e-5). Helpful to determine how alive/dormant the tensor is during inference
-* N: number of squared activations
-* Entropy: entropy of the squared activation distribution, in bits (standard Shannon entropy measurement) $S = -\sum_{i=1}^N p_i \log_2 p_i$
-* E (norm): Normalized entropy. $E(norm)=\frac{-\sum_{i=1}^N p_i \log_2 p_i}{log_2 N}$. These two metrics can be used to determine how well a prompt "exercises" the model's capabilities
-* ZD Score: z-score distribution as described in _3.1 Layer Importance Scores_ of [Layer-Wise Quantization](https://arxiv.org/abs/2406.17415)
-* CosSim: cosine similarity with respect to the previous layer's tensor. Useful to determine how similar the squared activations of the current layer are to the previous layer's squared activations.
+* **Σ(Act²)** *(legacy mode)* / **L₂ Norm** *(preferred)*: in legacy mode, the raw sum of squared activations (sum of `Act²`); in preferred mode, the Euclidean distance (L₂ norm) between this tensor's average activations and those of the previous layer.
+* **Min / Max / μ / σ**: minimum, maximum, mean, and standard deviation of the tensor's elements.
+* **N**: number of tensor elements considered.
+* **H Norm**: Shannon entropy normalized over $\log_2 N$, defined as $H_{norm}=\frac{-\sum_{i=1}^N p_i \log_2 p_i}{\log_2 N}$. Used to determine how well a prompt "exercises" the model's capabilities.
+* **H** *(legacy mode)* / **ECS** *(preferred)*: in legacy mode, the Shannon entropy $H = -\sum_{i=1}^N p_i \log_2 p_i$; in preferred mode, the *Euclidean-Cosine Score*, defined as $ECS = K \cdot e^{-\alpha a} \cdot |b|^{\gamma}$, where `a` is the L₂ norm and `b` the cosine similarity between this tensor's elements and those of the previous layer, with `α = 0.01` and `γ = 10`. A higher score means more similarity and less change.
+* **ZD**: % of elements whose Z-score is > 1.0 in magnitude (an indicator of outliers), as described in _3.1 Layer Importance Scores_ of [Layer-Wise Quantization](https://arxiv.org/abs/2406.17415).
+* **CosSim**: cosine similarity of the mean activations between this tensor's elements and those of the previous layer.

 #### Per layer

-Weighted averages of Σ(Act²), ZD Score and CosSim are also calculated.
+Aggregated metrics per block/layer:

-#### Important note on the computed Statistics
+* **Σ(Act²)** *(legacy mode)* / **L₂ Norm** *(preferred)*: in legacy mode, the sum of squared activations (sum of `Act²`) over the layer's concatenated tensors; in preferred mode, the Euclidean distance (L₂ norm) between this layer's average concatenated tensor activations and those of the previous layer.
+* **ZD**: % of this layer's concatenated tensors' elements with |Z| > 1.
+* **CosSim**: cosine similarity of the mean activations between this layer's concatenated tensors' elements and the previous layer's.
+* **ECS** *(preferred only)*: Euclidean-Cosine Score applied to the layer.

-When using these statistics, please note that they are computed on the squared activations, **not on the actual (raw) activations**.
-Whilst the results are still useful, they're less realiable than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors.
+More information is available in <https://github.com/ggml-org/llama.cpp/pull/14891>.
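(Note: as a quick sanity check of the two entropy figures defined above, here is a worked example with a hypothetical three-element distribution $p = (0.5, 0.25, 0.25)$:)

$$
H = -\sum_{i=1}^{N} p_i \log_2 p_i = -\left(0.5\log_2 0.5 + 0.25\log_2 0.25 + 0.25\log_2 0.25\right) = 1.5\ \text{bits},
\qquad
H_{norm} = \frac{H}{\log_2 N} = \frac{1.5}{\log_2 3} \approx 0.946
$$

A value of $H_{norm}$ near 1 means the activation mass is spread almost uniformly across the tensor; values near 0 mean a few elements dominate.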
@@ -1,8 +1,8 @@
 #include "arg.h"
 #include "common.h"
-#include "log.h"
-#include "llama.h"
 #include "gguf.h"
+#include "llama.h"
+#include "log.h"

 #include <algorithm>
 #include <chrono>
@@ -10,14 +10,15 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <thread>
-#include <mutex>
-#include <vector>
 #include <fstream>
-#include <unordered_map>
 #include <map>
-#include <regex>
+#include <mutex>
 #include <numeric>
+#include <regex>
+#include <thread>
+#include <unordered_map>
+#include <valarray>
+#include <vector>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -29,7 +30,7 @@ static void print_usage(int, char ** argv) {
            " -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n"
            " [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
            " [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
-           " [--show-statistics] [...]\n" , argv[0]);
+           " [--output-format gguf|dat] [--show-statistics] [...]\n" , argv[0]);
     LOG("\n");
 }
@@ -38,6 +39,7 @@ static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
 static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size";

 struct Stats {
+    std::vector<float> activations;
     std::vector<float> values;
     std::vector<int64_t> counts;
 };
@@ -45,16 +47,16 @@ struct Stats {
 struct tensor_statistics {
     std::string tensor;
     Stats stats;
-    float total_sqract = 0.0f;
-    float mean_sqract = 0.0f;
-    float max_sqract = 0.0f;
-    float min_sqract = 0.0f;
+    float sum_values = 0.0f;
+    float mean_values = 0.0f;
+    float max_values = 0.0f;
+    float min_values = 0.0f;
     int elements = 0;
-    float stddev = 0.0f;
-    float active = 0.0f;
-    float entropy = 0.0f;
-    float zd = 0.0f;
-    float cossim = 0.0f;
+    float std_deviation = 0.0f;
+    float entropy = 0.0f;
+    float zd_score = 0.0f;
+    float cossim = 0.0f;
+    float l2_dist = 0.0f;
 };

 class IMatrixCollector {
@@ -97,13 +99,14 @@ static std::string filter_tensor_name(const char * name) {
 }

 static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) {
+    layer.clear();
+    tensor.clear();
+
     std::vector<std::string> name;
     std::istringstream stream(input);
     std::string item;

-    while (std::getline(stream, item, '.')) {
-        name.push_back(item);
-    }
+    while (std::getline(stream, item, '.')) { name.push_back(item); }
     for (size_t i = 0; i < name.size(); ++i) {
         if (name[i] == "blk" && i + 1 < name.size()) {
             layer = name[i + 1];
@@ -117,105 +120,302 @@ static void process_tensor_name(const std::string & input, std::string & layer,
         }
     }

-    if (tensor.empty()) {
-        tensor = input;
-    }
-    if (layer.empty()) {
-        layer = "-";
-    }
+    if (tensor.empty()) { tensor = input; }
+    if (layer.empty()) { layer = "-"; }
 }

-static void compute_statistics(std::vector<tensor_statistics> & tstats, const std::string & name, const Stats & e) {
-    if (e.values.size() % e.counts.size() != 0) {
-        LOG_ERR("%s: activation size mismatch for tensor %s (%zu vs %zu)\n", __func__, name.c_str(), e.counts.size(), e.values.size());
-        return;
-    }
-    if (e.counts.empty()) {
-        LOG_ERR("%s: there are no activations for tensor %s. The imatrix may be suboptimal\n", __func__, name.c_str());
-        return;
-    }
-
-    const int n_mat = e.counts.size();
-    const int row_size = e.values.size() / n_mat;
-
-    std::vector<float> activations;
-    activations.reserve(e.values.size());
-
-    for (int i = 0; i < n_mat; ++i) {
-        for (int j = 0; j < row_size; ++j) {
-            activations.push_back(e.values[i*row_size + j] / e.counts[i]);
-        }
-    }
-
-    const float act_total = std::accumulate(activations.begin(), activations.end(), 0.0f);
-    const float act_max = *std::max_element(activations.begin(), activations.end());
-    const float act_min = *std::min_element(activations.begin(), activations.end());
-    const float act_mean = act_total / activations.size();
-    const float act_sqr_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f);
-    const float act_var = (act_sqr_total / activations.size()) - (act_mean * act_mean);
-    const float act_dev = std::sqrt(std::max(0.0f, act_var));
-    float threshold = 1e-5f;
-    const int inactive_count = std::count_if(activations.begin(), activations.end(),
-                                             [threshold](const float v) { return fabsf(v) <= threshold; });
-    const float active_ratio = 1 - static_cast<float>(inactive_count) / activations.size();
-
-    float entropy = 0;
-    if (act_total > 0) {
-        for (const auto act : activations) {
-            if (const float p = act / act_total; p > 0) {
-                entropy -= p * std::log2(p);
-            }
-        }
-    }
-
-    int z_score = 0;
-    if (act_dev > 0.0f) {
-        for (const auto act : activations) {
-            if (const float p = (act - act_mean) / act_dev; p > 1) {
-                z_score++;
-            }
-        }
-    }
+static std::vector<float> compute_tensor_averages(const Stats & tstats) {
+    if (tstats.counts.empty()) { return {}; }
+    const size_t n_mat = tstats.counts.size();
+    const size_t len = !tstats.activations.empty() ? tstats.activations.size() : tstats.values.size();
+    if (len == 0 || n_mat == 0 || len % n_mat != 0) { return {}; }
+    const size_t row = len / n_mat;
+    std::vector<float> vec;
+    vec.reserve(len);
+
+    bool has_valid = false;
+    if (tstats.activations.empty()) {
+        // Mean of squares (legacy: only values are available)
+        for (size_t m = 0; m < n_mat; ++m) {
+            const float c = (float) tstats.counts[m];
+            const size_t off = m * row;
+            if (c <= 0.0f) {
+                for (size_t j = 0; j < row; ++j) { vec.push_back(0.0f); }
+                continue;
+            }
+
+            has_valid = true;
+            for (size_t j = 0; j < row; ++j) {
+                vec.push_back(tstats.values[off + j] / c);
+            }
+        }
+    } else {
+        // Mean (new format: activations + values)
+        for (size_t m = 0; m < n_mat; ++m) {
+            const float c = (float) tstats.counts[m];
+            const size_t off = m * row;
+            if (c <= 0.0f) {
+                for (size_t j = 0; j < row; ++j) { vec.push_back(0.0f); }
+                continue;
+            }
+
+            has_valid = true;
+            for (size_t j = 0; j < row; ++j) {
+                vec.push_back(tstats.activations[off + j] / c);
+            }
+        }
+    }
+
+    if (!has_valid) { return {}; }
+    return vec;
+}
+
+static bool compute_vector_statistics(std::vector<tensor_statistics> & tstats, const std::string & name, const Stats & e, bool & legacy) {
+    legacy = e.activations.empty();
+    const size_t n_mat = e.counts.size();
+    const size_t len = legacy ? e.values.size() : e.activations.size();
+    if (n_mat == 0 || len == 0) {
+        LOG_ERR("%s: there's no data for tensor %s. The imatrix may be suboptimal\n", __func__, name.c_str());
+        return false;
+    }
+    if (len % n_mat != 0) {
+        LOG_ERR("%s: activation size mismatch for tensor %s (len=%zu, counts=%zu)\n", __func__, name.c_str(), len, n_mat);
+        return false;
+    }
+    if (!legacy && e.values.size() != len) {
+        LOG_ERR("%s: activations/values size mismatch for tensor %s (act=%zu, val=%zu)\n", __func__, name.c_str(), len, e.values.size());
+        return false;
+    }
+
+    const size_t row_size = len / n_mat;
+    double mean = 0.0;
+    double M2 = 0.0;
+    double sum = 0.0;
+    float vmin = std::numeric_limits<float>::infinity();
+    float vmax = -std::numeric_limits<float>::infinity();
+    double energy_sum = 0.0;
+    size_t valid_n = 0;
+    for (size_t i = 0; i < n_mat; ++i) {
+        const auto c = (float)e.counts[i];
+        if (c <= 0.0f) { continue; } // skip experts with zero count
+        const size_t off = i * row_size;
+
+        for (size_t j = 0; j < row_size; ++j) {
+            const double v_avg = legacy ? 0.0 : (double)e.activations[off + j] / (double)c; // E[x]
+            const double v_energy = (double)e.values[off + j] / (double)c;                  // E[x^2]
+            const double v = legacy ? v_energy : v_avg;
+
+            ++valid_n;
+            sum += v;
+            vmin = std::min(vmin, (float)v);
+            vmax = std::max(vmax, (float)v);
+
+            const double delta = v - mean;
+            mean += delta / (double)valid_n;
+            M2 += delta * (v - mean);
+            energy_sum += std::max(0.0, v_energy);
+        }
+    }
+
+    if (valid_n == 0) {
+        LOG_ERR("%s: there's no data for tensor %s. The imatrix may be suboptimal\n", __func__, name.c_str());
+        return false;
+    }
+
+    float std_deviation = 0.0f;
+    float entropy = 0.0f;
+    double zd_count = 0.0;
+    double variance = valid_n > 1 ? M2 / ((double)valid_n - 1) : 0.0;
+    variance = std::max(variance, 0.0);
+    std_deviation = std::sqrt((float)variance);
+    if (energy_sum > 0.0) {
+        for (size_t i = 0; i < n_mat; ++i) {
+            const auto c = (float)e.counts[i];
+            if (c <= 0.0f) { continue; }
+            const size_t off = i * row_size;
+            for (size_t j = 0; j < row_size; ++j) {
+                const double v_energy = (double)e.values[off + j] / (double)c; // E[x^2]
+                const double w = std::max(0.0, v_energy);
+                const double p = w / energy_sum;
+                if (p > 0.0) { entropy -= (float)(p * std::log2(p)); }
+            }
+        }
+    }
+    if (std_deviation > 0.0f) {
+        for (size_t i = 0; i < n_mat; ++i) {
+            const auto c = (float)e.counts[i];
+            if (c <= 0.0f) { continue; }
+            const size_t off = i * row_size;
+            for (size_t j = 0; j < row_size; ++j) {
+                const double v_avg = legacy ? 0.0 : (double)e.activations[off + j] / (double)c; // E[x]
+                const double v_energy = (double)e.values[off + j] / (double)c;                  // E[x^2]
+                const auto v = (float)(legacy ? v_energy : v_avg);
+                const float z = (v - (float)mean) / std_deviation;
+                if (std::fabs(z) > 1.0f) { zd_count += 1.0; }
+            }
+        }
+    }

     auto & ts = tstats.emplace_back();
     ts.tensor = name;
     ts.stats = e;
-    ts.total_sqract = act_total;
-    ts.mean_sqract = act_mean;
-    ts.max_sqract = act_max;
-    ts.min_sqract = act_min;
-    ts.elements = static_cast<int>(activations.size());
-    ts.stddev = act_dev;
-    ts.active = active_ratio;
-    ts.entropy = entropy;
-    ts.zd = static_cast<float>(z_score) / ts.elements;
+    ts.sum_values = (float)sum;
+    ts.mean_values = (float)mean;
+    ts.max_values = vmax;
+    ts.min_values = vmin;
+    ts.elements = (int)valid_n;
+    ts.std_deviation = std_deviation;
+    ts.entropy = entropy;
+    ts.zd_score = (float)(zd_count / (double)valid_n);
+
+    return true;
 }

-static void compute_cossim(std::vector<tensor_statistics> & tstats) {
+static void compute_tensor_statistics(std::vector<tensor_statistics> & tstats) {
     static const std::regex pattern(R"(blk\.(\d+)\.)");
     for (auto & ts : tstats) {
+        ts.cossim = 1.0f;
+        ts.l2_dist = 0.0f;
+
         if (std::smatch match; std::regex_search(ts.tensor, match, pattern)) {
             const int blk = std::stoi(match[1]);
+            if (blk <= 0) { continue; }
             std::string tname(ts.tensor);
-            tname.replace(match.position(1), match.length(1), std::to_string(blk-1));
-            auto prev = std::find_if(tstats.begin(), tstats.end(),
+            tname.replace(match.position(1), match.length(1), std::to_string(blk - 1));
+            auto prev_it = std::find_if(tstats.begin(), tstats.end(),
                 [tname](const tensor_statistics & t) { return t.tensor == tname; });
-            if (prev != tstats.end()) {
-                const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(),
-                                                    prev->stats.values.begin(), 0.0f);
-                const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(),
-                                                                    ts.stats.values.begin(), 0.0f));
-                const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(),
-                                                                    prev->stats.values.begin(), 0.0f));
-                const float cs = dp / (curr_mag * prev_mag);
-                ts.cossim = cs;
+            if (prev_it == tstats.end()) {
+                LOG_WRN("%s: missing previous-layer tensor '%s' (current: '%s'). Statistics may not be accurate\n",
+                        __func__, tname.c_str(), ts.tensor.c_str());
+                continue;
             }
-        } else {
-            ts.cossim = 0;
+
+            const auto curr_avg = compute_tensor_averages(ts.stats);
+            const auto prev_avg = compute_tensor_averages(prev_it->stats);
+            if (curr_avg.empty() || curr_avg.size() != prev_avg.size()) {
+                LOG_WRN("%s: size mismatch between '%s' and its previous-layer tensor '%s' (%zu vs %zu). Statistics may not be accurate\n",
+                        __func__, ts.tensor.c_str(), tname.c_str(), curr_avg.size(), prev_avg.size());
+                continue;
+            }
+
+            float dot_prod = 0.0f;
+            float norm1_sq = 0.0f;
+            float norm2_sq = 0.0f;
+            float l2_dist_sq = 0.0f;
+
+            for (size_t i = 0; i < curr_avg.size(); ++i) {
+                const float c_val = curr_avg[i];
+                const float p_val = prev_avg[i];
+                dot_prod += c_val * p_val;
+                norm1_sq += c_val * c_val;
+                norm2_sq += p_val * p_val;
+                const float diff = c_val - p_val;
+                l2_dist_sq += diff * diff;
+            }
+
+            // Compute Cosine Similarity
+            float cs = 0.0f;
+            if (norm1_sq > 0.0f && norm2_sq > 0.0f) {
+                cs = dot_prod / (std::sqrt(norm1_sq) * std::sqrt(norm2_sq));
+                cs = std::min(cs, 1.0f);
+                cs = std::max(cs, -1.0f);
+            } else if (norm1_sq == 0.0f && norm2_sq == 0.0f) {
+                cs = 1.0f;
+            }
+            ts.cossim = cs;
+
+            // Compute L2 Norm (Euclidean Distance)
+            ts.l2_dist = std::sqrt(l2_dist_sq);
         }
     }
 }
+
+static void compute_layer_statistics(const std::vector<tensor_statistics> & tstats,
+                                     std::map<int, float> & layer_cossim,
+                                     std::map<int, float> & layer_l2_dist,
+                                     const std::unordered_map<std::string, Stats> & stats_map) {
+    struct layer_aggregation {
+        double sum_dot_prod = 0.0;
+        double sum_norm1_sq = 0.0;
+        double sum_norm2_sq = 0.0;
+        double sum_l2_dist_sq = 0.0;
+        int n_tensors = 0;
+    };
+
+    static const std::regex pattern(R"(blk\.(\d+)\.)");
+    std::map<int, layer_aggregation> l_agr;
+    for (const auto & ts : tstats) {
+        std::smatch match;
+        if (!std::regex_search(ts.tensor, match, pattern)) { continue; }
+        const int blk = std::stoi(match[1]);
+        if (blk <= 0) { continue; }
+
+        std::string prev_lyr(ts.tensor);
+        prev_lyr.replace(match.position(1), match.length(1), std::to_string(blk - 1));
+        auto it_curr = stats_map.find(ts.tensor);
+        auto it_prev = stats_map.find(prev_lyr);
+        if (it_curr == stats_map.end() || it_prev == stats_map.end()) { continue; }
+
+        const auto curr_avg = compute_tensor_averages(it_curr->second);
+        const auto prev_avg = compute_tensor_averages(it_prev->second);
+        if (curr_avg.empty() || prev_avg.empty()) { continue; }
+
+        if (curr_avg.size() != prev_avg.size()) {
+            LOG_WRN("%s: size mismatch between '%s' and its previous-layer tensor '%s' (%zu vs %zu) - skipping this tensor pair in layer statistics\n",
+                    __func__, ts.tensor.c_str(), prev_lyr.c_str(), curr_avg.size(), prev_avg.size());
+            continue;
+        }
+
+        // Compute statistics for each tensor pair individually
+        const size_t n = curr_avg.size();
+        GGML_ASSERT(n > 0);
+        double dot_prod = 0.0;
+        double norm1_sq = 0.0;
+        double norm2_sq = 0.0;
+        double l2_dist_sq = 0.0;
+        for (size_t i = 0; i < n; ++i) {
+            const double a = curr_avg[i];
+            const double b = prev_avg[i];
+            dot_prod += a * b;
+            norm1_sq += a * a;
+            norm2_sq += b * b;
+            const double d = a - b;
+            l2_dist_sq += d * d;
+        }
+
+        // Accumulate statistics for the layer
+        auto & entry = l_agr[blk];
+        entry.sum_dot_prod += dot_prod;
+        entry.sum_norm1_sq += norm1_sq;
+        entry.sum_norm2_sq += norm2_sq;
+        entry.sum_l2_dist_sq += l2_dist_sq;
+        entry.n_tensors++;
+    }
+
+    // Compute aggregated layer statistics
+    for (const auto & kv : l_agr) {
+        const int layer = kv.first;
+        const auto & agg = kv.second;
+        if (agg.n_tensors == 0) { continue; }
+
+        // Compute aggregated Cosine Similarity
+        float cossim = 0.0f;
+        if (agg.sum_norm1_sq > 0.0 && agg.sum_norm2_sq > 0.0) {
+            cossim = (float)(agg.sum_dot_prod / (std::sqrt(agg.sum_norm1_sq) * std::sqrt(agg.sum_norm2_sq)));
+            cossim = std::min(cossim, 1.0f);
+            cossim = std::max(cossim, -1.0f);
+        } else if (agg.sum_norm1_sq == 0.0 && agg.sum_norm2_sq == 0.0) {
+            cossim = 1.0f; // both vectors are zero then CosSim is 1
+        } else {
+            cossim = 0.0f; // one zero and the other non-zero then CosSim is 0
+        }
+
+        // Compute aggregated L2 Distance (Euclidean Distance)
+        layer_cossim[layer] = cossim;
+        layer_l2_dist[layer] = (float)std::sqrt(agg.sum_l2_dist_sq);
+    }
+}

 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     GGML_UNUSED(user_data);
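(Note: the single-pass mean/variance update in `compute_vector_statistics()` above is Welford's online algorithm. A minimal standalone sketch of the same update follows; the sample values are made up for illustration and are not part of the patch.)

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<double> xs = { 0.5, 1.5, 2.5, 4.0 }; // made-up samples
    double mean = 0.0;
    double M2   = 0.0; // running sum of squared deviations from the mean
    size_t n    = 0;
    for (const double x : xs) {
        ++n;
        const double delta = x - mean; // deviation from the old mean
        mean += delta / (double) n;    // shift the mean toward x
        M2   += delta * (x - mean);    // note: uses the *new* mean
    }
    const double variance = n > 1 ? M2 / (double) (n - 1) : 0.0; // sample variance
    std::printf("mean = %.4f, stddev = %.4f\n", mean, std::sqrt(variance));
    return 0;
}
```

This form is numerically stabler than the two-pass `E[x²] − E[x]²` computation it replaces, which is prone to catastrophic cancellation when the mean is large relative to the variance.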
@@ -281,6 +481,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             e.counts.resize(n_as, e.counts[0]);
         }
         if (e.values.empty()) {
+            e.activations.resize(src1->ne[0]*n_as, 0);
             e.values.resize(src1->ne[0]*n_as, 0);
             e.counts.resize(n_as, 0);
         }
@@ -312,6 +513,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                 e.counts[ex]++;

                 for (int64_t j = 0; j < src1->ne[0]; ++j) {
+                    e.activations[e_start + j] += x[j];
                     e.values[e_start + j] += x[j] * x[j];
                     if (!std::isfinite((float)e.values[e_start + j])) {
                         LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
@@ -351,6 +553,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             }
         }
         if (e.values.empty()) {
+            e.activations.resize(src1->ne[0] * n_mat, 0);
             e.values.resize(src1->ne[0] * n_mat, 0);
             e.counts.resize(1, 0);
         }
@@ -369,6 +572,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         for (int64_t row = 0; row < src1->ne[1]; ++row) {
             const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
             for (int64_t j = 0; j < src1->ne[0]; ++j) {
+                e.activations[mat_start + j] += x[j];
                 e.values[mat_start + j] += x[j] * x[j];
                 if (!std::isfinite((float)e.values[j])) {
                     LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str());
@@ -550,6 +754,7 @@ void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
         }

         to_store.push_back(kv.first);
+        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.activations.size(), GGML_MEM_ALIGN);
         data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
         data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
     }
@@ -602,6 +807,16 @@ void IMatrixCollector::save_imatrix(int32_t n_chunk) const {

             gguf_add_tensor(ctx_gguf, in_sum2);
             gguf_add_tensor(ctx_gguf, counts);
+
+            if (!stat.activations.empty()) {
+                const int32_t nact = (int32_t) stat.activations.size();
+                struct ggml_tensor * in_sum = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nact / nmat, nmat);
+                ggml_format_name(in_sum, "%s.in_sum", name.c_str());
+                for (int32_t j = 0; j < nact; ++j) {
+                    ((float *) in_sum->data)[j] = (float) stat.activations[j];
+                }
+                gguf_add_tensor(ctx_gguf, in_sum);
+            }
         }
     }
@@ -740,6 +955,7 @@ bool IMatrixCollector::load_imatrix(const char * file_name) {
         }
     }

+    const std::string in_sum_suffix{ ".in_sum" };
     const std::string in_sum2_suffix{ ".in_sum2" };
     const std::string counts_suffix{ ".counts" };
@@ -747,7 +963,7 @@ bool IMatrixCollector::load_imatrix(const char * file_name) {
     // checking for completeness of *each* loaded imatrix file
     // and also makes it easier to re-use a similar implementation in quantize.cpp
     // Using an ordered map to get a deterministic iteration order.
-    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
+    std::map<std::string, std::tuple<struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;

     for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
         std::string name = cur->name;
@@ -756,21 +972,26 @@ bool IMatrixCollector::load_imatrix(const char * file_name) {

         if (string_remove_suffix(name, in_sum2_suffix)) {
             // in_sum2
-            sums_counts_for[std::move(name)].first = cur;
+            std::get<0>(sums_counts_for[std::move(name)]) = cur;
         } else if (string_remove_suffix(name, counts_suffix)) {
             // counts
-            sums_counts_for[std::move(name)].second = cur;
-        } else {
+            std::get<1>(sums_counts_for[std::move(name)]) = cur;
+        } else if (string_remove_suffix(name, in_sum_suffix)) {
+            // in_sum
+            std::get<2>(sums_counts_for[std::move(name)]) = cur;
+        }
+        else {
             // ignore other tensors
         }
     }

     for (const auto & sc : sums_counts_for) {
         const std::string & name = sc.first;
-        const struct ggml_tensor * in_sum2 = sc.second.first;
-        const struct ggml_tensor * counts = sc.second.second;
+        const struct ggml_tensor * in_sum = std::get<2>(sc.second);
+        const struct ggml_tensor * in_sum2 = std::get<0>(sc.second);
+        const struct ggml_tensor * counts = std::get<1>(sc.second);

-        if (!in_sum2 || !counts) {
+        if (!in_sum2 || !counts || (in_sum != nullptr && ggml_nelements(in_sum) != ggml_nelements(in_sum2))) {
             LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
             gguf_free(ctx_gguf);
             ggml_free(ctx);
@@ -788,6 +1009,16 @@ bool IMatrixCollector::load_imatrix(const char * file_name) {
             ggml_free(ctx);
             return false;
         }
+        if (in_sum != nullptr) {
+            if (e.activations.empty()) {
+                e.activations.resize(nval, 0.0f);
+            } else if ((size_t) nval != e.activations.size()) {
+                LOG_ERR("%s: mismatched activations size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.activations.size());
+                gguf_free(ctx_gguf);
+                ggml_free(ctx);
+                return false;
+            }
+        }

         int64_t ncounts = ggml_nelements(counts);
         if (e.counts.empty()) {
@@ -804,6 +1035,7 @@ bool IMatrixCollector::load_imatrix(const char * file_name) {

         // Recreate the state as expected by save_imatrix()
         for (int64_t j = 0; j < nval; j++) {
+            if (in_sum != nullptr) { e.activations[j] += ((const float *) in_sum->data)[j]; }
             e.values[j] += ((const float *) in_sum2->data)[j];
         }
         for (int64_t j = 0; j < ncounts; j++) {
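(Note: putting the stored tensors together, the GGUF file now carries up to three arrays per weight: `in_sum` (Σx, when activation statistics are enabled), `in_sum2` (Σx²) and `counts`. A reader recovers the per-element mean and mean-square by dividing by the matching count, exactly as the loops above and `compute_tensor_averages()` do. A minimal sketch with made-up numbers:)

```cpp
#include <cstdio>
#include <vector>

int main() {
    // one matrix (n_mat = 1), a row of 3 elements, observed over 4 tokens
    const std::vector<float> in_sum  = { 4.0f,  8.0f, 2.0f }; // sum of x   per element
    const std::vector<float> in_sum2 = { 6.0f, 20.0f, 3.0f }; // sum of x^2 per element
    const float count = 4.0f;                                 // tokens accumulated

    for (size_t j = 0; j < in_sum.size(); ++j) {
        const float mean   = in_sum[j]  / count; // E[x],  drives the CosSim / L2 statistics
        const float energy = in_sum2[j] / count; // E[x^2], the importance score
        std::printf("j=%zu  E[x]=%.3f  E[x^2]=%.3f\n", j, mean, energy);
    }
    return 0;
}
```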
@@ -1082,105 +1314,177 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params, c
 }

 static bool show_statistics(const common_params & params) {
+    g_collector.set_params(params);
     std::vector<tensor_statistics> ts;
     if (params.in_files.empty() || params.in_files.size() > 1) {
         LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n");
         return false;
     }

+    bool has_activations = false;
+    bool no_activations = false;
     if (g_collector.load_imatrix(params.in_files[0].c_str())) {
-        for (const auto & [name, stats] :g_collector.get_mstats()) {
-            compute_statistics(ts, name, stats);
+        for (const auto & [name, stats] : g_collector.get_mstats()) {
+            bool legacy_imatrix = true;
+            if (!compute_vector_statistics(ts, name, stats, legacy_imatrix)) {
+                LOG_WRN("%s: tensor %s has no data - skipping\n", __func__, name.c_str());
+                continue;
+            }
+            if (legacy_imatrix) { no_activations = true; }
+            else { has_activations = true; }
         }
     } else {
         LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str());
         return false;
     }
-    if (!ts.empty()) {
-        compute_cossim(ts);
-    } else {
+    if (ts.empty()) {
         LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str());
         return false;
     }

+    if (has_activations && no_activations) {
+        LOG_ERR("Error: %s has mixed tensors with and without activations\n\n", params.in_files[0].c_str());
+        return false;
+    }
+
+    const bool legacy = !has_activations;
+    compute_tensor_statistics(ts);
+
     struct tensor_comparer {
+        bool legacy_mode;
+        explicit tensor_comparer(const bool legacy) : legacy_mode(legacy) {}
+
         bool operator()(const tensor_statistics & a, const tensor_statistics & b) const {
-            std::string layer, name_a, name_b;
-            ;
+            std::string layer;
+            std::string name_a;
+            std::string name_b;
             process_tensor_name(a.tensor, layer, name_a);
             process_tensor_name(b.tensor, layer, name_b);
-            return name_a < name_b || (name_a == name_b && a.total_sqract > b.total_sqract);
+            return legacy_mode ? name_a < name_b || (name_a == name_b && a.sum_values > b.sum_values)
+                               : name_a < name_b || (name_a == name_b && a.cossim > b.cossim);
         }
     };
-    std::sort(ts.begin(), ts.end(), tensor_comparer());
+    std::sort(ts.begin(), ts.end(), tensor_comparer(legacy));

-    struct weighted_stats {
-        float weighted_bias = 0.0f;
-        float weighted_zd = 0.0f;
-        float weighted_cossim = 0.0f;
-        int total_elements = 0;
+    struct layer_stats {
+        float layer_sum = 0.0f;
+        float layer_zd = 0.0f;
+        int n = 0;
     };
-    std::map<int, weighted_stats> ws;

-    LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast<int>(ts.size()));
-    LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", " Tensor", " Σ(Act²)",
-            " Min", " Max", " μ", " σ", " % Active", "N", " Entropy", "E (norm)", "ZD",
-            " CosSim");
+    std::map<int, layer_stats> ls;
+    LOG_INF("\nComputing tensor statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast<int>(ts.size()));
+    LOG_INF("\n%6s\t%18s\t%13s\t%8s\t%8s\t%7s\t%15s\t%13s\t%11s\t%8s\t%5s\t%10s\n",
+            "Layer",
+            "Tensor",
+            legacy ? "Σ E[Act²]" : "L₂ Dist",
+            "Min",
+            "Max",
+            "μ",
+            "σ",
+            "N",
+            "H Norm",
+            legacy ? "H" : "ECS",
+            "ZD",
+            "CosSim");
     LOG_INF(
         "=============================================================================================================="
-        "===========================================================\n");
+        "=============================================================\n");
+
+    // Euclidean-Cosine score
+    auto ecs = [](const float l2_dist, const float cossim) {
+        return 100.0f - (100.0f * (1.0f / (1.0f + ((2.0f / 3.0f) * l2_dist * l2_dist))) * ((1 + cossim) * 0.5f));
+    };
+
     for (const auto & tstat : ts) {
-        std::string layer, name;
+        std::string layer;
+        std::string name;
         process_tensor_name(tstat.tensor, layer, name);
+        const float h_norm = tstat.elements > 1 ? 100.0f * (tstat.entropy / std::log2((float) tstat.elements)) : 0.0f;

         int blk;
         try {
             blk = std::stoi(layer);
-        } catch (const std::exception & e) {
+        } catch (const std::exception &) {
             blk = -1; // not a block layer
         }

-        LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n",
-                layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract, tstat.mean_sqract,
-                tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy,
-                100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim);
+        LOG_INF("%5s\t%-20s\t%11.4f\t%10.4f\t%10.4f\t%8.4f\t%8.4f\t%7d\t%10.2f%%\t%10.4f\t%6.2f%%\t%10.4f\n",
+                layer.c_str(),
+                name.c_str(),
+                legacy ? tstat.sum_values : tstat.l2_dist,
+                tstat.min_values,
+                tstat.max_values,
+                tstat.mean_values,
+                tstat.std_deviation,
+                tstat.elements,
+                h_norm,
+                legacy ? tstat.entropy : ecs(tstat.l2_dist, tstat.cossim),
+                100.0f * tstat.zd_score,
+                tstat.cossim);

-        const float weighted_bias = tstat.elements * tstat.total_sqract;
-        const float weighted_zd = tstat.elements * tstat.zd;
-        const float weighted_cossim = tstat.elements * tstat.cossim;
-
-        if (ws.find(blk) != ws.end()) {
-            ws[blk].weighted_bias += weighted_bias;
-            ws[blk].weighted_zd += weighted_zd;
-            ws[blk].weighted_cossim += weighted_cossim;
-            ws[blk].total_elements += tstat.elements;
+        const float zd = (float)tstat.elements * tstat.zd_score;
+        if (ls.find(blk) != ls.end()) {
+            if (legacy) { ls[blk].layer_sum += tstat.sum_values; }
+            ls[blk].layer_zd += zd;
+            ls[blk].n += tstat.elements;
         } else {
-            weighted_stats temp_ws;
-            temp_ws.weighted_bias = weighted_bias;
-            temp_ws.weighted_zd = weighted_zd;
-            temp_ws.weighted_cossim = weighted_cossim;
-            temp_ws.total_elements = tstat.elements;
-            ws[blk] = temp_ws;
+            layer_stats temp_ls;
+            if (legacy) { temp_ls.layer_sum = tstat.sum_values; }
+            else { temp_ls.layer_sum = 0.0f; }
+            temp_ls.layer_zd = zd;
+            temp_ls.n = tstat.elements;
+            ls[blk] = temp_ls;
         }
     }

-    const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; });
-    LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers);
-    LOG_INF("\n%s\t%s\t%s\t%s\n", " Layer", " μΣ(Act²)", " μZD", "μCosSim");
-    LOG_INF("================================================\n");
-    for (const auto & [first, second] : ws) {
-        const auto & layer = first;
-        const auto & stats = second;
-
-        if (stats.total_elements == 0) {
+    std::map<int, float> layer_cossim;
+    std::map<int, float> layer_l2_dist;
+    compute_layer_statistics(ts, layer_cossim, layer_l2_dist, g_collector.get_mstats());
+
+    const size_t layers = std::count_if(ls.begin(), ls.end(), [](const auto & kv) { return kv.first >= 0; });
+    LOG_INF("\nComputing layer statistics (%zu layers)\n", layers);
+    LOG_INF("\n%6s\t%13s\t%6s\t%11s\t%6s\n",
+            "Layer",
+            legacy ? "Σ E[Act²]" : "L₂ Dist",
+            "ZD",
+            "CosSim",
+            legacy ? "" : "ECS");
+    if (legacy) {
+        LOG_INF("============================================\n");
+    } else {
+        LOG_INF("=========================================================\n");
+    }
+    for (const auto & [layer, stats] : ls) {
+        if (layer < 0 || stats.n == 0) { continue; }
+        const auto lcs = layer_cossim.find(layer);
+        const auto ll2n = layer_l2_dist.find(layer);
+        float layer_cs = 0.0f;
+        float layer_l2n = 0.0f;
+
+        if (lcs != layer_cossim.end() && ll2n != layer_l2_dist.end()) {
+            layer_cs = lcs->second;
+            layer_l2n = ll2n->second;
+        } else if (layer == 0) {
+            layer_cs = 1.0f;
+            layer_l2n = 0.0f;
+        } else {
             continue;
         }

-        if (layer >= 0) {
-            const float bias = stats.weighted_bias / stats.total_elements;
-            const float zd = stats.weighted_zd / stats.total_elements;
-            const float cossim = stats.weighted_cossim / stats.total_elements;
-
-            LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim);
+        if (legacy) {
+            LOG_INF("%5d\t%11.4f\t%6.2f%%\t%11.4f\n",
+                    layer,
+                    stats.layer_sum,
+                    100.0f * stats.layer_zd / stats.n,
+                    layer_cs);
+        } else {
+            LOG_INF("%5d\t%11.4f\t%6.2f%%\t%11.4f\t%8.4f\n",
+                    layer,
+                    layer_l2n,
+                    100.0f * stats.layer_zd / stats.n,
+                    layer_cs,
+                    ecs(layer_l2n, layer_cs));
         }
     }
     LOG_INF("\n");
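(Note: transcribing the `ecs` lambda added in this hunk, with $d$ the L₂ distance and $c$ the cosine similarity to the previous layer, the score reported in the `ECS` columns is the following; the README hunk earlier documents the score in an exponential form.)

$$
\mathrm{ecs}(d, c) = 100 - 100 \cdot \frac{1}{1 + \tfrac{2}{3}\,d^{2}} \cdot \frac{1 + c}{2}
$$

As written, identical neighbours ($d = 0$, $c = 1$) evaluate to 0, and the value approaches 100 as the distance grows or the similarity drops.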