Performance optimisations (AI assisted)

2026-01-22 10:38:16 +00:00 · 2026-01-22 10:38:16 +00:00 · 2ede173218
parent 1c23a6fbd2
commit 2ede173218
1 changed file with 111 additions and 120 deletions
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@ -949,6 +949,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        std::vector<float> & dequantized_buffer,
        float tensor_bias,
        const float * slice_bias,
+        float h_norm,
        const wce_cache * ref_wce = nullptr,
        const mse_cache * ref_mse = nullptr
    ) -> quant_error
@ -990,10 +991,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                        const float * v = has_vals ? values_sample + s * n_per_row : nullptr;
                        const float * a = activations_sample + s * n_per_row;
                        double denom = 0.0;
-                        for (int64_t j = 0; j < n_per_row; ++j) {
-                            const double w = v ? std::max(0.0f, v[j]) : 1.0;
-                            const double aj = a[j];
-                            denom += w * aj * aj;
+                        if (v) {
+                            for (int64_t j = 0; j < n_per_row; ++j) { denom += std::max(0.0f, v[j]) * a[j] * a[j]; }
+                        } else {
+                            for (int64_t j = 0; j < n_per_row; ++j) { denom += a[j] * a[j]; }
                        }

                        local_bias_denom[s] = denom;
@ -1009,9 +1010,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                    for (int64_t r = 0; r < rs; ++r) {
                        const float * x = f32_sample.data() + off;
                        double sum = 0.0;
-                        for (int64_t j = 0; j < n_per_row; ++j) {
-                            double xx = x[j];
-                            sum += (v ? std::max(0.0f, v[j]) : 1.0) * xx * xx;
+                        if (v) {
+                            for (int64_t j = 0; j < n_per_row; ++j) { sum += std::max(0.0f, v[j]) * x[j] * x[j]; }
+                        } else {
+                            for (int64_t j = 0; j < n_per_row; ++j) { sum += x[j] * x[j]; }
                        }

                        local_row_sq_norm.push_back(sum);
@ -1061,15 +1063,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(

        // Compute Error Metrics: Entropy-Modulated Weighted Cosine Error (WCE) - Experimental
        if (do_wce) {
-            float h_norm = 1.0f;
-            if (statistics_data) {
-                const std::string name = ggml_get_name(t);
-                const std::string key = remap_imatrix(name, mapped);
-                if (auto it = statistics_data->find(key); it != statistics_data->end() && !it->second.empty()) {
-                    h_norm = it->second.size() > 3 ? it->second[1] : 1.0f;
-                }
-            }
-
            double total_cos_error = 0.0;
            size_t off = 0;
            size_t sample_idx = 0;
@ -1093,44 +1086,24 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                    const bool calc_nx = !cached_norm_x;

                    // SIMD-friendly loops
-                    if (v) {
-                        if (calc_nx) {
-                            for (int64_t j = 0; j < n_per_row; ++j) {
-                                const double w = std::max(0.0f, v[j]);
-                                const double xj = wx[j];
-                                const double yj = wy[j];
-                                const double yw = yj * w;
-                                dot += xj * yw;
-                                ny += yj * yw;
-                                nx += xj * xj * w;
-                            }
-                        } else {
-                            nx = (* cached_norm_x)[sample_idx];
-                            for (int64_t j = 0; j < n_per_row; ++j) {
-                                const double w = std::max(0.0f, v[j]);
-                                const double yj = wy[j];
-                                const double yw = yj * w;
-                                dot += (double) wx[j] * yw;
-                                ny += yj * yw;
-                            }
+                    if (calc_nx) {
+                        for (int64_t j = 0; j < n_per_row; ++j) {
+                            const double w = std::max(0.0f, v[j]);
+                            const double xj = wx[j];
+                            const double yj = wy[j];
+                            const double yw = yj * w;
+                            dot += xj * yw;
+                            ny += yj * yw;
+                            nx += xj * xj * w;
                        }
                    } else {
-                        if (calc_nx) {
-                            for (int64_t j = 0; j < n_per_row; ++j) {
-                                const double xj = wx[j];
-                                const double yj = wy[j];
-                                dot += xj * yj;
-                                ny += yj * yj;
-                                nx += xj * xj;
-                            }
-                        } else {
-                            nx = (* cached_norm_x)[sample_idx];
-                            for (int64_t j = 0; j < n_per_row; ++j) {
-                                const double xj = wx[j];
-                                const double yj = wy[j];
-                                dot += xj * yj;
-                                ny += yj * yj;
-                            }
+                        nx = (* cached_norm_x)[sample_idx];
+                        for (int64_t j = 0; j < n_per_row; ++j) {
+                            const double w = std::max(0.0f, v[j]);
+                            const double yj = wy[j];
+                            const double yw = yj * w;
+                            dot += (double) wx[j] * yw;
+                            ny += yj * yw;
                        }
                    }

@ -1184,14 +1157,35 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                double w_err = 0.0;
                double bias_num = 0.0;

-                for (int64_t j = 0; j < n_per_row; ++j) {
-                    const double w = val ? std::max(0.0f, val[j]) : 1.0;
-                    const double e = y[j] - x[j];
-                    w_err += w * e * e;
-                    if (act) { bias_num += w * e * act[j]; }
+                if (val && act) {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                        const double w = std::max(0.0f, val[j]);
+                        const double e = y[j] - x[j];
+                        const double we = w * e;
+                        w_err += we * e;
+                        bias_num += we * act[j];
+                    }
+                } else if (val) {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                        const double w = std::max(0.0f, val[j]);
+                        const double e = y[j] - x[j];
+                        w_err += w * e * e;
+                    }
+                } else if (act) {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                         const double e = y[j] - x[j];
+                         w_err += e * e;
+                         bias_num += e * act[j];
+                    }
+                } else {
+                    for (int64_t j = 0; j < n_per_row; ++j) {
+                        const double e = y[j] - x[j];
+                        w_err += e * e;
+                    }
                }

-                const double m_norm = w_err / ((* ptr_row_sq_norm)[row_idx] + EPSILON);
+                const double rsn = (* ptr_row_sq_norm)[row_idx];
+                const double m_norm = rsn > EPSILON ? w_err / rsn : 0.0;
                slice_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : INFINITE);

                if (act) {
@ -1319,7 +1313,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
            const ggml_type src_type = tensor->type;
            const size_t src_row_sz = ggml_row_size(src_type, n_per_row);
            const ggml_type_traits * traits = ggml_get_type_traits(src_type);
-            std::vector<float> row_buf(n_per_row);

            for (int64_t slice = 0; slice < ne2; ++slice) {
                std::mt19937 rng(std::hash<std::string>{}(name) ^ HASH_MAGIC ^ slice);
@ -1330,18 +1323,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                int64_t count = 0;
                for (int64_t r = offset; r < nrows_total && count < limit; r += stride) {
                    const uint8_t * src = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz;
-                    if (src_type == GGML_TYPE_F32) {
-                        f32_sample.insert(f32_sample.end(), (const float*)src, (const float*)src + n_per_row);
-                    } else if (src_type == GGML_TYPE_F16 || src_type == GGML_TYPE_BF16) {
-                        if (src_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t*)src, row_buf.data(), (int)n_per_row); }
-                        else { ggml_bf16_to_fp32_row((const ggml_bf16_t*)src, row_buf.data(), (int)n_per_row); }
-                        f32_sample.insert(f32_sample.end(), row_buf.begin(), row_buf.end());
-                    } else if (traits && traits->to_float) {
-                        traits->to_float(src, row_buf.data(), (int)n_per_row);
-                        f32_sample.insert(f32_sample.end(), row_buf.begin(), row_buf.end());
-                    } else {
-                        throw std::runtime_error(format("unsupported source type %s for sampling", ggml_type_name(src_type)));
-                    }
+                    size_t cur_sz = f32_sample.size();
+                    f32_sample.resize(cur_sz + n_per_row);
+                    float * dst = f32_sample.data() + cur_sz;
+
+                    if (src_type == GGML_TYPE_F32) { std::memcpy(dst, src, n_per_row * sizeof(float)); }
+                    else if (src_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t*)src, dst, (int)n_per_row); }
+                    else if (src_type == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((const ggml_bf16_t*)src, dst, (int)n_per_row); }
+                    else if (traits && traits->to_float) { traits->to_float(src, dst, (int)n_per_row); }
+                    else { throw std::runtime_error(format("unsupported source type %s for sampling", ggml_type_name(src_type))); }

                    ++count;
                }
@ -1360,6 +1350,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        auto [val_ptr, val_sz] = get_side_data(values_data);
        auto [act_ptr, act_sz] = get_side_data(activations_data);

+        // Cache WCE stats once per tensor to avoid repeated map lookups/regex inside compute_quant_error
+        float h_norm = 1.0f;
+        if (valid_wce && statistics_data) {
+            const std::string key = remap_imatrix(name, mapped);
+            if (auto it = statistics_data->find(key); it != statistics_data->end() && !it->second.empty()) {
+                h_norm = it->second.size() > 3 ? it->second[1] : 1.0f;
+            }
+        }
+
        std::vector<float> val_vec;
        std::vector<float> act_vec;
        auto prepare_broadcast = [&](const float* src, size_t sz, std::vector<float>& dst) {
@ -1378,7 +1377,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        prepare_broadcast(val_ptr, val_sz, val_vec);
        prepare_broadcast(act_ptr, act_sz, act_vec);

-        // Precompute WCE reference stats (row_sq_norm) to avoid recalculation per candidate
+        // Precompute WCE reference stats
        wce_cache ref_wce;
        mse_cache ref_mse;
        size_t total_rows_sampled = 0;
@ -1386,13 +1385,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(

        if (valid_wce && !val_vec.empty() && !act_vec.empty()) {
            ref_wce.row_sq_norm.reserve(total_rows_sampled);
-
            size_t off = 0;
            for (int64_t s = 0; s < ne2; ++s) {
                const int64_t rs = rows_sample[s];
                if (rs == 0) { continue; }
                const float * v = val_vec.data() + s * n_per_row;
-
                for (int64_t r = 0; r < rs; ++r) {
                    const float * wx = f32_sample.data() + off;
                    double norm_x = 0.0;
@ -1405,43 +1402,45 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                }
            }
        } else {
-             // Precompute MSE reference stats (row_sq_norm and bias_denominator) to avoid recalculation per candidate
-             ref_mse.row_sq_norm.reserve(total_rows_sampled);
-             ref_mse.bias_denominator.assign(ne2, 0.0);
-             const bool has_acts = !act_vec.empty();
-             const bool has_vals = !val_vec.empty();
+            // Precompute MSE reference stats
+            ref_mse.row_sq_norm.reserve(total_rows_sampled);
+            ref_mse.bias_denominator.assign(ne2, 0.0);
+            const bool has_acts = !act_vec.empty();
+            const bool has_vals = !val_vec.empty();

-             // Bias Denominators
-             if (has_acts) {
-                 for (int64_t s = 0; s < ne2; ++s) {
-                     const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
-                     const float * a = act_vec.data() + s * n_per_row;
-                     double denom = 0.0;
-                     for (int64_t j = 0; j < n_per_row; ++j) {
-                         const double w = v ? std::max(0.0f, v[j]) : 1.0;
-                         const double aj = a[j];
-                         denom += w * aj * aj;
-                     }
-                     ref_mse.bias_denominator[s] = denom;
-                 }
-             }
+            if (has_acts) {
+                for (int64_t s = 0; s < ne2; ++s) {
+                    const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
+                    const float * a = act_vec.data() + s * n_per_row;
+                    double denom = 0.0;
+                    if (v) {
+                        for (int64_t j = 0; j < n_per_row; ++j) { denom += std::max(0.0f, v[j]) * a[j] * a[j]; }
+                    } else {
+                        for (int64_t j = 0; j < n_per_row; ++j) { denom += a[j] * a[j]; }
+                    }

-             // Row Squared Norms
-             size_t off = 0;
-             for (int64_t s = 0; s < ne2; ++s) {
-                 const int64_t rs = rows_sample[s];
-                 const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
-                 for (int64_t r = 0; r < rs; ++r) {
-                     const float * x = f32_sample.data() + off;
-                     double sum = 0.0;
-                     for (int64_t j = 0; j < n_per_row; ++j) {
-                         double xx = x[j];
-                         sum += (v ? std::max(0.0f, v[j]) : 1.0) * xx * xx;
-                     }
-                     ref_mse.row_sq_norm.push_back(sum);
-                     off += (size_t)n_per_row;
-                 }
-             }
+                    ref_mse.bias_denominator[s] = denom;
+                }
+            }
+
+            size_t off = 0;
+            for (int64_t s = 0; s < ne2; ++s) {
+                const int64_t rs = rows_sample[s];
+                const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
+                for (int64_t r = 0; r < rs; ++r) {
+                    const float * x = f32_sample.data() + off;
+                    double sum = 0.0;
+                    if (v) {
+                        for (int64_t j = 0; j < n_per_row; ++j) { sum += std::max(0.0f, v[j]) * x[j] * x[j]; }
+                    }
+                    else {
+                        for (int64_t j = 0; j < n_per_row; ++j) { sum += x[j] * x[j]; }
+                    }
+
+                    ref_mse.row_sq_norm.push_back(sum);
+                    off += (size_t)n_per_row;
+                }
+            }
        }

        // Build candidates
@ -1461,7 +1460,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        std::sort(valid_types.begin(), valid_types.end());
        valid_types.erase(std::unique(valid_types.begin(), valid_types.end()), valid_types.end());

-        // Calculate bias lambda to adjust the trade-off between MSE and systematic bias
        float tensor_lambda = 0.0f;
        std::vector<float> slice_lambdas = estimate_lambda(val_vec.empty()?nullptr:val_vec.data(), act_vec.empty()?nullptr:act_vec.data(), n_per_row, ne2);
        if (!slice_lambdas.empty()) {
@ -1492,6 +1490,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                dq_buf,
                tensor_lambda,
                slice_lambdas.data(),
+                h_norm,
                ptr_ref_wce,
                ptr_ref_mse
            );
@ -1507,35 +1506,29 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
            evaluations.push_back(candidate);
        }

-        // Select final quality metric (MSE or MSE + bias) if not using WCE
        type_choice ch;
        ch.w = tw;
        ch.n_elements = ggml_nelements(tensor);
        bool bias_needed = false;
        if (!valid_wce && !slice_lambdas.empty()) {
-            // Determine if bias correction is required
            double best_mse = INFINITE;
            double max_rel_bias = 0.0;
            for (const auto& c : evaluations) {
                if (c.bytes == 0) { continue; }
                best_mse = std::min(best_mse, c.mse);
-                // Check penalty term contribution (error - mse)
                if (c.mse > EPSILON) { max_rel_bias = std::max(max_rel_bias, std::max(0.0, c.error - c.mse) / c.mse); }
            }

-            // If penalty/bias is significant (>= 50% of MSE), use combined error, else pure MSE
            bias_needed = max_rel_bias >= 0.5;
        }

        for (const auto & ev : evaluations) {
            if (ev.bytes == 0) { continue; }
            type_scores ts = ev;
-            // If using WCE, c.error is already set
            if (!valid_wce && !bias_needed) { ts.error = ts.mse; }
            ch.candidates.push_back(ts);
        }

-        // Fallback if empty
        if (ch.candidates.empty()) {
            type_scores fb;
            fb.type = tensor->type;
@ -1544,15 +1537,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
            ch.candidates.push_back(fb);
        }

-        // Convex hull & Pareto Front simplification
        auto simplify_pareto = [](std::vector<type_scores> & candidates) {
            std::sort(candidates.begin(), candidates.end(), [](const auto& a, const auto& b) {
                return a.bytes < b.bytes || (a.bytes == b.bytes && a.error < b.error);
            });
            candidates.erase(std::unique(candidates.begin(), candidates.end(),
-                 [](const auto & a, const auto &b) { return a.bytes == b.bytes; }), candidates.end());
+                [](const auto & a, const auto &b) { return a.bytes == b.bytes; }), candidates.end());

-            // Lower envelope
            std::vector<type_scores> hull;
            double min_err = INFINITE;
            for(const auto & c : candidates) {
@ -1563,12 +1554,12 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
            }
            candidates = std::move(hull);

-            // Convex hull
            if (candidates.size() < 3) { return; }
            std::vector<type_scores> convex;
            auto cross = [](const auto& a, const auto& b, const auto& c) {
                return ((double)b.bytes - (double)a.bytes) * (c.error - a.error) - ((double)c.bytes - (double)a.bytes) * (b.error - a.error);
            };
+
            for (const auto & c : candidates) {
                while (convex.size() >= 2 && cross(convex[convex.size()-2], convex.back(), c) <= EPSILON) { convex.pop_back(); }
                convex.push_back(c);