Add precise_lambda()
This commit is contained in:
parent 8df1d00ae4
commit 66aff8fa1e
@@ -921,6 +921,108 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
        // Clamp to a reasonable range
        return (float)std::clamp(scale, 0.5, 2.0);
    };

    // Returns an adaptive lambda for this tensor using a small probe set.
    // bias_lambda adjusts the trade-off between systematic bias (introduced by block-wise scaling) and MSE;
    // a larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger.
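    //
    // Sketch of the underlying idea (assuming estimate_error() behaves roughly affinely in
    // lambda, i.e. err(lambda) ~= mse + lambda * proj): evaluating err at lambda = 0 and
    // lambda = 1 recovers both terms, and lambda = mse / proj is the point where the bias
    // projection penalty carries the same weight as the MSE term. For instance, mse = 4e-4
    // and proj = 2e-4 would suggest lambda ~= 2.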
    auto precise_lambda = [&](const ggml_tensor * t,
        const std::vector<float> & f32_sample,
        const std::vector<int64_t> & sample_rows_per_slice,
        const float * values,
        const float * activations,
        const std::vector<ggml_type> & compatible_candidates) -> float
    {
        // No activations => no projection term
        if (!activations) { return 0.0f; }

        // Pick a tiny probe set: try to spread around mid-range types
        std::vector<ggml_type> probes;
        probes.reserve(3);
        auto push_if = [&](const ggml_type tiny) {
            if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) {
                probes.push_back(tiny);
            }
        };

        // Prefer family-consistent probes; fall back to whatever exists
        push_if(GGML_TYPE_Q4_K);
        push_if(GGML_TYPE_Q3_K);
        push_if(GGML_TYPE_Q5_K);
        if (probes.empty() && !compatible_candidates.empty()) {
            probes.push_back(compatible_candidates[compatible_candidates.size() / 2]);
        }
        if (probes.size() == 1 && compatible_candidates.size() >= 2) {
            probes.push_back(compatible_candidates.front());
        }
        if (probes.empty()) { return 0.0f; }
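        // For example (illustrative, not from the original change): if compatible_candidates
        // holds the whole k-quant family {Q2_K .. Q6_K}, the probe set becomes {Q4_K, Q3_K, Q5_K}.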

        // Scratch buffers (reused)
        const int64_t n_per_row = t->ne[0];
        const size_t total_sampled_rows = f32_sample.size() / n_per_row;
        size_t max_row_sz = 0;
        for (auto pt : probes) {
            max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row));
        }
        std::vector<uint8_t> quantized_buffer(max_row_sz * total_sampled_rows);
        std::vector<float> dequantized_buffer(f32_sample.size());
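        // quantized_buffer is sized for the widest probe row, so the same allocation can be
        // reused for every probe type instead of reallocating inside the loop below.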

        std::vector<double> ratios;
        ratios.reserve(probes.size());

        for (const auto pt : probes) {
            // err at lambda = 0 => pure weighted MSE part
            const double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f);
            // err at lambda = 1 => weighted MSE + projection penalty
            const double err1 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 1.0f);

            const double p = std::max(0.0, err1 - err0); // projection term contribution
            const double m = std::max(0.0, err0);        // MSE term contribution
            if (p > epsilon && std::isfinite(m) && std::isfinite(p)) {
                ratios.push_back(m / p);
            }
        }

        if (ratios.empty()) { return 0.0f; }

        std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end());
        double lambda = ratios[ratios.size() / 2];
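        // std::nth_element() only partially sorts, which is all that is needed to pick out the
        // median; using the median rather than the mean presumably keeps one badly behaved
        // probe from dominating the estimate.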

        // Scale by the activations' directional alignment
        const float scale = directional_scale(values, activations, n_per_row);
        lambda *= scale;

        // Clamp to a safe range
        lambda = std::clamp(lambda, 0.0, 8.0);
        return (float)lambda;
    };

    auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) {
        if (!activations) { return 0.0f; }
        double s = 0.0;
        double s2 = 0.0;
        for (int64_t j = 0; j < n_per_row; ++j) {
            const double w = values ? std::max(0.0f, values[j]) : 1.0;
            const double aw = std::sqrt(w) * activations[j];
            const double aw2 = aw * aw;
            s += aw2;
            s2 += aw2 * aw2;
        }
        if (s2 <= 0.0) { return 0.0f; }
        const auto d = (double)n_per_row;
        // Map the participation ratio p = s^2 / (d * s2) in (0,1] to a lambda in [0,8], decreasing in p
        double base = 1.0 - s * s / (d * s2 + epsilon);
        base = std::clamp(base, 0.0, 1.0);
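        // Worked example (illustrative, not from the original change; ignoring epsilon):
        // with d = 4 and a uniform aw2 vector {1, 1, 1, 1}, s = 4 and s2 = 4, so
        // p = 16 / 16 = 1 and base = 0 (no bias penalty); with a one-hot {1, 0, 0, 0},
        // s = 1 and s2 = 1, so p = 1/4 and base = 0.75.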

        // Scale by the activations' directional alignment
        const double scale = directional_scale(values, activations, n_per_row);
        // Clamp to a safe range
        const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0;

        return (float)lambda;
    };
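    // Hypothetical usage sketch (the use_probes flag is assumed here, not part of this change):
    // fast_lambda() could serve as the cheap default, with precise_lambda() reserved for tensors
    // where the extra probe quantisations are worth the cost, e.g.
    //   const float bias_lambda = use_probes
    //       ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates)
    //       : fast_lambda(values, activations, t->ne[0]);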

    std::vector<tensor_info> all;
    all.reserve(tensors.size());
    for (const auto * tw : tensors) {