Parallelise candidate evaluation

parent 95b2ab2800
commit e01dad886b
@@ -610,7 +610,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     const std::unordered_map<std::string, std::vector<float>> * activations_data,
     const llama_model_quantize_params * params,
     int nthread,
-    int sample_rows_per_expert = 256,
+    int sample_rows_per_expert = 384,
     float bias_lambda = 1.0
 ) {
     struct candidate_types {
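Note: sample_rows_per_expert caps how many rows per expert slice the estimator actually quantizes when scoring a candidate type, so raising the default from 256 to 384 trades a slower search for a lower-variance error estimate. The row-selection logic itself sits outside this diff; the sketch below only illustrates one plausible evenly-strided scheme (sample_row_indices is a hypothetical helper, not code from this commit):

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical sketch: pick up to max_rows evenly strided row indices out of
// nrows. Because only rs of the nrows rows get scored, the per-slice error is
// later rescaled by nrows / rs (see the scale_rows hunk further down).
static std::vector<int64_t> sample_row_indices(int64_t nrows, int64_t max_rows) {
    const int64_t rs   = std::min(nrows, max_rows);
    const int64_t step = std::max<int64_t>(1, nrows / std::max<int64_t>(1, rs));
    std::vector<int64_t> rows;
    rows.reserve((size_t)rs);
    for (int64_t r = 0; r < nrows && (int64_t)rows.size() < rs; r += step) {
        rows.push_back(r);
    }
    return rows;
}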
@@ -758,16 +758,17 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<float> & deq) -> double
     {
         const int64_t n_per_row = t->ne[0];
-        const int64_t nrows = t->ne[1];
-        const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
+        const int64_t nrows     = t->ne[1];
+        const int64_t ne2       = t->ne[2] > 0 ? t->ne[2] : 1;

-        const size_t total_sampled_rows = f32_sample.size() / n_per_row;
+        const size_t nels = f32_sample.size();
+        const size_t total_sampled_rows = nels / (size_t)n_per_row;
         if (total_sampled_rows == 0) { return 0.0; }

         const size_t row_sz = ggml_row_size(typ, n_per_row);
         const size_t need_q = row_sz * total_sampled_rows;
         if (qbuf.size() < need_q) { qbuf.resize(need_q); }
-        if (deq.size() < f32_sample.size()) { deq.resize(f32_sample.size()); }
+        if (deq.size() < nels) { deq.resize(nels); }

         // Quantize sampled rows slice-by-slice
         size_t qoff = 0;
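For reference, the sizing above leans on ggml's row-size helper: a quantized row of n_per_row elements occupies ggml_row_size(typ, n_per_row) bytes, so qbuf needs that times the number of sampled rows, while deq holds one float per sampled element. A minimal sketch of the same arithmetic, assuming only the public ggml.h API (the helper names here are illustrative):

#include <cstdint>
#include "ggml.h"

// Scratch bytes needed to hold every sampled row quantized as `typ`.
static size_t quantized_sample_bytes(ggml_type typ, int64_t n_per_row, size_t sampled_rows) {
    return ggml_row_size(typ, n_per_row) * sampled_rows;
}

// Effective bits per weight of a candidate type at this row width; presumably
// how a helper like tensor_bpw derives its value.
static double type_bpw(ggml_type typ, int64_t n_per_row) {
    return 8.0 * (double)ggml_row_size(typ, n_per_row) / (double)n_per_row;
}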
@@ -777,31 +778,31 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             if (rs == 0) { continue; }

             const float * value = values_sample ? values_sample + slice * n_per_row : nullptr;

             (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value);

-            qoff += row_sz * rs;
-            foff += (size_t)rs * n_per_row;
+            qoff += row_sz * (size_t)rs;
+            foff += (size_t)rs * (size_t)n_per_row;
         }

-        // Dequantize to deq
+        // Dequantize into deq
         if (typ == GGML_TYPE_F16) {
-            ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)f32_sample.size());
+            ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels);
         } else if (typ == GGML_TYPE_BF16) {
-            ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)f32_sample.size());
+            ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels);
         } else {
             const ggml_type_traits * traits = ggml_get_type_traits(typ);
             if (!traits || !traits->to_float) {
                 // no dequantizer available
                 LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ));
                 return 1e35;
             }
-            traits->to_float(qbuf.data(), deq.data(), (int) f32_sample.size());
+            traits->to_float(qbuf.data(), deq.data(), (int) nels);
         }

         // Compute error
-        const double eps = 1e-12;
         size_t off = 0;
         double total_err = 0.0;
+        const double eps = 1e-12;

         for (int64_t slice = 0; slice < ne2; ++slice) {
             const int64_t rs = sample_rows_per_slice[slice];
@@ -817,9 +818,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 const float * y = deq.data() + off;

                 double mse_w = 0.0;
-                double x2_w = 0.0;
-                double bnum = 0.0;
-                double bden = 0.0;
+                double x2_w  = 0.0;
+                double bnum  = 0.0;
+                double bden  = 0.0;

                 if (wv && act) {
                     for (int64_t j = 0; j < n_per_row; ++j) {
@@ -828,8 +829,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                         const double a = act[j];
                         mse_w += w * e * e;
                         x2_w += w * x[j] * x[j];
-                        bnum += e * a;
-                        bden += a * a;
+                        bnum += w * e * a; // weighted bias
+                        bden += w * a * a; // weighted norm
                     }
                 } else if (wv) {
                     for (int64_t j = 0; j < n_per_row; ++j) {
@@ -856,7 +857,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 }

                 double row_err = mse_w / (x2_w + eps);
+
                 if (act && bias_lambda != 0.0) {
+                    // penalize squared projection of error onto activations
                     row_err += bias_lambda * (bnum * bnum) / (bden + eps);
                 }

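Written out, the per-row score these accumulators produce in the imatrix-plus-activations branch is the importance-weighted relative error plus a penalty on the error's projection onto the activations, with e_j = x_j - y_j the per-element quantization error, w_j the imatrix weights, a_j the mean activations, \lambda = bias_lambda and \varepsilon = 1e-12:

\mathrm{row\_err} \;=\; \frac{\sum_j w_j e_j^2}{\sum_j w_j x_j^2 + \varepsilon}
\;+\; \lambda \cdot \frac{\bigl(\sum_j w_j e_j a_j\bigr)^2}{\sum_j w_j a_j^2 + \varepsilon}

The penalty term vanishes when the error is uncorrelated with the activations, so it specifically punishes systematic bias that would shift the layer's output rather than just adding noise; this commit additionally importance-weights that term via w_j.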
@@ -864,7 +867,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 off += (size_t)n_per_row;
             }

-            // scale back up to the full number of rows in this slice
+            // scale to full rows in this slice (nrows)
             const double scale_rows = (double)nrows / std::max(1.0, (double)rs);
             total_err += slice_err * scale_rows;
         }
@@ -982,10 +985,14 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(

         // Compute maximum row size among compatible candidates (to size qbuf once)
         size_t max_row_sz = 0;
+        const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row;
         std::vector<ggml_type> compatible_candidates;
         compatible_candidates.reserve(quant_candidates.size());
         for (ggml_type ts_type : quant_candidates) {
-            if (is_iq(ts_type) && !values_all) { continue; }
+            if (is_iq(ts_type) && !has_valid_imatrix) {
+                LLAMA_LOG_WARN("%s: skipping IQ quantization for %s, no or mismatched imatrix provided\n", __func__, name.c_str());
+                continue;
+            }
             ggml_type tt = make_compatible(t, ts_type);
             if (!is_compatible(t, tt)) { continue; }
             compatible_candidates.push_back(tt);
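The stricter guard matters because IQ (i-quant) types cannot be quantized without an importance matrix, and values_sample is laid out slice-major (indexed as values_sample + slice * n_per_row in the quantization loop above), so it is only trustworthy when it holds exactly one value per column for each of the ne2 expert slices. As a standalone predicate (hypothetical helper mirroring the condition above):

#include <cstdint>
#include <vector>

// Usable only if values holds ne2 slices of n_per_row per-column importance
// values, matching the slice-major indexing used during quantization.
static bool imatrix_usable(const std::vector<float> & values, int64_t ne2, int64_t n_per_row) {
    return !values.empty() && values.size() == (size_t)ne2 * (size_t)n_per_row;
}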
@@ -996,13 +1003,37 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         std::vector<float> deq(f32_sample.size());

         // Now evaluate candidates
-        for (ggml_type tt : compatible_candidates) {
-            auto bpw = (float)tensor_bpw(t, tt);
-            size_t bytes = total_bytes(t, tt);
-            const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data();
-            const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data();
-            float err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, qbuf, deq);
-            info.candidate.push_back(candidate_types{ tt, bpw, bytes, err });
+        std::vector<candidate_types> cand_out(compatible_candidates.size());
+        const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data();
+        const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data();
+
+        int n_eval_threads = std::max(1, nthread);
+        std::atomic<size_t> cidx{0};
+        std::vector<std::thread> eval_workers;
+        eval_workers.reserve(n_eval_threads);
+
+        for (int ti = 0; ti < n_eval_threads; ++ti) {
+            eval_workers.emplace_back([&] {
+                // thread-local scratch
+                std::vector<uint8_t> tl_qbuf(qbuf.size());
+                std::vector<float> tl_deq(deq.size());
+
+                for (;;) {
+                    const size_t i = cidx.fetch_add(1, std::memory_order_relaxed);
+                    if (i >= compatible_candidates.size()) { break; }
+
+                    const ggml_type tt = compatible_candidates[i];
+                    const auto bpw = (float)tensor_bpw(t, tt);
+                    const size_t bytes = total_bytes(t, tt);
+                    const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, tl_qbuf, tl_deq);
+                    cand_out[i] = candidate_types{ tt, bpw, bytes, err };
+                }
+            });
         }
+        for (auto &th : eval_workers) { th.join(); }
+
+        for (auto &c : cand_out) {
+            if (c.bytes > 0) { info.candidate.push_back(c); }
+        }

         if (info.candidate.empty()) {
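This hunk is the heart of the commit: instead of scoring candidates sequentially with the shared qbuf/deq scratch, a pool of nthread workers pulls candidate indices off an atomic ticket counter and writes into pre-sized slots of cand_out, so no mutex is needed anywhere. Distilled into a generic helper (a minimal sketch; parallel_map is a hypothetical name, not part of this commit):

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <thread>
#include <vector>

// Dynamic work distribution via an atomic ticket counter: each worker grabs
// the next unclaimed index and writes to its own out[i] slot, so the only
// shared mutable state is the counter itself.
template <typename In, typename Out, typename Fn>
static void parallel_map(const std::vector<In> & in, std::vector<Out> & out, int nthread, Fn process) {
    out.resize(in.size());
    std::atomic<size_t> next{0};
    const int n = std::max(1, nthread);
    std::vector<std::thread> workers;
    workers.reserve(n);
    for (int t = 0; t < n; ++t) {
        workers.emplace_back([&] {
            for (;;) {
                const size_t i = next.fetch_add(1, std::memory_order_relaxed);
                if (i >= in.size()) { break; }
                out[i] = process(in[i]);
            }
        });
    }
    for (auto & w : workers) { w.join(); }
}

The ticket counter gives dynamic load balancing, which matters here because estimate_error is far more expensive for some candidate types than others; the thread-local tl_qbuf/tl_deq buffers serve as per-worker scratch so the quantize/dequantize calls stay race-free, and the bytes > 0 filter appears to act as a "was this slot filled" check while preserving a deterministic candidate order.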