From 14fae69a7bb932fadbc5dd62072a254866512650 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 20 Sep 2025 21:31:31 +0100
Subject: [PATCH] General refactoring

---
 src/llama-quant.cpp | 75 +++++++++++++++++++++++----------------------
 1 file changed, 39 insertions(+), 36 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index c6051a480c..6e5562379c 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -729,19 +729,19 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     auto estimate_error = [&](const ggml_tensor * t,
                               const ggml_type quant_type,
                               const std::vector<float> & f32_sample,
-                              const std::vector<int64_t> & sample_rows_per_slice,
+                              const std::vector<int64_t> & rows_sample,
                               const float * values_sample,
                               const float * activations_sample,
                               std::vector<uint8_t> & quantized_buffer,
                               std::vector<float> & dequantized_buffer,
-                              float bias_lambda,
+                              float tensor_bias_lambda,
+                              const float * slice_bias_lambda,
                               double * out_mse = nullptr,
                               double * out_proj = nullptr) -> double {
         const int64_t n_per_row = t->ne[0];
         const int64_t nrows = t->ne[1];
         const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
-
         const size_t sample_element_count = f32_sample.size();
         const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0;
         if (sample_row_count == 0) {
@@ -753,8 +753,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         size_t expected_rows = 0;
         for (int64_t s = 0; s < ne2; ++s) {
-            expected_rows += (size_t)sample_rows_per_slice[s];
+            expected_rows += (size_t)rows_sample[s];
         }
+
         if (expected_rows != sample_row_count) {
             if (out_mse) { *out_mse = infinity; }
             if (out_proj) { *out_proj = 0.0; }
@@ -783,17 +784,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                     const double a = activations[j];
                     denom += w * a * a;
                 }
+
                 bias_denominator_per_slice[s] = denom;
             }
         }
 
-        // Per-row squared norms with weighting
+        // Weighted per-row squared norms
        std::vector<double> row_sq_norm(sample_row_count, 0.0);
         {
             size_t offset = 0;
             size_t row_idx = 0;
             for (int64_t s = 0; s < ne2; ++s) {
-                const int64_t rs = sample_rows_per_slice[s];
+                const int64_t rs = rows_sample[s];
                 if (rs == 0) { continue; }
 
                 const float * values = has_values ? values_sample + s * n_per_row : nullptr;
@@ -823,7 +825,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         size_t q_offset = 0;
         size_t f_offset = 0;
         for (int64_t slice = 0; slice < ne2; ++slice) {
-            const int64_t rs = sample_rows_per_slice[slice];
+            const int64_t rs = rows_sample[slice];
             if (rs == 0) { continue; }
 
             const float * value = has_values ? values_sample + slice * n_per_row : nullptr;
@@ -843,21 +845,19 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         } else {
             for (size_t r = 0; r < sample_row_count; ++r) {
                 uint8_t * src = quantized_buffer.data() + r * row_sz;
-                float * dst = dequantized_buffer.data() + r * (size_t) n_per_row;
+                float * dst = dequantized_buffer.data() + r * (size_t)n_per_row;
                 if (is_fp16) {
-                    ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int) n_per_row);
-                }
-                else if (is_bf16) {
-                    ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int) n_per_row);
-                }
-                else {
+                    ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row);
+                } else if (is_bf16) {
+                    ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row);
+                } else {
                     if (!traits || !traits->to_float) {
                         if (out_mse) { *out_mse = infinity; }
                         if (out_proj) { *out_proj = 0.0; }
                         return infinity;
                     }
-                    traits->to_float(src, dst, (int) n_per_row);
+                    traits->to_float(src, dst, (int)n_per_row);
                 }
             }
         }
 
@@ -1098,20 +1098,20 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
                 offset = dist(rng);
             }
-            for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) {
+            for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) {
                 if (src_type == GGML_TYPE_F32) {
-                    const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row;
+                    const float * src_row = (const float *)tensor->data + slice * (n_per_row * nrows_total) + r * n_per_row;
                     f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row);
                 } else if (src_type == GGML_TYPE_F16) {
-                    const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
+                    const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
                     ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row);
                     f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end());
                 } else if (src_type == GGML_TYPE_BF16) {
-                    const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
+                    const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
                     ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row);
                     f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end());
                 } else if (src_is_quant) {
-                    const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz;
+                    const uint8_t * qrow = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz;
                     if (!src_traits || !src_traits->to_float) {
                         throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type)));
                     }
@@ -1120,9 +1120,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 } else {
                     throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type)));
                 }
+
                 ++current_sampled_rows;
             }
-            sample_rows_per_slice[slice] = current_sampled_rows;
+
+            rows_sample[slice] = current_sampled_rows;
         }
 
         auto side_data = [&](const std::unordered_map<std::string, std::vector<float>> * m, const std::string & tensor_name) -> std::pair<const float *, size_t> {
@@ -1160,7 +1162,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); }
         if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); }
 
-        const int64_t nelem = ggml_nelements(t);
+        const int64_t nelem = ggml_nelements(tensor);
         tensor_info info;
         info.w = tw;
         info.n_elements = nelem;
@@ -1185,8 +1187,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                     __func__, ggml_type_name(ts_type), name.c_str());
                 continue;
             }
-            ggml_type tt = make_compatible(t, ts_type);
-            if (!is_compatible(t, tt)) { continue; }
+
+            ggml_type tt = make_compatible(tensor, ts_type);
+            if (!is_compatible(tensor, tt)) { continue; }
             compatible_candidates.push_back(tt);
             max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row));
         }
@@ -1222,16 +1225,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
             // thread-local scratch
             std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
             std::vector<float> tl_dequantised_buffer(dequantised_buffer.size());
-
             for (;;) {
                 const size_t i = cidx.fetch_add(1, std::memory_order_relaxed);
                 if (i >= compatible_candidates.size()) { break; }
-                const ggml_type tt = compatible_candidates[i];
-                const auto bpw = (float)tensor_bpw(t, tt);
-                const size_t bytes = tensor_bytes(t, tt);
-                const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda);
-                eval_candidates[i] = candidate_types{ tt, bpw, bytes, err };
+                const ggml_type tensor_types = compatible_candidates[i];
+                const auto bpw = (float)tensor_bpw(tensor, tensor_types);
+                const size_t bytes = tensor_bytes(tensor, tensor_types);
+                const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
+                                                tl_quantized_buffer, tl_dequantised_buffer, tensor_lambda, slice_lambda);
+                eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err };
             }
         });
     }
@@ -1244,8 +1247,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         if (info.candidate.empty()) {
             // As a last resort, keep original type
-            float bpw = ggml_nbytes(t) * 8.0f / nelem;
-            info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 });
+            float bpw = ggml_nbytes(tensor) * 8.0f / nelem;
+            info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 });
         }
 
         // Keep only the pareto-optimal candidates: if A has >= bytes and >= error than B, drop A.
@@ -1274,6 +1277,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 // same bytes: we already sorted by error; skip
             }
         }
+
         info.candidate.swap(pruned);
     }
 
@@ -1299,6 +1303,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             }
             convex.push_back(p);
         }
+
         info.candidate.swap(convex);
     }
 }
@@ -1312,7 +1317,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     if (all.empty()) {
         return {};
     }
-    // Lagrangian relaxation to minimise error subject to a bpw target constraint
     auto total_bytes = [&]() -> size_t {
         size_t tb = 0;
         for (const auto & ti : all) {
@@ -1359,6 +1363,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return emit_overrides();
     }
 
+    // Lagrangian relaxation to minimise error subject to a bpw target constraint
     auto lagrange_penalty = [&](const double mu, std::vector<int> & choice, size_t & bytes, double & err) {
         choice.resize(all.size());
         bytes = 0;
@@ -1406,6 +1411,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (bytes_hi <= budget_bytes) {
             break;
         }
+
         mu_hi *= 2.0;
         if (++expand > 60) {
             break;
@@ -1422,11 +1428,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         lagrange_penalty(mu, choice_mid, bytes_mid, err_mid);
         const double gap = std::abs((double)bytes_mid - (double)budget_bytes);
-
         if (bytes_mid > budget_bytes) {
             // Too big, need stronger penalty
             mu_lo = mu;
-
             if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) {
                 best_over_gap = gap;
                 best_over_err = err_mid;
@@ -1435,7 +1439,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         } else {
             // Under budget, good candidate
             mu_hi = mu;
-
             if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) {
                 best_under_gap = gap;
                 best_under_err = err_mid;
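
Reviewer note, not part of the patch: the "pareto-optimal candidates" pass above keeps, per tensor, only the quantisation candidates that are not dominated on (bytes, error). A minimal standalone sketch of that idea follows; the `candidate` struct and `pareto_prune` helper are hypothetical stand-ins for the file's `candidate_types` and its in-place pruning loop, not the patch's actual code.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // Stand-in for candidate_types: only the fields the pruning looks at.
    struct candidate {
        size_t bytes;
        double error;
    };

    // Sort by bytes (ties broken by error), then keep a candidate only when it
    // strictly improves the best error seen so far. Anything that costs >= bytes
    // and has >= error than another candidate is dominated and dropped.
    static std::vector<candidate> pareto_prune(std::vector<candidate> c) {
        std::sort(c.begin(), c.end(), [](const candidate & a, const candidate & b) {
            return a.bytes != b.bytes ? a.bytes < b.bytes : a.error < b.error;
        });
        std::vector<candidate> pruned;
        double best_error = std::numeric_limits<double>::infinity();
        for (const auto & x : c) {
            if (x.error < best_error) {
                pruned.push_back(x);
                best_error = x.error;
            }
        }
        return pruned;
    }

    int main() {
        // The 300-byte candidate is dominated: bigger and worse than the 200-byte one.
        for (const auto & x : pareto_prune({ { 100, 0.10 }, { 200, 0.05 }, { 300, 0.07 } })) {
            printf("%zu bytes, error %.2f\n", x.bytes, x.error);
        }
        return 0;
    }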
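
Reviewer note, not part of the patch: the mu loop in the last hunks is the Lagrangian relaxation named in the moved comment: for a fixed penalty mu, each tensor independently picks the candidate minimising error + mu * bytes; total bytes is non-increasing in mu, so mu_hi is doubled until the selection fits the budget and mu is then bisected to close the gap. Below is a compact sketch of that shape, assuming every tensor has at least one candidate; `lagrange_penalty` and `solve_for_budget` are illustrative rewrites, not the patch's helpers (the real code additionally tracks the best over- and under-budget selections seen).

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct candidate {
        size_t bytes;
        double error;
    };

    // For a fixed penalty mu, each tensor independently picks the candidate that
    // minimises error + mu * bytes; larger mu favours smaller candidates.
    static void lagrange_penalty(const std::vector<std::vector<candidate>> & all, double mu,
                                 std::vector<int> & choice, size_t & bytes, double & err) {
        choice.assign(all.size(), 0);
        bytes = 0;
        err = 0.0;
        for (size_t i = 0; i < all.size(); ++i) {
            int best = 0;
            double best_cost = all[i][0].error + mu * (double)all[i][0].bytes;
            for (int j = 1; j < (int)all[i].size(); ++j) {
                const double cost = all[i][j].error + mu * (double)all[i][j].bytes;
                if (cost < best_cost) {
                    best_cost = cost;
                    best = j;
                }
            }
            choice[i] = best;
            bytes += all[i][best].bytes;
            err += all[i][best].error;
        }
    }

    // Double mu_hi until the selection fits the budget, then bisect [mu_lo, mu_hi];
    // monotonicity of total bytes in mu is what makes the bisection valid.
    static std::vector<int> solve_for_budget(const std::vector<std::vector<candidate>> & all, size_t budget_bytes) {
        std::vector<int> choice;
        size_t bytes = 0;
        double err = 0.0;
        double mu_lo = 0.0;
        double mu_hi = 1e-12;
        for (int expand = 0; expand < 60; ++expand) {  // mirrors the patch's mu_hi *= 2.0 loop
            lagrange_penalty(all, mu_hi, choice, bytes, err);
            if (bytes <= budget_bytes) {
                break;
            }
            mu_hi *= 2.0;
        }
        for (int it = 0; it < 100; ++it) {
            const double mu = 0.5 * (mu_lo + mu_hi);
            lagrange_penalty(all, mu, choice, bytes, err);
            if (bytes > budget_bytes) {
                mu_lo = mu;  // too big: need a stronger penalty
            } else {
                mu_hi = mu;  // fits: try relaxing the penalty
            }
        }
        lagrange_penalty(all, mu_hi, choice, bytes, err);  // final pick from the feasible side
        return choice;
    }

    int main() {
        // Two tensors, each with a cheap/lossy and an expensive/accurate candidate;
        // a 300-byte budget forces one of each.
        const std::vector<std::vector<candidate>> all = {
            { { 100, 0.20 }, { 200, 0.02 } },
            { { 100, 0.10 }, { 200, 0.01 } },
        };
        const std::vector<int> choice = solve_for_budget(all, 300);
        printf("choice: %d %d\n", choice[0], choice[1]);
        return 0;
    }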