From 14fae69a7bb932fadbc5dd62072a254866512650 Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 20 Sep 2025 21:31:31 +0100
Subject: [PATCH] General refactoring

---
 src/llama-quant.cpp | 75 +++++++++++++++++++++++----------------------
 1 file changed, 39 insertions(+), 36 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index c6051a480c..6e5562379c 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -729,19 +729,19 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     auto estimate_error = [&](const ggml_tensor * t,
                               const ggml_type quant_type,
                               const std::vector<float> & f32_sample,
-                              const std::vector<int64_t> & sample_rows_per_slice,
+                              const std::vector<int64_t> & rows_sample,
                               const float * values_sample,
                               const float * activations_sample,
                               std::vector<uint8_t> & quantized_buffer,
                               std::vector<float> & dequantized_buffer,
-                              float bias_lambda,
+                              float tensor_bias_lambda,
+                              const float * slice_bias_lambda,
                               double * out_mse = nullptr,
                               double * out_proj = nullptr) -> double {
         const int64_t n_per_row = t->ne[0];
         const int64_t nrows = t->ne[1];
         const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
-
         const size_t sample_element_count = f32_sample.size();
         const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0;
         if (sample_row_count == 0) {
@@ -753,8 +753,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         size_t expected_rows = 0;
         for (int64_t s = 0; s < ne2; ++s) {
-            expected_rows += (size_t)sample_rows_per_slice[s];
+            expected_rows += (size_t)rows_sample[s];
         }
+
         if (expected_rows != sample_row_count) {
             if (out_mse) { *out_mse = infinity; }
             if (out_proj) { *out_proj = 0.0; }
@@ -783,17 +784,18 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                     const double a = activations[j];
                     denom += w * a * a;
                 }
+
                 bias_denominator_per_slice[s] = denom;
             }
         }
 
-        // Per-row squared norms with weighting
+        // Weighted per-row squared norms
        std::vector<double> row_sq_norm(sample_row_count, 0.0);
         {
             size_t offset = 0;
             size_t row_idx = 0;
             for (int64_t s = 0; s < ne2; ++s) {
-                const int64_t rs = sample_rows_per_slice[s];
+                const int64_t rs = rows_sample[s];
                 if (rs == 0) { continue; }
 
                 const float * values = has_values ? values_sample + s * n_per_row : nullptr;
@@ -823,7 +825,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         size_t q_offset = 0;
         size_t f_offset = 0;
         for (int64_t slice = 0; slice < ne2; ++slice) {
-            const int64_t rs = sample_rows_per_slice[slice];
+            const int64_t rs = rows_sample[slice];
             if (rs == 0) { continue; }
 
             const float * value = has_values ? values_sample + slice * n_per_row : nullptr;
@@ -843,21 +845,19 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         } else {
             for (size_t r = 0; r < sample_row_count; ++r) {
                 uint8_t * src = quantized_buffer.data() + r * row_sz;
-                float * dst = dequantized_buffer.data() + r * (size_t) n_per_row;
+                float * dst = dequantized_buffer.data() + r * (size_t)n_per_row;
                 if (is_fp16) {
-                    ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int) n_per_row);
-                }
-                else if (is_bf16) {
-                    ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int) n_per_row);
-                }
-                else {
+                    ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row);
+                } else if (is_bf16) {
+                    ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row);
+                } else {
                     if (!traits || !traits->to_float) {
                         if (out_mse) { *out_mse = infinity; }
                         if (out_proj) { *out_proj = 0.0; }
                         return infinity;
                     }
-                    traits->to_float(src, dst, (int) n_per_row);
+                    traits->to_float(src, dst, (int)n_per_row);
                 }
             }
         }
 
@@ -1098,20 +1098,20 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
                 offset = dist(rng);
             }
-            for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) {
+            for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) {
                 if (src_type == GGML_TYPE_F32) {
-                    const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row;
+                    const float * src_row = (const float *)tensor->data + slice * (n_per_row * nrows_total) + r * n_per_row;
                     f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row);
                 } else if (src_type == GGML_TYPE_F16) {
-                    const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
+                    const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
                     ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row);
                     f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end());
                 } else if (src_type == GGML_TYPE_BF16) {
-                    const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
+                    const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
                     ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row);
                     f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end());
                 } else if (src_is_quant) {
-                    const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz;
+                    const uint8_t * qrow = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz;
                     if (!src_traits || !src_traits->to_float) {
                         throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type)));
                     }
@@ -1120,9 +1120,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 } else {
                     throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type)));
                 }
+
                 ++current_sampled_rows;
             }
-            sample_rows_per_slice[slice] = current_sampled_rows;
+
+            rows_sample[slice] = current_sampled_rows;
         }
 
         auto side_data = [&](const std::unordered_map<std::string, std::vector<float>> * m, const std::string & tensor_name) -> std::pair<const float *, size_t> {
@@ -1160,7 +1162,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); }
         if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); }
 
-        const int64_t nelem = ggml_nelements(t);
+        const int64_t nelem = ggml_nelements(tensor);
         tensor_info info;
         info.w = tw;
         info.n_elements = nelem;
@@ -1185,8 +1187,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                     __func__, ggml_type_name(ts_type), name.c_str());
                 continue;
             }
-            ggml_type tt = make_compatible(t, ts_type);
-            if (!is_compatible(t, tt)) { continue; }
+
+            ggml_type tt = make_compatible(tensor, ts_type);
+            if (!is_compatible(tensor, tt)) { continue; }
             compatible_candidates.push_back(tt);
             max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row));
         }
@@ -1222,16 +1225,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
             // thread-local scratch
             std::vector<uint8_t> tl_quantized_buffer(quantized_buffer.size());
             std::vector<float> tl_dequantised_buffer(dequantised_buffer.size());
-
             for (;;) {
                 const size_t i = cidx.fetch_add(1, std::memory_order_relaxed);
                 if (i >= compatible_candidates.size()) { break; }
-                const ggml_type tt = compatible_candidates[i];
-                const auto bpw = (float)tensor_bpw(t, tt);
-                const size_t bytes = tensor_bytes(t, tt);
-                const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda);
-                eval_candidates[i] = candidate_types{ tt, bpw, bytes, err };
+                const ggml_type tensor_types = compatible_candidates[i];
+                const auto bpw = (float)tensor_bpw(tensor, tensor_types);
+                const size_t bytes = tensor_bytes(tensor, tensor_types);
+                const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations,
+                                                tl_quantized_buffer, tl_dequantised_buffer, tensor_lambda, slice_lambda);
+                eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err };
             }
         });
     }
@@ -1244,8 +1247,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         if (info.candidate.empty()) {
             // As a last resort, keep original type
-            float bpw = ggml_nbytes(t) * 8.0f / nelem;
-            info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 });
+            float bpw = ggml_nbytes(tensor) * 8.0f / nelem;
+            info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 });
         }
 
         // Keep only the pareto-optimal candidates: if A has >= bytes and >= error than B, drop A.
@@ -1274,6 +1277,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 // same bytes: we already sorted by error; skip
             }
         }
+
         info.candidate.swap(pruned);
     }
 
@@ -1299,6 +1303,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             }
             convex.push_back(p);
         }
+
         info.candidate.swap(convex);
     }
 }
@@ -1312,7 +1317,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     if (all.empty()) {
         return {};
     }
-    // Lagrangian relaxation to minimise error subject to a bpw target constraint
     auto total_bytes = [&]() -> size_t {
         size_t tb = 0;
         for (const auto & ti : all) {
@@ -1359,6 +1363,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         return emit_overrides();
     }
 
+    // Lagrangian relaxation to minimise error subject to a bpw target constraint
     auto lagrange_penalty = [&](const double mu, std::vector<int> & choice, size_t & bytes, double & err) {
         choice.resize(all.size());
         bytes = 0;
@@ -1406,6 +1411,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (bytes_hi <= budget_bytes) {
             break;
         }
+
         mu_hi *= 2.0;
         if (++expand > 60) {
             break;
@@ -1422,11 +1428,9 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
         lagrange_penalty(mu, choice_mid, bytes_mid, err_mid);
         const double gap = std::abs((double)bytes_mid - (double)budget_bytes);
-
         if (bytes_mid > budget_bytes) {
             // Too big, need stronger penalty
             mu_lo = mu;
-
             if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) {
                 best_over_gap = gap;
                 best_over_err = err_mid;
@@ -1435,7 +1439,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         } else {
             // Under budget, good candidate
             mu_hi = mu;
-
             if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) {
                 best_under_gap = gap;
                 best_under_err = err_mid;
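
Reviewer note, not part of the patch: the "pareto-optimal candidates" pass above keeps, per tensor, only the quantisation candidates that are not dominated on (bytes, error). A minimal standalone sketch of that idea follows; the `candidate` struct and `pareto_prune` helper are hypothetical stand-ins for the file's `candidate_types` and its in-place pruning loop, not the patch's actual code.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // Stand-in for candidate_types: only the fields the pruning looks at.
    struct candidate {
        size_t bytes;
        double error;
    };

    // Sort by bytes (ties broken by error), then keep a candidate only when it
    // strictly improves the best error seen so far. Anything that costs >= bytes
    // and has >= error than another candidate is dominated and dropped.
    static std::vector<candidate> pareto_prune(std::vector<candidate> c) {
        std::sort(c.begin(), c.end(), [](const candidate & a, const candidate & b) {
            return a.bytes != b.bytes ? a.bytes < b.bytes : a.error < b.error;
        });
        std::vector<candidate> pruned;
        double best_error = std::numeric_limits<double>::infinity();
        for (const auto & x : c) {
            if (x.error < best_error) {
                pruned.push_back(x);
                best_error = x.error;
            }
        }
        return pruned;
    }

    int main() {
        // The 300-byte candidate is dominated: bigger and worse than the 200-byte one.
        for (const auto & x : pareto_prune({ { 100, 0.10 }, { 200, 0.05 }, { 300, 0.07 } })) {
            printf("%zu bytes, error %.2f\n", x.bytes, x.error);
        }
        return 0;
    }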
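
Reviewer note, not part of the patch: the mu loop in the last hunks is the Lagrangian relaxation named in the moved comment: for a fixed penalty mu, each tensor independently picks the candidate minimising error + mu * bytes; total bytes is non-increasing in mu, so mu_hi is doubled until the selection fits the budget and mu is then bisected to close the gap. Below is a compact sketch of that shape, assuming every tensor has at least one candidate; `lagrange_penalty` and `solve_for_budget` are illustrative rewrites, not the patch's helpers (the real code additionally tracks the best over- and under-budget selections seen).

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct candidate {
        size_t bytes;
        double error;
    };

    // For a fixed penalty mu, each tensor independently picks the candidate that
    // minimises error + mu * bytes; larger mu favours smaller candidates.
    static void lagrange_penalty(const std::vector<std::vector<candidate>> & all, double mu,
                                 std::vector<int> & choice, size_t & bytes, double & err) {
        choice.assign(all.size(), 0);
        bytes = 0;
        err = 0.0;
        for (size_t i = 0; i < all.size(); ++i) {
            int best = 0;
            double best_cost = all[i][0].error + mu * (double)all[i][0].bytes;
            for (int j = 1; j < (int)all[i].size(); ++j) {
                const double cost = all[i][j].error + mu * (double)all[i][j].bytes;
                if (cost < best_cost) {
                    best_cost = cost;
                    best = j;
                }
            }
            choice[i] = best;
            bytes += all[i][best].bytes;
            err += all[i][best].error;
        }
    }

    // Double mu_hi until the selection fits the budget, then bisect [mu_lo, mu_hi];
    // monotonicity of total bytes in mu is what makes the bisection valid.
    static std::vector<int> solve_for_budget(const std::vector<std::vector<candidate>> & all, size_t budget_bytes) {
        std::vector<int> choice;
        size_t bytes = 0;
        double err = 0.0;
        double mu_lo = 0.0;
        double mu_hi = 1e-12;
        for (int expand = 0; expand < 60; ++expand) {  // mirrors the patch's mu_hi *= 2.0 loop
            lagrange_penalty(all, mu_hi, choice, bytes, err);
            if (bytes <= budget_bytes) {
                break;
            }
            mu_hi *= 2.0;
        }
        for (int it = 0; it < 100; ++it) {
            const double mu = 0.5 * (mu_lo + mu_hi);
            lagrange_penalty(all, mu, choice, bytes, err);
            if (bytes > budget_bytes) {
                mu_lo = mu;  // too big: need a stronger penalty
            } else {
                mu_hi = mu;  // fits: try relaxing the penalty
            }
        }
        lagrange_penalty(all, mu_hi, choice, bytes, err);  // final pick from the feasible side
        return choice;
    }

    int main() {
        // Two tensors, each with a cheap/lossy and an expensive/accurate candidate;
        // a 300-byte budget forces one of each.
        const std::vector<std::vector<candidate>> all = {
            { { 100, 0.20 }, { 200, 0.02 } },
            { { 100, 0.10 }, { 200, 0.01 } },
        };
        const std::vector<int> choice = solve_for_budget(all, 300);
        printf("choice: %d %d\n", choice[0], choice[1]);
        return 0;
    }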