From 7d85993f268d9fa35bea9178f6acf2d72833dffa Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sat, 13 Sep 2025 08:44:41 +0100
Subject: [PATCH] Minor refactoring

---
 src/llama-quant.cpp | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 1677b242d9..15ea36721e 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -617,7 +617,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 
     struct tensor_info {
         const llama_model_loader::llama_tensor_weight * w = nullptr;
-        std::vector<candidate_types> candidate = {};
+        std::vector<candidate_types> candidate;
         int choice = -1;
         float min_bpw = 0.0;
         float max_bpw = 0.0;
@@ -972,8 +972,8 @@
         }
     };
 
-    push_if(GGML_TYPE_Q4_K);
     push_if(GGML_TYPE_Q3_K);
+    push_if(GGML_TYPE_Q4_K);
     push_if(GGML_TYPE_Q5_K);
     if (probes.empty() && !compatible_candidates.empty()) {
         probes.push_back(compatible_candidates[compatible_candidates.size() / 2]);
@@ -1011,7 +1011,7 @@
         return (float)lambda;
     };
 
-    // Faster to compute but lower precision. Best option for the vast majority of models
+    // Faster to compute but may yield lower precision. Best option for the vast majority of cases
    auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) {
         if (!activations) { return 0.0f; }
 
@@ -1057,12 +1057,10 @@
     const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
 
     // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute
-    constexpr int sample_rows_per_expert = 256;
+    const int sample_rows_per_expert = activations_data ? 512 : 256;
     std::vector<float> f32_sample;
     f32_sample.reserve((size_t)ne2 * (size_t)std::min<int64_t>(nrows_total, sample_rows_per_expert) * (size_t)n_per_row);
 
-    // deterministic sampling seed based on tensor name + fixed constant
-    std::mt19937 rng(std::hash<std::string>{}(name) ^0xeabada55cafed00d);
     std::vector<int64_t> sample_rows_per_slice(ne2, 0);
     const int64_t sample_rows_max = std::max<int64_t>(1, std::min<int64_t>(nrows_total, sample_rows_per_expert));
     const int64_t stride = std::max<int64_t>(1, nrows_total / sample_rows_max);
@@ -1072,6 +1070,7 @@
     const bool src_is_quant = ggml_is_quantized(src_type);
     const size_t src_row_sz = ggml_row_size(src_type, n_per_row);
     for (int64_t slice = 0; slice < ne2; ++slice) {
+        std::mt19937 rng(std::hash<std::string>{}(name) ^ 0xeabada55cafed00d ^ slice);
         int64_t current_sampled_rows = 0;
         int64_t offset = 0;
         if (stride > 1) {
@@ -1084,11 +1083,11 @@
                 const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row;
                 f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row);
             } else if (src_type == GGML_TYPE_F16) {
-                const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
+                const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
                 ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row);
                 f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end());
             } else if (src_type == GGML_TYPE_BF16) {
-                const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
+                const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz);
                 ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row);
                 f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end());
             } else if (src_is_quant) {
@@ -1211,7 +1210,7 @@
             const ggml_type tt = compatible_candidates[i];
             const auto bpw = (float)tensor_bpw(t, tt);
             const size_t bytes = tensor_bytes(t, tt);
-            const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda);
+            const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda);
             eval_candidates[i] = candidate_types{ tt, bpw, bytes, err };
         }
     });
@@ -1240,7 +1239,7 @@
             return a.error < b.error;
         });
 
-        double best_err = std::numeric_limits<double>::infinity();
+        double best_err = infinity;
         size_t last_bytes = std::numeric_limits<size_t>::max();
         for (const auto & c : info.candidate) {
             // Only keep the best error seen so far at strictly larger byte sizes