From 3a3d807fc3aacc01715047bcc893f925f5343c6b Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Fri, 10 Oct 2025 13:10:42 +0100
Subject: [PATCH] Remove bias mode computation

---
 src/llama-quant.cpp | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5388d5a072..7b3e956193 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -769,11 +769,9 @@ static std::unordered_map target_bpw_type(
         std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc);
         if (!ofs) { return; } // best-effort
         const float target_bpw = params->target_bpw;
-        const uint8_t bias_mode = params->no_bias ? 1 : 0;
         ofs.write((const char *)&file_magic, sizeof(file_magic));
         ofs.write((const char *)&model_id, sizeof(model_id));
         ofs.write((const char *)&target_bpw, sizeof(target_bpw));
-        ofs.write((const char *)&bias_mode, sizeof(bias_mode));
         const uint64_t n = all_vec.size();
         ofs.write((const char *)&n, sizeof(n));
         for (const auto & ti : all_vec) {
@@ -814,11 +812,9 @@ static std::unordered_map target_bpw_type(
         uint32_t magic = 0;
         uint64_t id = 0;
         float bpw = 0.0f;
-        uint8_t bias = 0;
         ifs.read((char *)&magic, sizeof(magic));
         ifs.read((char *)&id, sizeof(id));
         ifs.read((char *)&bpw, sizeof(bpw));
-        ifs.read((char *)&bias, sizeof(bias));
         if (magic != file_magic) {
             LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str());
             return out;
@@ -828,9 +824,6 @@ static std::unordered_map target_bpw_type(
         } else if (bpw != params->target_bpw) {
             LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str());
             return out;
-        } else if (bias != (params->no_bias ? 1 : 0)) {
-            LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str());
-            return out;
         } else {
             LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func);
         }
@@ -1319,13 +1312,11 @@ static std::unordered_map target_bpw_type(
         std::vector<float> lambdas;
         const float * values = values_sample.empty() ? nullptr : values_sample.data();
         const float * activations = activations_sample.empty() ? nullptr : activations_sample.data();
-        if (!params->no_bias) {
-            double acc = 0.0;
-            int ns = 0;
-            lambdas = estimate_lambda(values, activations, n_per_row, ne2);
-            for (float l : lambdas) { acc += l; ++ns; }
-            tensor_lambda = ns ? (float)(acc / ns) : 0.0f;
-        }
+        double acc = 0.0;
+        int ns = 0;
+        lambdas = estimate_lambda(values, activations, n_per_row, ne2);
+        for (float l : lambdas) { acc += l; ++ns; }
+        tensor_lambda = ns ? (float)(acc / ns) : 0.0f;

         // Evaluate candidates
         std::vector eval_candidates(compatible_candidates.size());
@@ -1925,11 +1916,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     if (params->target_bpw != -1.0f && !params->only_copy) {
         if (params->imatrix) {
             if (params->activations) {
-                LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ",__func__);
+                LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n",__func__);
             } else {
-                LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__);
+                LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__);
             }
-            LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no alignment bias)" : "alignment bias (default)");
             LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
             bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
         } else {