diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 05486a275f..790003b5c9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -642,11 +642,11 @@ static std::unordered_map target_bpw_type( handler_t prev_int = SIG_DFL; handler_t prev_term = SIG_DFL; signal_scope_guard() { - prev_int = std::signal(SIGINT, signal_handler); + prev_int = std::signal(SIGINT, signal_handler); prev_term = std::signal(SIGTERM, signal_handler); } ~signal_scope_guard() { - std::signal(SIGINT, prev_int); + std::signal(SIGINT, prev_int); std::signal(SIGTERM, prev_term); } } signal_guard; @@ -661,17 +661,17 @@ static std::unordered_map target_bpw_type( double proj = 0.0; }; - // Per‑tensor quantization mix that satisfies a global bpw target - struct tensor_info { + // Tensor quantization type choice + struct type_choice { const llama_model_loader::llama_tensor_weight * w = nullptr; - std::vector candidate; + std::vector candidates; int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; size_t n_elements = 0; }; - // subset of quantization types with the best accuracy/size tradeoff + // Quantization types constexpr ggml_type quant_types[] = { GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, @@ -701,34 +701,31 @@ static std::unordered_map target_bpw_type( const char * func = __func__; // Tensor size in bytes for a given type - auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { - const int64_t n_per_row = t->ne[0]; - const size_t row_sz = ggml_row_size(typ, n_per_row); - return (size_t)ggml_nrows(t) * row_sz; + auto tensor_bytes = [](const ggml_tensor * gt, const ggml_type gq) -> size_t { + return (size_t)ggml_nrows(gt) * ggml_row_size(gq, gt->ne[0]); }; // Tensor bpw for a given type - auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { - const size_t bytes = tensor_bytes(t, typ); - return (double)bytes * 8.0 / (double)ggml_nelements(t); + auto tensor_bpw = [&](const ggml_tensor * gt, const ggml_type gq) -> double { + return (double)tensor_bytes(gt, gq) * 8.0 / (double)ggml_nelements(gt); }; // Check if tensor is compatible with quantization type - auto is_compatible = [](const ggml_tensor * t, const ggml_type typ) -> bool { - const int64_t blck = ggml_blck_size(typ); - return blck <= 1 || (t->ne[0] % blck) == 0; + auto is_compatible = [](const ggml_tensor * gt, const ggml_type gq) -> bool { + const int64_t blck = ggml_blck_size(gq); + return blck <= 1 || gt->ne[0] % blck == 0; }; // Get suitable fallback for type - auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) { return typ; } - const ggml_type fb = fallback_type(typ); - return is_compatible(t, fb) ? fb : GGML_TYPE_F16; + auto make_compatible = [&](const ggml_tensor * gt, const ggml_type gq) -> ggml_type { + if (is_compatible(gt, gq)) { return gq; } + const ggml_type fb = fallback_type(gq); + return is_compatible(gt, fb) ? 
fb : GGML_TYPE_F16; }; // Check if tensor is an IQ type - auto is_iq = [](const enum ggml_type t) { - switch (t) { + auto is_iq = [](const enum ggml_type gt) { + switch (gt) { case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ2_XXS: @@ -745,176 +742,169 @@ static std::unordered_map target_bpw_type( }; // Check if tensor can be quantized - auto can_quantize = [&](const ggml_tensor * t) -> bool { - if (ggml_n_dims(t) < 2) { return false; } // skip 1D tensors - return is_quantizable(ggml_get_name(t), model.arch, params); - }; - - // Saved state per tensor - struct saved_info { - std::vector candidate; - int choice = -1; - float min_bpw = 0.0f; - float max_bpw = 0.0f; - size_t n_elements = 0; + auto can_quantize = [&](const ggml_tensor * gt) -> bool { + if (ggml_n_dims(gt) < 2 || ggml_n_dims(gt) > 3) { return false; } // skip 1D & 4D+ tensors + return is_quantizable(ggml_get_name(gt), model.arch, params); }; // DJB2 hashing algorithm auto djb2_hash = [&](const uint8_t * data, const size_t n) -> uint64_t { uint64_t h = 5381; - for (size_t i = 0; i < n; ++i) { - h = (h << 5) + h + data[i]; - } - return h ? h : arbitrary_magic; + for (size_t i = 0; i < n; ++i) { h = (h << 5) + h + data[i]; } + return h ? h : HASH_MAGIC; }; - // Get model ID from metadata hash - auto metadata_id = [&](const gguf_context * ctx) -> uint64_t { - const size_t sz = gguf_get_meta_size(ctx); + // Model ID from metadata hash + const uint64_t model_id = [&] { + const size_t sz = gguf_get_meta_size(ml.meta.get()); std::vector buf(sz); - gguf_get_meta_data(ctx, buf.data()); + gguf_get_meta_data(ml.meta.get(), buf.data()); return djb2_hash(buf.data(), buf.size()); - }; + }(); - std::string gen_name; std::string checkpoint_file; - char hex[17]; - const uint64_t model_id = metadata_id(ml.meta.get()); - std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id); - ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false); - std::replace(gen_name.begin(), gen_name.end(), ' ', '_'); + { + char hex[17]; + std::string name; + std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id); + ml.get_key(LLM_KV_GENERAL_NAME, name, false); + std::replace(name.begin(), name.end(), ' ', '_'); + name.empty() ? checkpoint_file = ml.arch_name : checkpoint_file = name; + checkpoint_file += "-" + std::string(hex) + (valid_wce ? "-wce" : "-mse") + ".bpw_state"; - gen_name.empty() ? 
checkpoint_file = ml.arch_name : checkpoint_file = gen_name; - checkpoint_file += "-" + std::string(hex) + "-mse.bpw_state"; + if (params->state_file) { + const auto * filename = static_cast(params->state_file); + bool is_valid = false; - if (params->state_file) { - const auto * filename = static_cast(params->state_file); - bool is_valid = false; - - if (std::ifstream(filename, std::ios::binary).good()) { - is_valid = true; - } else if (params->save_state) { - std::ofstream ofs(filename, std::ios::binary | std::ios::app); - if (ofs.is_open()) { + if (std::ifstream(filename, std::ios::binary).good()) { is_valid = true; - ofs.close(); - std::remove(filename); + } else if (params->save_state) { + std::ofstream ofs(filename, std::ios::binary | std::ios::app); + if (ofs.is_open()) { + is_valid = true; + ofs.close(); + std::remove(filename); + } } - } - if (is_valid) { - checkpoint_file = filename; - } else { - LLAMA_LOG_WARN("%s: '%s' is not a valid state file\n", func, filename); - checkpoint_file.clear(); + if (is_valid) { + checkpoint_file = filename; + } else { + LLAMA_LOG_WARN("%s: '%s' is not a valid state file\n", func, filename); + checkpoint_file.clear(); + } } } - // Serializes vector state to disk - auto save_state = [&](const std::vector & all_vec) { + // Save vector state to disk + auto save_state = [&](const std::vector & all_tensors) { const std::string tmp = checkpoint_file + ".tmp"; std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); if (!ofs) { return; } - ofs.write((const char *)&file_magic, sizeof(file_magic)); - ofs.write((const char *)&model_id, sizeof(model_id)); - const uint64_t n = all_vec.size(); - ofs.write((const char *)&n, sizeof(n)); - for (const auto & ti : all_vec) { - const std::string name = ggml_get_name(ti.w->tensor); + ofs.write((const char *)& file_magic, sizeof(file_magic)); + ofs.write((const char *)& model_id, sizeof(model_id)); + const uint64_t n = all_tensors.size(); + ofs.write((const char *)& n, sizeof(n)); + for (const auto & tn : all_tensors) { + const std::string name = ggml_get_name(tn.w->tensor); const auto len = (uint32_t)name.size(); - ofs.write((const char *)&len, sizeof(len)); + ofs.write((const char *)& len, sizeof(len)); ofs.write(name.data(), len); - const uint64_t cn = ti.candidate.size(); - ofs.write((const char *)&cn, sizeof(cn)); - ofs.write((const char *)&ti.choice, sizeof(ti.choice)); - ofs.write((const char *)&ti.min_bpw, sizeof(ti.min_bpw)); - ofs.write((const char *)&ti.max_bpw, sizeof(ti.max_bpw)); - const uint64_t ne = ti.n_elements; - ofs.write((const char *)&ne, sizeof(ne)); + const uint64_t sz = tn.candidates.size(); + ofs.write((const char *)& sz, sizeof(sz)); + ofs.write((const char *)& tn.choice, sizeof(tn.choice)); + ofs.write((const char *)& tn.min_bpw, sizeof(tn.min_bpw)); + ofs.write((const char *)& tn.max_bpw, sizeof(tn.max_bpw)); + const uint64_t ne = tn.n_elements; + ofs.write((const char *)& ne, sizeof(ne)); - for (const auto & c : ti.candidate) { - const int32_t t = c.type; - const uint64_t b = c.bytes; - ofs.write((const char *)&t, sizeof(t)); - ofs.write((const char *)&c.bpw, sizeof(c.bpw)); - ofs.write((const char *)&b, sizeof(b)); - ofs.write((const char *)&c.error, sizeof(c.error)); + for (const auto & c : tn.candidates) { + const int32_t tp = c.type; + const uint64_t bt = c.bytes; + ofs.write((const char *)& tp, sizeof(tp)); + ofs.write((const char *)& c.bpw, sizeof(c.bpw)); + ofs.write((const char *)& bt, sizeof(bt)); + ofs.write((const char *)& c.error, sizeof(c.error)); } } ofs.close(); 
std::remove(checkpoint_file.c_str()); std::rename(tmp.c_str(), checkpoint_file.c_str()); - LLAMA_LOG_INFO("%s: saved target progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); + LLAMA_LOG_INFO("%s: saved target progress for %lu tensors to %s\n", func, all_tensors.size(), checkpoint_file.c_str()); }; - // Deserializes vector state from disk - auto load_state = [&]() -> std::unordered_map { - std::unordered_map out; + // Load vector state from disk + auto load_state = [&]() -> std::unordered_map { std::ifstream ifs(checkpoint_file, std::ios::binary); - if (!ifs) { return out; } + if (!ifs) { return {}; } uint32_t magic = 0; uint64_t id = 0; - ifs.read((char *)&magic, sizeof(magic)); - ifs.read((char *)&id, sizeof(id)); - if (magic != file_magic) { - LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); - return out; - } + ifs.read((char *)& magic, sizeof(magic)); + ifs.read((char *)& id, sizeof(id)); if (id != model_id) { - LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); - return out; + LLAMA_LOG_WARN("%s: invalid target state file, ignoring\n", func); + return {}; + } + + if (magic != file_magic) { + LLAMA_LOG_WARN("%s: bpw state file mismatch (expected %s, got %s), ignoring\n", + func, file_magic == MSE_MAGIC ? "MSE" : "WCE", magic == MSE_MAGIC ? "MSE" : "WCE"); + return {}; } LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); + std::unordered_map out; uint64_t n = 0; - ifs.read((char *)&n, sizeof(n)); + ifs.read((char *)& n, sizeof(n)); for (uint64_t i = 0; i < n; ++i) { uint32_t len = 0; - ifs.read((char *)&len, sizeof(len)); + ifs.read((char *)& len, sizeof(len)); std::string name(len, '\0'); ifs.read(name.data(), len); - uint64_t cn = 0; - ifs.read((char *)&cn, sizeof(cn)); - - saved_info si; - ifs.read((char *)&si.choice, sizeof(si.choice)); - ifs.read((char *)&si.min_bpw, sizeof(si.min_bpw)); - ifs.read((char *)&si.max_bpw, sizeof(si.max_bpw)); + type_choice si; + uint64_t sz = 0; + ifs.read((char *)& sz, sizeof(sz)); + ifs.read((char *)& si.choice, sizeof(si.choice)); + ifs.read((char *)& si.min_bpw, sizeof(si.min_bpw)); + ifs.read((char *)& si.max_bpw, sizeof(si.max_bpw)); uint64_t ne = 0; - ifs.read((char *)&ne, sizeof(ne)); + ifs.read((char *)& ne, sizeof(ne)); si.n_elements = (size_t)ne; - si.candidate.resize(cn); - for (auto & s : si.candidate) { + si.candidates.resize(sz); + for (auto & cd : si.candidates) { int32_t t = 0; uint64_t b = 0; - ifs.read((char *)&t, sizeof(t)); - s.type = (ggml_type)t; - ifs.read((char *)&s.bpw, sizeof(s.bpw)); - ifs.read((char *)&b, sizeof(b)); - s.bytes = (size_t)b; - ifs.read((char *)&s.error, sizeof(s.error)); + ifs.read((char *)& t, sizeof(t)); + cd.type = (ggml_type)t; + ifs.read((char *)& cd.bpw, sizeof(cd.bpw)); + ifs.read((char *)& b, sizeof(b)); + cd.bytes = (size_t)b; + ifs.read((char *)& cd.error, sizeof(cd.error)); + // Populate mse/wce for consistency, though optimization relies on s.error + if (valid_wce) { cd.wce = cd.error; } + else { cd.mse = cd.error; } } out.emplace(std::move(name), std::move(si)); } - LLAMA_LOG_INFO("%s: loaded target state for %lu tensors from %s\n", func, out.size(), checkpoint_file.c_str()); + LLAMA_LOG_INFO("%s: resuming from %s (data for %lu tensors loaded)\n", func, checkpoint_file.c_str(), out.size()); return out; }; // Check for user interrupt and save progress - auto check_signal_handler = [&](const std::vector & all_vec) { + auto check_signal_handler = [&](const 
std::vector & all_tensors) { if (bpw_stop.load(std::memory_order_relaxed)) { - LLAMA_LOG_INFO("\n%s: saving progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); - save_state(all_vec); - throw std::runtime_error("user interrupted the process"); + LLAMA_LOG_INFO("\n%s: interrupted, saving progress for %lu tensors to %s\n", func, all_tensors.size(), checkpoint_file.c_str()); + save_state(all_tensors); + throw std::runtime_error("user terminated the process"); } }; @@ -1013,7 +1003,7 @@ static std::unordered_map target_bpw_type( } } - // Quantize per slice into quantized_buffer + // Quantize & dequantize row samples { size_t qoff = 0; size_t foff = 0; @@ -1021,129 +1011,104 @@ static std::unordered_map target_bpw_type( const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * v = has_values ? values_sample + s * n_per_row : nullptr; - (void)ggml_quantize_chunk(quant_type, f32_sample.data() + foff, quantized_buffer.data() + qoff, 0, rs, n_per_row, v); + const float * v = has_vals ? values_sample + s * n_per_row : nullptr; + ggml_quantize_chunk(quant_type, f32_sample.data() + foff, quantized_buffer.data() + qoff, 0, rs, n_per_row, v); qoff += row_sz * (size_t)rs; foff += (size_t)rs * (size_t)n_per_row; } - } - // Dequantize into dequantized_buffer - { - if (quant_type == GGML_TYPE_F16) { - for (size_t r = 0; r < sample_rows; ++r) { - auto src = (const ggml_fp16_t *)(quantized_buffer.data() + r * row_sz); - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - ggml_fp16_to_fp32_row(src, dst, (int)n_per_row); - } - } else if (quant_type == GGML_TYPE_BF16) { - for (size_t r = 0; r < sample_rows; ++r) { - auto src = (const ggml_bf16_t *)(quantized_buffer.data() + r * row_sz); - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - ggml_bf16_to_fp32_row(src, dst, (int)n_per_row); - } - } else { - const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (!traits || !traits->to_float) { - if (out_mse) { *out_mse = infinity; } - if (out_proj) { *out_proj = 0.0; } - return infinity; - } - for (size_t r = 0; r < sample_rows; ++r) { - const uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - traits->to_float(src, dst, (int)n_per_row); - } + const ggml_type_traits * traits = ggml_get_type_traits(quant_type); + if (!traits || !traits->to_float) { return qe; } + for (size_t r = 0; r < sample_rows; ++r) { + const void * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + if (quant_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); } + else if (quant_type == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); } + else { traits->to_float(src, dst, (int)n_per_row); } } } - // Compute error per slice with trimmed aggregation + // Helper for trimmed mean auto trimmed_mean = [](std::vector & v) -> double { - const int64_t n = (int64_t)v.size(); + const auto n = v.size(); if (n == 0) { return 0.0; } - double sum = std::accumulate(v.begin(), v.end(), 0.0); - if (n < 50) { return sum / (double)n; } // too few elements to trim - int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 5% (2.5% each side) - std::sort(v.begin(), v.end()); - const auto num = (double)(n - 2 * k); - sum = std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); - return sum / std::max(1.0, num); + if (n < 50) { return 
std::accumulate(v.begin(), v.end(), 0.0) / (double)n; } + const auto k = (size_t)((double)n * 0.01); // trim 1% from each end + std::nth_element(v.begin(), v.begin() + k, v.end()); + std::nth_element(v.begin() + k, v.end() - k, v.end()); + return std::accumulate(v.begin() + k, v.end() - k, 0.0) / std::max(1.0, (double)(n - 2 * k)); }; + // Compute Error Metrics: Weighted MSE Optimization - Default size_t off = 0; - size_t ridx = 0; - double total_mse = 0.0; + size_t row_idx = 0; + double total_wmse = 0.0; double total_proj = 0.0; double total_bias = 0.0; + for (int64_t s = 0; s < ne2; ++s) { const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * v = has_values ? values_sample + s * n_per_row : nullptr; - const float * a = has_activations ? activations_sample + s * n_per_row : nullptr; - const double denom_bias = has_activations ? bias_denom[s] : 0.0; - std::vector row_mse_norm; - row_mse_norm.reserve(rs); - std::vector row_proj_norm; - if (a) { row_proj_norm.reserve(rs); } + const float * val = has_vals ? values_sample + s * n_per_row : nullptr; + const float * act = has_acts ? activations_sample + s * n_per_row : nullptr; + const double denom_bias = has_acts ? (* ptr_bias_denom)[s] : 0.0; - for (int64_t r = 0; r < rs; ++r, ++ridx) { + std::vector slice_mse_norm; + slice_mse_norm.reserve(rs); + std::vector slice_proj_norm; + if (act) { slice_proj_norm.reserve(rs); } + + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + off; const float * y = dequantized_buffer.data() + off; - double w_mse = 0.0; + double w_err = 0.0; double bias_num = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { - const double wj = v ? std::max(0.0f, v[j]) : 1.0; + const double w = val ? std::max(0.0f, val[j]) : 1.0; const double e = y[j] - x[j]; - w_mse += wj * e * e; - if (a) { bias_num += wj * e * a[j]; } + w_err += w * e * e; + if (act) { bias_num += w * e * act[j]; } } - const double denom_x = row_sq_norm[ridx]; - const double m_norm = w_mse / (denom_x + epsilon); - row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); + const double m_norm = w_err / ((* ptr_row_sq_norm)[row_idx] + EPSILON); + slice_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : INFINITE); - if (a) { + if (act) { double p_norm = 0.0; if (denom_bias > 0.0) { - const double proj = bias_num * bias_num / (denom_bias + epsilon); + const double proj = bias_num * bias_num / (denom_bias + EPSILON); p_norm = std::isfinite(proj) ? proj : 0.0; } - - row_proj_norm.push_back(p_norm); + slice_proj_norm.push_back(p_norm); } off += (size_t)n_per_row; } - const double slice_mse = trimmed_mean(row_mse_norm) * (double)nrows; - const double slice_proj = a ? trimmed_mean(row_proj_norm) * (double)nrows : 0.0; + const int64_t nrows = t->ne[1]; + const double slice_mean_mse = trimmed_mean(slice_mse_norm) * (double)nrows; + const double slice_mean_proj = act ? trimmed_mean(slice_proj_norm) * (double)nrows : 0.0; - total_mse += slice_mse; - total_proj += slice_proj; + total_wmse += slice_mean_mse; + total_proj += slice_mean_proj; - const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[s]) : (double)tensor_bias_lambda; - total_bias += bl * slice_proj; - - if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) { - if (out_mse) { *out_mse = infinity; } - if (out_proj) { *out_proj = 0.0; } - return infinity; - } + const double lambda = slice_bias ? 
(double)std::max(0.0f, slice_bias[s]) : (double)tensor_bias; + total_bias += lambda * slice_mean_proj; } - if (out_mse) { *out_mse = total_mse; } - if (out_proj) { *out_proj = total_proj; } - - const double total_err = total_mse + total_bias; - return std::isfinite(total_err) ? total_err : infinity; + qe.mse = total_wmse; + qe.proj = total_proj; + qe.error = total_wmse + total_bias; + return qe; }; - // Returns lambda per slice or 0.0 if no activations + // Lambda per slice or 0.0 if no activations auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + if (!activations) { return {}; } const int64_t ns = std::max(1, ne2); std::vector lambdas(ns, 0.0f); - if (!activations) { return lambdas; } for (int64_t s = 0; s < ns; ++s) { const float * v = values ? values + s * n_per_row : nullptr; @@ -1152,51 +1117,45 @@ static std::unordered_map target_bpw_type( double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { const double w = v ? std::max(0.0f, v[j]) : 1.0; - const double aw = std::sqrt(w) * a[j]; - const double z = aw * aw; - s1 += z; - s2 += z * z; + const double aw = std::sqrt(w) * a[j]; // z = w * a^2 + s1 += aw * aw; + s2 += aw * aw * aw * aw; } - float l = 0.0f; if (s1 > 0.0) { - const auto n = (double)n_per_row; - const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 16.0); + const double c = std::max(0.0, s2 / (s1 * s1 + EPSILON) - 1.0 / (double)n_per_row); + lambdas[s] = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 16.0); } - - lambdas[(size_t)s] = l; } return lambdas; }; - std::unordered_map bpw_data; + std::unordered_map bpw_data; if (params->state_file && !checkpoint_file.empty()) { bpw_data = load_state(); } // Parallelize tensor processing (courtesy of https://github.com/ddh0) - auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, + auto process_tensor = [&]( + const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, - std::mutex & log_mutex) -> std::optional + std::mutex & log_mutex + ) -> std::optional { ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); - if (bpw_stop.load(std::memory_order_relaxed)) { - return std::nullopt; - } + if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } - // check for pre-computed results from a checkpoint file. - auto it_saved = bpw_data.find(name); - if (it_saved != bpw_data.end()) { - tensor_info info; - info.w = tw; - info.candidate = it_saved->second.candidate; - info.choice = it_saved->second.choice; - info.min_bpw = it_saved->second.min_bpw; - info.max_bpw = it_saved->second.max_bpw; - info.n_elements = it_saved->second.n_elements ? it_saved->second.n_elements : (size_t)ggml_nelements(tensor); - return info; + // Check cache + if (auto tn = bpw_data.find(name); tn != bpw_data.end()) { + type_choice tc; + tc.w = tw; + tc.candidates = tn->second.candidates; + tc.choice = tn->second.choice; + tc.min_bpw = tn->second.min_bpw; + tc.max_bpw = tn->second.max_bpw; + tc.n_elements = tn->second.n_elements ? 
tn->second.n_elements : (size_t)ggml_nelements(tensor); + return tc; } { std::lock_guard lock(log_mutex); @@ -1212,358 +1171,284 @@ static std::unordered_map target_bpw_type( ml.load_data_for(tensor); } - // Dequantize sampled rows into f32_sample + // Sampling const int64_t n_per_row = tensor->ne[0]; const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; // Compute rows based on tensor shape and slice count - auto sample_rows = [](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) -> int64_t { - const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024; - const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large tensors - const double slice_budget = tensor_budget * scale_rows / std::max(1, n2); - const int64_t min_rows = has_acts ? 128 : 64; - constexpr int64_t max_rows = 4096; // row limit to avoid excessive memory use - int64_t total_rows = std::llround(slice_budget / std::max(1, n)); - total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows))); - if (rows <= min_rows * 2) { total_rows = rows; } - return total_rows; + auto sample_count = [&](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) { + const double k_scale = valid_wce ? 2.0 : 1.0; + const double tensor_budget = (has_acts ? 1.0 : 0.5) * k_scale * 1024.0 * 1024.0; + const double scale = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // more rows for large tensors + const double slice_budget = tensor_budget * scale / std::max(1, n2); + const int64_t min_r = (has_acts ? 512 : 256) * (int64_t)k_scale; + const int64_t max_r = 4096 * (int64_t)k_scale; + int64_t tr = std::llround(slice_budget / std::max(1, n)); + tr = std::max(min_r, std::min(tr, std::min(rows, max_r))); + if (rows <= min_r * 2) { tr = rows; } + return tr; }; - const int64_t rows_sample_per_expert = sample_rows(n_per_row, nrows_total, ne2, activations_data != nullptr); + const int64_t rows_to_sample = sample_count(n_per_row, nrows_total, ne2, activations_data != nullptr); std::vector f32_sample; - f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_to_sample) * (size_t)n_per_row); std::vector rows_sample(ne2, 0); - const ggml_type src_type = tensor->type; - const ggml_type_traits * src_traits = ggml_get_type_traits(src_type); - const bool src_is_quant = ggml_is_quantized(src_type); - const size_t src_row_sz = ggml_row_size(src_type, n_per_row); - // Convert a single row to fp32 - auto row_to_fp32 = [&](const uint8_t * src, float * dst) { - const ggml_type t = src_type; - if (t == GGML_TYPE_F32) { - std::memcpy(dst, src, sizeof(float) * (size_t)n_per_row); - return; - } - if (t == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - return; - } - if (t == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - return; - } - if (src_is_quant) { - GGML_ASSERT(src_traits && src_traits->to_float); - src_traits->to_float(src, dst, (int)n_per_row); - return; - } - - throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(t))); - }; - - // Sample rows randomly per slice + // Populate f32_sample { - f32_sample.clear(); - std::vector row_buffer(n_per_row); - for (int64_t slice = 0; slice < ne2; ++slice) { - std::mt19937 
rng(std::hash{}(name) ^ arbitrary_magic ^ slice); - const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); - const int64_t stride = std::max(1, nrows_total / rows_sample_max); - int64_t offset = 0; - if (stride > 1) { - std::uniform_int_distribution dist(0, stride - 1); - offset = dist(rng); - } + const ggml_type src_type = tensor->type; + const size_t src_row_sz = ggml_row_size(src_type, n_per_row); + const ggml_type_traits * traits = ggml_get_type_traits(src_type); + std::vector row_buf(n_per_row); - int64_t current = 0; - for (int64_t r = offset; r < nrows_total && current < rows_sample_max; r += stride) { - const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + for (int64_t slice = 0; slice < ne2; ++slice) { + std::mt19937 rng(std::hash{}(name) ^ HASH_MAGIC ^ slice); + const int64_t limit = std::max(1, std::min(nrows_total, rows_to_sample)); + const int64_t stride = std::max(1, nrows_total / limit); + int64_t offset = stride > 1 ? std::uniform_int_distribution(0, stride - 1)(rng) : 0; + + int64_t count = 0; + for (int64_t r = offset; r < nrows_total && count < limit; r += stride) { + const uint8_t * src = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (src_type == GGML_TYPE_F32) { - const auto *src_f32 = (const float *)src_row; - f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); + f32_sample.insert(f32_sample.end(), (const float*)src, (const float*)src + n_per_row); + } else if (src_type == GGML_TYPE_F16 || src_type == GGML_TYPE_BF16) { + if (src_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t*)src, row_buf.data(), (int)n_per_row); } + else { ggml_bf16_to_fp32_row((const ggml_bf16_t*)src, row_buf.data(), (int)n_per_row); } + f32_sample.insert(f32_sample.end(), row_buf.begin(), row_buf.end()); + } else if (traits && traits->to_float) { + traits->to_float(src, row_buf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buf.begin(), row_buf.end()); } else { - row_to_fp32(src_row, row_buffer.data()); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + throw std::runtime_error(format("unsupported source type %s for sampling", ggml_type_name(src_type))); } - ++current; + ++count; } - rows_sample[slice] = current; + rows_sample[slice] = count; } } - auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) { - if (!m) { return std::pair{nullptr, 0}; } - - const std::string key = remap_imatrix(tensor_name, mapped); - const auto it = m->find(key); - return it == m->end() ? std::pair{nullptr, 0} : std::pair{ it->second.data(), it->second.size() }; + // Prepare side data + auto get_side_data = [&](const auto * m) { + if (!m) { return std::pair{nullptr, 0}; } + auto it = m->find(remap_imatrix(name, mapped)); + return it != m->end() ? 
std::pair{it->second.data(), it->second.size()} : std::pair{nullptr, 0}; }; - // Copy this row's side data (values and activations), or broadcasts to all slices - auto copy_or_broadcast = [&](const float * src, size_t src_sz, std::vector & dst) { - dst.clear(); - if (!src || src_sz == 0) { return; } + auto [val_ptr, val_sz] = get_side_data(values_data); + auto [act_ptr, act_sz] = get_side_data(activations_data); - const size_t want = (size_t)ne2 * (size_t)n_per_row; - if (src_sz == want) { - dst.assign(src, src + want); - return; - } - if (src_sz == (size_t)n_per_row) { - dst.resize(want); - for (int64_t s = 0; s < ne2; ++s) { - std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); - } - return; - } - - std::lock_guard lock(log_mutex); - LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want); - }; - - const auto [values_all, values_sz] = side_data(values_data, name); - const auto [activations_all, activations_sz] = side_data(activations_data, name); - std::vector values_sample; - std::vector activations_sample; - if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } - if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } - - tensor_info info; - info.w = tw; - info.n_elements = ggml_nelements(tensor); - size_t total_sampled_rows = f32_sample.size() / n_per_row; - - // Build list of candidate types first (compatible ones) - const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; - size_t max_row_sz = 0; - const ggml_type * base_arr = quant_types; - const size_t base_sz = std::size(quant_types); - std::vector compatible_candidates; - compatible_candidates.reserve(base_sz); - - for (size_t i = 0; i < base_sz; ++i) { - ggml_type ts_type = base_arr[i]; - if (is_iq(ts_type) && !has_valid_imatrix) { + std::vector val_vec; + std::vector act_vec; + auto prepare_broadcast = [&](const float* src, size_t sz, std::vector& dst) { + if (!src) { return; } + size_t req = (size_t)ne2 * n_per_row; + if (sz == req) { dst.assign(src, src + req); } + else if (sz == (size_t)n_per_row) { + dst.resize(req); + for (int s = 0; s < ne2; ++s) { std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } + } else { std::lock_guard lock(log_mutex); - LLAMA_LOG_WARN("\t%s: skipping %s for %s, no or mismatched imatrix\n", func, ggml_type_name(ts_type), name.c_str()); - continue; + LLAMA_LOG_WARN("%s: side data mismatch for %s\n", func, name.c_str()); } + }; - ggml_type tt = make_compatible(tensor, ts_type); - if (!is_compatible(tensor, tt)) { continue; } - compatible_candidates.push_back(tt); - max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); + prepare_broadcast(val_ptr, val_sz, val_vec); + prepare_broadcast(act_ptr, act_sz, act_vec); } - std::sort(compatible_candidates.begin(), compatible_candidates.end()); - compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); + // Build candidates + std::vector valid_types; + valid_types.reserve(std::size(quant_types)); + size_t max_row_sz = 0; + const bool valid_matrix = !val_vec.empty(); - // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. 
- // Larger values favours quantisation types that produce smaller bias even if the MSE is slightly bigger + for (auto t : quant_types) { + if (is_iq(t) && !valid_matrix) { continue; } + ggml_type compat = make_compatible(tensor, t); + if (!is_compatible(tensor, compat)) { continue; } + valid_types.push_back(compat); + max_row_sz = std::max(max_row_sz, ggml_row_size(compat, n_per_row)); + } + + std::sort(valid_types.begin(), valid_types.end()); + valid_types.erase(std::unique(valid_types.begin(), valid_types.end()), valid_types.end()); + + // Calculate bias lambda to adjust the trade-off between MSE and systematic bias float tensor_lambda = 0.0f; - std::vector lambdas; - const float * values = values_sample.empty() ? nullptr : values_sample.data(); - const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - double acc = 0.0; - int ns = 0; - lambdas = estimate_lambda(values, activations, n_per_row, ne2); - for (float l : lambdas) { acc += l; ++ns; } - tensor_lambda = ns ? (float)(acc / ns) : 0.0f; + std::vector slice_lambdas = estimate_lambda(val_vec.empty()?nullptr:val_vec.data(), act_vec.empty()?nullptr:act_vec.data(), n_per_row, ne2); + if (!slice_lambdas.empty()) { + double sum = 0; + for(float l : slice_lambdas) { sum += l; } + tensor_lambda = (float)(sum / slice_lambdas.size()); + } // Evaluate candidates - std::vector eval_candidates(compatible_candidates.size()); - std::vector quantized_buffer(max_row_sz * total_sampled_rows); - std::vector dequantized_buffer(f32_sample.size()); - const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data(); - for (size_t i = 0; i < compatible_candidates.size(); ++i) { + std::vector evaluations; + evaluations.reserve(valid_types.size()); + std::vector q_buf; + std::vector dq_buf; + + for (ggml_type vt : valid_types) { if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } + const wce_cache * ptr_ref_wce = valid_wce && !ref_wce.row_sq_norm.empty() ? & ref_wce : nullptr; + const mse_cache * ptr_ref_mse = !valid_wce && !ref_mse.row_sq_norm.empty() ? & ref_mse : nullptr; - const ggml_type tensor_type = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(tensor, tensor_type); - const size_t bytes = tensor_bytes(tensor, tensor_type); - double mse = 0.0; - double proj = 0.0; - const auto err = estimate_error(tensor, tensor_type, f32_sample, rows_sample, values, activations, - quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); - eval_candidates[i] = candidate_types{ tensor_type, bpw, bytes, err, mse, proj }; + quant_error qe = compute_quant_error( + tensor, + vt, + f32_sample, + rows_sample, + val_vec.empty() ? nullptr : val_vec.data(), + act_vec.empty() ? 
nullptr : act_vec.data(), + q_buf, + dq_buf, + tensor_lambda, + slice_lambdas.data(), + ptr_ref_wce, + ptr_ref_mse + ); + + type_scores candidate; + candidate.type = vt; + candidate.bpw = (float)tensor_bpw(tensor, vt); + candidate.bytes = tensor_bytes(tensor, vt); + candidate.error = qe.error; + candidate.mse = qe.mse; + candidate.proj = qe.proj; + candidate.wce = qe.wce; + evaluations.push_back(candidate); } - if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } - - // Check if biasing is needed + // Select final quality metric (MSE or MSE + bias) if not using WCE + type_choice ch; + ch.w = tw; + ch.n_elements = ggml_nelements(tensor); bool bias_needed = false; - if (!lambdas.empty()) { - int min_mse = -1; - int min_bias = -1; - double best_mse = std::numeric_limits::infinity(); - double best_err = std::numeric_limits::infinity(); - for (int i = 0; i < (int)eval_candidates.size(); ++i) { - const auto & c = eval_candidates[i]; + // Determine if bias correction is required + double best_mse = INFINITE; + double max_rel_bias = 0.0; + for (const auto& c : evaluations) { if (c.bytes == 0) { continue; } - if (c.mse < best_mse) { - best_mse = c.mse; - min_mse = i; - } - if (c.error < best_err) { - best_err = c.error; - min_bias = i; - } + best_mse = std::min(best_mse, c.mse); + // Check penalty term contribution (error - mse) + if (c.mse > EPSILON) { max_rel_bias = std::max(max_rel_bias, std::max(0.0, c.error - c.mse) / c.mse); } } - if (min_mse != min_bias) { - bias_needed = true; - } else { - double max_rel_bias = 0.0; - for (const auto & c : eval_candidates) { - if (c.bytes == 0) { continue; } - const double mse = std::max(c.mse, epsilon); - const double bias_term = std::max(0.0, c.error - c.mse); - max_rel_bias = std::max(bias_term / mse, max_rel_bias); - } - - bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE? - } + // If penalty/bias is significant (>= 50% of MSE), use combined error, else pure MSE + bias_needed = max_rel_bias >= 0.5; } - for (auto & c : eval_candidates) { - if (c.bytes == 0) { continue; } - const double final_err = bias_needed ? 
c.error : c.mse; - info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); + for (const auto & ev : evaluations) { + if (ev.bytes == 0) { continue; } + type_scores ts = ev; + // If using WCE, c.error is already set + if (!valid_wce && !bias_needed) { ts.error = ts.mse; } + ch.candidates.push_back(ts); } - if (info.candidate.empty()) { - // As a last resort, keep original type - float bpw = ggml_nbytes(tensor) * 8.0f / info.n_elements; - info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); + // Fallback if empty + if (ch.candidates.empty()) { + type_scores fb; + fb.type = tensor->type; + fb.bytes = ggml_nbytes(tensor); + fb.bpw = fb.bytes * 8.0f / ch.n_elements; + ch.candidates.push_back(fb); } - // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - auto pareto_convex = [&](std::vector & candidates) { - if (candidates.empty()) { return; } - - std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - return a.error < b.error; + // Convex hull & Pareto Front simplification + auto simplify_pareto = [](std::vector & candidates) { + std::sort(candidates.begin(), candidates.end(), [](const auto& a, const auto& b) { + return a.bytes < b.bytes || (a.bytes == b.bytes && a.error < b.error); }); - candidates.erase(std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { - return a.bytes == b.bytes; - }), candidates.end()); - std::vector pareto; - pareto.reserve(candidates.size()); - double best_err = infinity; - for (const auto & c : candidates) { - if (c.error < best_err) { - best_err = c.error; - pareto.push_back(c); + candidates.erase(std::unique(candidates.begin(), candidates.end(), + [](const auto & a, const auto &b) { return a.bytes == b.bytes; }), candidates.end()); + + // Lower envelope + std::vector hull; + double min_err = INFINITE; + for(const auto & c : candidates) { + if (c.error < min_err) { + min_err = c.error; + hull.push_back(c); } } - candidates.swap(pareto); - if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull + candidates = std::move(hull); - // Convex hull (lower envelope) - auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { - const double dx1 = (double)h1.bytes - (double)h0.bytes; - const double dy1 = h1.error - h0.error; - const double dx2 = (double)p.bytes - (double)h0.bytes; - const double dy2 = p.error - h0.error; - return dx1 * dy2 - dx2 * dy1; + // Convex hull + if (candidates.size() < 3) { return; } + std::vector convex; + auto cross = [](const auto& a, const auto& b, const auto& c) { + return ((double)b.bytes - (double)a.bytes) * (c.error - a.error) - ((double)c.bytes - (double)a.bytes) * (b.error - a.error); }; - std::vector hull; hull.reserve(candidates.size()); for (const auto & c : candidates) { - while (hull.size() >= 2) { - if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { - hull.pop_back(); - } else { - break; - } - } - - hull.push_back(c); + while (convex.size() >= 2 && cross(convex[convex.size()-2], convex.back(), c) <= EPSILON) { convex.pop_back(); } + convex.push_back(c); } - candidates.swap(hull); + candidates = std::move(convex); }; - pareto_convex(info.candidate); - - // Initialize choice at the smallest bpw candidate - info.choice = 0; - info.min_bpw = 
info.candidate.front().bpw; - info.max_bpw = info.candidate.back().bpw; - - return info; + simplify_pareto(ch.candidates); + ch.choice = 0; + ch.min_bpw = ch.candidates.front().bpw; + ch.max_bpw = ch.candidates.back().bpw; + return ch; }; - std::vector all; // this vector will be populated by the parallel workers + std::vector all_tensors; // this vector will be populated by the parallel workers { - std::atomic tensor_idx{0}; // shared work queue index for all threads - const size_t tensors_to_process = tensors.size(); - std::mutex loader_mutex; - std::mutex log_mutex; - std::mutex results_mutex; - std::vector workers; - int threads_to_spawn = std::max(1, std::min(nthread, (int)tensors_to_process)); + std::atomic idx{0}; + std::mutex m_load; + std::mutex m_log; + std::mutex m_res; + std::vector threads; + int n_workers = std::max(1, std::min(nthread, (int)tensors.size())); + threads.reserve(n_workers); - for (int i = 0; i < threads_to_spawn; ++i) { - workers.emplace_back([&]() { - std::vector> thread_local_buffer; - while (true) { - const size_t current_idx = tensor_idx.fetch_add(1); - if (current_idx >= tensors_to_process) { break; } - const auto * tw = tensors[current_idx]; - if (!can_quantize(tw->tensor)) { continue; } - // Execute the main processing logic for this tensor - std::optional result_info = process_tensor(tw, thread_local_buffer, loader_mutex, log_mutex); - if (result_info) { - std::lock_guard lock(results_mutex); - all.push_back(std::move(*result_info)); + for (int i = 0; i < n_workers; ++i) { + threads.emplace_back([&](){ + std::vector> buf; + while(true) { + const size_t cur = idx.fetch_add(1); + if (cur >= tensors.size()) { break; } + if (!can_quantize(tensors[cur]->tensor)) { continue; } + + auto res = process_tensor(tensors[cur], buf, m_load, m_log); + if (res) { + std::lock_guard lock(m_res); + all_tensors.push_back(std::move(*res)); } } }); } - for (auto & w : workers) { w.join(); } + for(auto& t : threads) { t.join(); } } - check_signal_handler(all); - if (params->save_state) { save_state(all); } - - if (all.empty()) { return {}; } + check_signal_handler(all_tensors); + if (params->save_state) { save_state(all_tensors); } + if (all_tensors.empty()) { return {}; } // Compute total elements across all tensors and bytes for non-quantizable tensors size_t nq_elements = 0; size_t nq_bytes = 0; for (const auto * it : tensors) { const ggml_tensor * tensor = it->tensor; - const std::string name = ggml_get_name(tensor); nq_elements += (size_t)ggml_nelements(tensor); if (!can_quantize(tensor)) { nq_bytes += ggml_nbytes(tensor); } } - auto total_bytes = [&]() -> size_t { - size_t tb = 0; - for (const auto & ti : all) { - tb += ti.candidate[ti.choice].bytes; - } - - return tb; - }; - - size_t q_elements = 0; - size_t min_bytes = 0; - size_t max_bytes = 0; - for (const auto & ti : all) { - q_elements += (size_t)ti.n_elements; - min_bytes += ti.candidate.front().bytes; // smallest candidate per tensor - max_bytes += ti.candidate.back().bytes; // largest candidate per tensor + size_t min_total_bytes = 0; + size_t max_total_bytes = 0; + for (const auto & tn : all_tensors) { + min_total_bytes += tn.candidates.front().bytes; + max_total_bytes += tn.candidates.back().bytes; } - if (q_elements == 0) { return {}; } - size_t budget_bytes = 0; if (params->target_size != -1) { @@ -1572,207 +1457,196 @@ static std::unordered_map target_bpw_type( int64_t available = (int64_t)params->target_size - (int64_t)metadata_size - (int64_t)nq_bytes; // Clamp to the absolute minimum possible size for 
the variable tensors - if (available < (int64_t)min_bytes) { + if (available < (int64_t)min_total_bytes) { LLAMA_LOG_WARN("%s: requested file size %zu is smaller than minimum possible model size (~%zu), clamping to minimum.\n", - func, (size_t)params->target_size, min_bytes + nq_bytes + metadata_size); - budget_bytes = min_bytes; + func, (size_t)params->target_size, min_total_bytes + nq_bytes + metadata_size); + budget_bytes = min_total_bytes; } else { budget_bytes = (size_t)available; } } else { const double target_bpw = params->target_bpw; size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0); - budget_bytes = target_total_bytes >= nq_bytes ? target_total_bytes - nq_bytes : min_bytes; + budget_bytes = target_total_bytes >= nq_bytes ? target_total_bytes - nq_bytes : min_total_bytes; } // Get the types' override - auto emit_overrides = [&]() -> std::unordered_map { - std::unordered_map overrides; + auto build_mix = [&]() -> std::unordered_map { + std::unordered_map mix; LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func); - for (const auto & ti : all) { + for (const auto & ti : all_tensors) { LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", - func, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); - overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; + func, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidates[ti.choice].type), ti.candidates[ti.choice].bpw, ti.candidates[ti.choice].error); + mix[ggml_get_name(ti.w->tensor)] = ti.candidates[ti.choice].type; } - return overrides; + return mix; }; - if (budget_bytes <= min_bytes) { - for (auto & ti : all) { ti.choice = 0; } - return emit_overrides(); + if (budget_bytes <= min_total_bytes) { + for(auto & tn : all_tensors) { tn.choice = 0; } + return build_mix(); } - if (budget_bytes >= max_bytes) { - for (auto & ti : all) { ti.choice = (int)ti.candidate.size() - 1; } - return emit_overrides(); + if (budget_bytes >= max_total_bytes) { + for(auto & tn : all_tensors) { tn.choice = (int)tn.candidates.size() - 1; } + return build_mix(); } // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - bool important = tensor_name == "output.weight"; - if (!important && !params->ignore_tensor_importance) { - important = tensor_name.find(".attn_v.weight") != std::string::npos || - tensor_name.find(".time_mix_value.weight") != std::string::npos || + bool important = false; + if (params->ignore_tensor_importance) { return important; } + + important = tensor_name == "output.weight" || + tensor_name.find(".attn_output.weight") != std::string::npos || + tensor_name.find(".attn_o.weight") != std::string::npos || + tensor_name.find(".attn_v.weight") != std::string::npos || tensor_name.find(".ffn_down.weight") != std::string::npos || tensor_name.find(".ffn_down_exps.weight") != std::string::npos || - tensor_name.find(".attn_output.weight") != std::string::npos || tensor_name.find(".time_mix_output.weight") != std::string::npos || - tensor_name.find(".attn_o.weight") != std::string::npos; - } + tensor_name.find(".time_mix_value.weight") != std::string::npos; return important; }; - // Lagrangian relaxation to minimize error subject to a bpw target constraint - auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { - choice.resize(all.size()); + // 
Minimize error subject to a size target constraint + auto lagrangian_relaxation = [&](const double mu, std::vector & choices, size_t & bytes, double & cost) { + choices.resize(all_tensors.size()); bytes = 0; - err = 0.0; - for (size_t i = 0; i < all.size(); ++i) { - const auto & candidate = all[i].candidate; - const std::string tensor_name = ggml_get_name(all[i].w->tensor); - double effective_mu = mu; - if (is_important(tensor_name)) { effective_mu *= 0.1; } // important tensors get 10x lower penalty + cost = 0.0; + for (size_t i = 0; i < all_tensors.size(); ++i) { + const auto & tn = all_tensors[i]; + const bool imp = is_important(ggml_get_name(tn.w->tensor)); + const double eff_mu = imp ? mu * 0.1 : mu; // important tensors get 10x lower penalty - int best_j = 0; - double best_val = infinity; - for (int j = 0; j < (int)candidate.size(); ++j) { - const double bits = (double)candidate[j].bytes * 8.0; - const double val = candidate[j].error + effective_mu * bits; - if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && candidate[j].bytes < candidate[best_j].bytes)) { - best_val = val; - best_j = j; + int best = 0; + double min = INFINITE; + + for(int j = 0; j < (int)tn.candidates.size(); ++j) { + double lr = tn.candidates[j].error + eff_mu * (double)tn.candidates[j].bytes * 8.0; + if (lr < min - EPSILON || (std::abs(lr - min) <= EPSILON && tn.candidates[j].bytes < tn.candidates[best].bytes)) { + min = lr; + best = j; } } - choice[i] = best_j; - bytes += candidate[best_j].bytes; - err += candidate[best_j].error; + choices[i] = best; + bytes += tn.candidates[best].bytes; + cost += tn.candidates[best].error; } }; - size_t bytes_lo = 0; - size_t bytes_hi = 0; - size_t bytes_mid = 0; + // Binary search for mu double mu_lo = 0.0; double mu_hi = 1.0; - double err_lo = 0.0; - double err_hi = 0.0; - double err_mid = 0.0; - std::vector choice_lo; - std::vector choice_hi; - std::vector choice_mid; - std::vector best_under_choice; - std::vector best_over_choice; + std::vector ch_lo; + std::vector ch_hi; + std::vector ch_under; + std::vector ch_over; + size_t bt_lo; + size_t bt_hi; + size_t bt_mid; + double dummy; - lagrange_penalty(mu_lo, choice_lo, bytes_lo, err_lo); + lagrangian_relaxation(mu_lo, ch_lo, bt_lo, dummy); + int safety = 0; - // Increase mu until we get under budget or hit a safety cap - { - int expand = 0; - size_t prev_bytes_hi = std::numeric_limits::max(); - while (true) { - lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); - if (bytes_hi <= budget_bytes) { break; } - if (bytes_hi >= prev_bytes_hi) { break; } - prev_bytes_hi = bytes_hi; + do { + lagrangian_relaxation(mu_hi, ch_hi, bt_hi, dummy); + if (bt_hi <= budget_bytes || bt_hi == std::numeric_limits::max()) { break; } + mu_hi *= 2.0; + } while(++safety < 60); - mu_hi *= 2.0; // double the penalty multiplier to reduce tensor sizes - if (++expand > 60) { break; } // safety cap to prevent an infinite loop - } - } + double gap_under = INFINITE; + double gap_over = INFINITE; - double best_under_gap = infinity; - double best_over_gap = infinity; - double best_under_err = infinity; - double best_over_err = infinity; - for (int it = 0; it < 40; ++it) { // binary search iterations for optimal Lagrange multiplier (40 ≈ 1e-12 precision) - double mu = 0.5 * (mu_lo + mu_hi); // midpoint of current bounds - lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); + for(int i = 0; i < 40; ++i) { + double mu = 0.5 * (mu_lo + mu_hi); + std::vector ch_mid; + double cost_mid = 0.0; + lagrangian_relaxation(mu, ch_mid, bt_mid, 
cost_mid); - const double gap = std::abs((double)bytes_mid - (double)budget_bytes); - if (bytes_mid > budget_bytes) { - // Too big, need stronger penalty + double gap = std::abs((double)bt_mid - (double)budget_bytes); + if (bt_mid > budget_bytes) { mu_lo = mu; - if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) { - best_over_gap = gap; - best_over_err = err_mid; - best_over_choice = choice_mid; + if (gap < gap_over) { + gap_over = gap; + ch_over = ch_mid; } } else { - // Under budget, good candidate mu_hi = mu; - if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) { - best_under_gap = gap; - best_under_err = err_mid; - best_under_choice = choice_mid; + if (gap < gap_under) { + gap_under = gap; + ch_under = ch_mid; } } } - if (!best_under_choice.empty()) { - for (size_t i = 0; i < all.size(); ++i) { - all[i].choice = best_under_choice[i]; + if (!ch_under.empty()) { + for(size_t i = 0; i < all_tensors.size(); ++i) { all_tensors[i].choice = ch_under[i]; } + } + else if (!ch_over.empty()) { + for(size_t i = 0; i < all_tensors.size(); ++i) { all_tensors[i].choice = ch_over[i]; } + } + else if (bt_hi <= budget_bytes && !ch_hi.empty()) { + for(size_t i = 0; i < all_tensors.size(); ++i) { all_tensors[i].choice = ch_hi[i]; } + } + else { + for(auto& tn : all_tensors) { tn.choice = 0; } + } + + // Single pass greedy upgrade in case there is budget left + auto current_bytes = [&] { + size_t cb = 0; + for(const auto & tn : all_tensors) { cb += tn.candidates[tn.choice].bytes; } + return cb; + }; + size_t cb = current_bytes(); + + struct tensor_upgrade { + int index; + int next_choice; + double score; + bool operator<(const tensor_upgrade & other) const { + return score < other.score; } - } else if (!best_over_choice.empty()) { - for (size_t i = 0; i < all.size(); ++i) { - all[i].choice = best_over_choice[i]; + }; + + std::priority_queue queue; + + auto push_next = [&](const int i) { + const auto & tn = all_tensors[i]; + int next = tn.choice + 1; + if (next < (int)tn.candidates.size()) { + const double err = std::max(0.0, tn.candidates[tn.choice].error - tn.candidates[next].error); + auto bytes = (double)(tn.candidates[next].bytes - tn.candidates[tn.choice].bytes); + if (bytes > EPSILON) { + double ratio = err / bytes; + if (is_important(ggml_get_name(tn.w->tensor))) { ratio *= 5.0; } // important tensors get 5x boost + queue.push({i, next, ratio}); + } } - } else { - // Pick whichever side we already have, or keep minimal - if (bytes_hi <= budget_bytes && !choice_hi.empty()) { - for (size_t i = 0; i < all.size(); ++i) { - all[i].choice = choice_hi[i]; - } - } else { - for (auto & ti : all) { - ti.choice = 0; - } + }; + + for (size_t i = 0; i < all_tensors.size(); ++i) { push_next((int)i); } + + while (!queue.empty()) { + auto top = queue.top(); + queue.pop(); + + int i = top.index; + int next = top.next_choice; + if (all_tensors[i].choice >= next) { continue; } + + size_t delta_bt = all_tensors[i].candidates[next].bytes - all_tensors[i].candidates[all_tensors[i].choice].bytes; + if (cb + delta_bt <= budget_bytes) { + cb += delta_bt; + all_tensors[i].choice = next; + push_next(i); } } - // Spend any remaining budget with best upgrades that still fit (one pass) - { - auto cur_bytes = total_bytes(); - while (true) { - int best_i = -1; - int best_j = -1; - double best_ratio = -1.0; - double best_gain = -1.0; - - for (int i = 0; i < (int)all.size(); ++i) { - const auto & ti = all[i]; - const 
std::string tensor_name = ggml_get_name(ti.w->tensor); - int j = ti.choice + 1; - if (j >= (int)ti.candidate.size()) { continue; } // no upgrade available - - size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; - if (cur_bytes + delta_bytes > budget_bytes) { continue; } // won't fit in budget - - double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); - if (err_gain < epsilon) { continue; } // no error improvement - - double ratio = err_gain / (double)delta_bytes; // error reduction per byte - if (is_important(tensor_name)) { ratio *= 5.0; } // important tensors get 5x boost - - // For tie-breaking, prioritize the largest absolute error improvement. - if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) { - best_ratio = ratio; - best_gain = err_gain; - best_i = i; - best_j = j; - } - } - - if (best_i < 0) { break; } // no more upgrades within budget found - - size_t upgrade_cost = all[best_i].candidate[best_j].bytes - all[best_i].candidate[all[best_i].choice].bytes; - all[best_i].choice = best_j; - cur_bytes += upgrade_cost; - } - } - - return emit_overrides(); + return build_mix(); } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
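
For reference, a minimal standalone sketch of the nth_element-based trimmed mean this patch introduces in `trimmed_mean` (drop roughly 1% of the smallest and 1% of the largest per-row errors before averaging). The `errs` vector and the outlier value are made up for illustration; only the trimming logic mirrors the diff.

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

static double trimmed_mean(std::vector<double> v) {
    const size_t n = v.size();
    if (n == 0) { return 0.0; }
    if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0) / (double)n; } // too few values to trim
    const size_t k = (size_t)((double)n * 0.01);                // values trimmed from each end
    std::nth_element(v.begin(), v.begin() + k, v.end());        // k smallest partitioned to the front
    std::nth_element(v.begin() + k, v.end() - k, v.end());      // k largest partitioned to the back
    return std::accumulate(v.begin() + k, v.end() - k, 0.0) / (double)(n - 2 * k);
}

int main() {
    std::vector<double> errs(1000, 1.0);
    errs[0] = 1000.0;   // a single outlier barely moves the trimmed mean
    std::printf("trimmed mean: %.4f\n", trimmed_mean(errs));    // prints ~1.0000
    return 0;
}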
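The candidate pruning in `simplify_pareto` keeps, per tensor, only the (bytes, error) points on the Pareto front and then on the lower convex hull, so that every extra byte always buys a diminishing error reduction. Below is a self-contained sketch of that idea; the `point` struct and the toy byte/error numbers are simplified stand-ins for `type_scores`, not the patch's actual data.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

struct point {
    uint64_t bytes = 0;
    double   error = 0.0;
};

static std::vector<point> lower_convex_hull(std::vector<point> pts) {
    if (pts.empty()) { return pts; }

    // Sort by size, then error, and drop duplicate sizes (keeping the lowest error)
    std::sort(pts.begin(), pts.end(), [](const point & a, const point & b) {
        return a.bytes < b.bytes || (a.bytes == b.bytes && a.error < b.error);
    });
    pts.erase(std::unique(pts.begin(), pts.end(),
        [](const point & a, const point & b) { return a.bytes == b.bytes; }), pts.end());

    // Pareto front: keep only points that strictly improve the error as size grows
    std::vector<point> pareto;
    double best = std::numeric_limits<double>::infinity();
    for (const auto & p : pts) {
        if (p.error < best) { best = p.error; pareto.push_back(p); }
    }
    if (pareto.size() < 3) { return pareto; }

    // Lower envelope: pop points that lie above the segment joining their neighbours
    auto cross = [](const point & a, const point & b, const point & c) {
        return ((double)b.bytes - (double)a.bytes) * (c.error - a.error) -
               ((double)c.bytes - (double)a.bytes) * (b.error - a.error);
    };
    std::vector<point> hull;
    for (const auto & p : pareto) {
        while (hull.size() >= 2 && cross(hull[hull.size() - 2], hull.back(), p) <= 0.0) {
            hull.pop_back();
        }
        hull.push_back(p);
    }
    return hull;
}

int main() {
    // Toy candidate list: (bytes, error) pairs for one tensor
    std::vector<point> candidates = {
        {100, 9.0}, {150, 8.9}, {200, 4.0}, {300, 3.5}, {400, 1.0}, {500, 0.9},
    };
    for (const auto & p : lower_convex_hull(candidates)) {
        std::printf("%llu bytes -> error %.2f\n", (unsigned long long)p.bytes, p.error);
    }
    return 0;
}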
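Finally, the budget allocation: `lagrangian_relaxation` lets each tensor independently minimize error + mu * bits for a fixed multiplier mu, the binary search over mu finds the tightest multiplier whose assignment still fits the byte budget, and the priority-queue pass spends any leftover bytes on the upgrades with the best error reduction per byte. The sketch below strips out the importance weighting, checkpointing and logging; `option`, `tensor`, the toy candidate lists and the `budget` value are illustrative stand-ins, not the patch's types.

#include <cstdint>
#include <cstdio>
#include <limits>
#include <queue>
#include <vector>

struct option { uint64_t bytes; double error; };               // one quant type for one tensor
struct tensor { std::vector<option> opts; int choice = 0; };   // opts sorted by bytes, error decreasing

// For a fixed mu, each tensor independently minimizes error + mu * bits; returns total bytes used
static uint64_t assign(std::vector<tensor> & ts, double mu) {
    uint64_t total = 0;
    for (auto & t : ts) {
        int best = 0;
        double best_val = std::numeric_limits<double>::infinity();
        for (int j = 0; j < (int)t.opts.size(); ++j) {
            const double val = t.opts[j].error + mu * 8.0 * (double)t.opts[j].bytes;
            if (val < best_val) { best_val = val; best = j; }
        }
        t.choice = best;
        total += t.opts[best].bytes;
    }
    return total;
}

int main() {
    std::vector<tensor> ts = {
        {{{100, 5.0}, {200, 2.0}, {400, 0.5}}},
        {{{ 50, 3.0}, {120, 1.2}, {300, 0.2}}},
        {{{ 80, 4.0}, {160, 1.5}, {320, 0.4}}},
    };
    const uint64_t budget = 600;

    // Grow mu until the assignment fits (with a safety cap), then binary-search the threshold
    double lo = 0.0, hi = 1.0;
    for (int guard = 0; guard < 60 && assign(ts, hi) > budget; ++guard) { hi *= 2.0; }
    for (int it = 0; it < 40; ++it) {
        const double mid = 0.5 * (lo + hi);
        if (assign(ts, mid) > budget) { lo = mid; } else { hi = mid; }
    }
    uint64_t used = assign(ts, hi); // final feasible assignment

    // Greedy refill: repeatedly take the upgrade with the best error reduction per extra byte
    struct upgrade { int tensor_idx; int next; double score;
        bool operator<(const upgrade & o) const { return score < o.score; } };
    std::priority_queue<upgrade> pq;
    auto push = [&](int i) {
        const auto & t = ts[i];
        const int next = t.choice + 1;
        if (next >= (int)t.opts.size()) { return; }
        const double gain  = t.opts[t.choice].error - t.opts[next].error;
        const double extra = (double)(t.opts[next].bytes - t.opts[t.choice].bytes);
        if (extra > 0.0) { pq.push({i, next, gain / extra}); }
    };
    for (int i = 0; i < (int)ts.size(); ++i) { push(i); }
    while (!pq.empty()) {
        const upgrade u = pq.top(); pq.pop();
        auto & t = ts[u.tensor_idx];
        if (t.choice >= u.next) { continue; }   // stale entry, a better upgrade was already applied
        const uint64_t extra = t.opts[u.next].bytes - t.opts[t.choice].bytes;
        if (used + extra <= budget) { used += extra; t.choice = u.next; push(u.tensor_idx); }
    }

    for (size_t i = 0; i < ts.size(); ++i) {
        std::printf("tensor %zu -> option %d (%llu bytes)\n",
                    i, ts[i].choice, (unsigned long long)ts[i].opts[ts[i].choice].bytes);
    }
    std::printf("used %llu / %llu bytes\n", (unsigned long long)used, (unsigned long long)budget);
    return 0;
}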