Angle-aware/Magnitude-aware (MSE/WCE) hybrid error estimation

This commit is contained in:
Ed Addario 2026-03-12 19:59:26 +00:00
parent 9bb8e17e04
commit 6e68a04797
No known key found for this signature in database
GPG Key ID: E7875815A3230993
1 changed file with 9 additions and 40 deletions

View File

@@ -646,13 +646,12 @@ static void signal_handler(int) { bpw_stop.store(true, std::memory_order_relaxed
// Returns tensor type overrides that meet a global file size or bpw target
static std::unordered_map<std::string, ggml_type> target_bpw_type(
llama_model_loader & ml,
const llama_model & model,
quantize_state_impl & qs,
const std::vector<const llama_model_loader::llama_tensor_weight *> & tensors,
const std::map<int, std::string> & mapped,
const std::unordered_map<std::string, std::vector<float>> * values_data,
const std::unordered_map<std::string, std::vector<float>> * activations_data,
const std::unordered_map<std::string, std::vector<float>> * statistics_data,
const llama_model_quantize_params * params,
int nthread
) {
bpw_stop.store(false, std::memory_order_relaxed);
@@ -743,38 +742,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
return blck <= 1 || gt->ne[0] % blck == 0;
};
auto fallback_type = [](const enum ggml_type new_type) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
return GGML_TYPE_Q4_0; // symmetric-ish fallback
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_IQ4_XS:
return GGML_TYPE_IQ4_NL;
case GGML_TYPE_Q4_K:
return GGML_TYPE_Q5_0;
case GGML_TYPE_Q5_K:
return GGML_TYPE_Q5_1;
case GGML_TYPE_Q6_K:
return GGML_TYPE_Q8_0;
default:
return new_type;
}
};
// Get suitable fallback for type
auto make_compatible = [&](const ggml_tensor * gt, const ggml_type gq) -> ggml_type {
if (is_compatible(gt, gq)) { return gq; }
// const ggml_type fb = tensor_type_fallback(gq);
const ggml_type fb = fallback_type(gq);
const ggml_type fb = tensor_type_fallback(qs, gt, gq);
return is_compatible(gt, fb) ? fb : GGML_TYPE_F16;
};
@@ -799,7 +770,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
// Check if tensor can be quantized
auto can_quantize = [&](const ggml_tensor * gt) -> bool {
if (ggml_n_dims(gt) < 2 || ggml_n_dims(gt) > 3) { return false; } // skip 1D & 4D+ tensors
return tensor_allows_quantization(params, model.arch, gt);
return tensor_allows_quantization(qs.params, qs.model.arch, gt);
};
// DJB2 hashing algorithm
@@ -1009,14 +980,14 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
const bool has_vals = values_sample != nullptr;
const bool has_acts = activations_sample != nullptr;
const bool do_wce = valid_wce && has_acts && has_vals;
const bool use_wce_for_tensor = has_acts && has_vals && is_angle_sensitive(t->name);
// Sampled stats for MSE
std::vector<double> local_row_sq_norm;
const std::vector<double> * ptr_row_sq_norm = nullptr;
// Setup reference stats pointers for MSE
if (!do_wce) {
if (!use_wce_for_tensor) {
if (ref_mse) {
ptr_row_sq_norm = & ref_mse->row_sq_norm;
} else {
@@ -1089,8 +1060,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
return std::accumulate(v.begin() + k, v.end() - k, 0.0) / std::max(1.0, (double)(n - 2 * k));
};
// Weighted Cosine Error (WCE) - Experimental
if (do_wce) {
// Weighted Cosine Error (WCE)
if (use_wce_for_tensor) {
double total_cos_error = 0.0;
size_t off = 0;
size_t sample_idx = 0;
@@ -1113,7 +1084,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
double nx = 0.0;
const bool calc_nx = !cached_norm_x;
// SIMD-friendly loops
if (calc_nx) {
for (int64_t j = 0; j < n_per_row; ++j) {
const double w = std::max(0.0f, v[j]);
@@ -1220,9 +1190,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
return qe;
};
std::unordered_map<std::string, type_choice> bpw_data;
if (params->state_file && !checkpoint_file.empty()) { bpw_data = load_state(); } // ToDo: rethink this condition
if (qs.params->state_file && !checkpoint_file.empty()) { bpw_data = load_state(); } // ToDo: rethink this condition
// Parallelize tensor processing (courtesy of https://github.com/ddh0)
auto process_tensor = [&](
@@ -1270,7 +1239,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
// Compute rows based on tensor shape and slice count
auto sample_count = [&](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) {
const double k_scale = valid_wce ? 2.0 : 1.0;
constexpr double k_scale = 1.0;
const double tensor_budget = (has_acts ? 1.0 : 0.5) * k_scale * 1024.0 * 1024.0;
const double scale = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // more rows for large tensors
const double slice_budget = tensor_budget * scale / std::max<int64_t>(1, n2);