Performance optimisations (AI assisted)

This commit is contained in:
Ed Addario 2026-01-22 10:38:16 +00:00
parent 1c23a6fbd2
commit 2ede173218
No known key found for this signature in database
GPG Key ID: E7875815A3230993
1 changed file with 111 additions and 120 deletions

View File

@ -949,6 +949,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
std::vector<float> & dequantized_buffer,
float tensor_bias,
const float * slice_bias,
float h_norm,
const wce_cache * ref_wce = nullptr,
const mse_cache * ref_mse = nullptr
) -> quant_error
@ -990,10 +991,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
const float * v = has_vals ? values_sample + s * n_per_row : nullptr;
const float * a = activations_sample + s * n_per_row;
double denom = 0.0;
for (int64_t j = 0; j < n_per_row; ++j) {
const double w = v ? std::max(0.0f, v[j]) : 1.0;
const double aj = a[j];
denom += w * aj * aj;
if (v) {
for (int64_t j = 0; j < n_per_row; ++j) { denom += std::max(0.0f, v[j]) * a[j] * a[j]; }
} else {
for (int64_t j = 0; j < n_per_row; ++j) { denom += a[j] * a[j]; }
}
local_bias_denom[s] = denom;
@ -1009,9 +1010,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
for (int64_t r = 0; r < rs; ++r) {
const float * x = f32_sample.data() + off;
double sum = 0.0;
for (int64_t j = 0; j < n_per_row; ++j) {
double xx = x[j];
sum += (v ? std::max(0.0f, v[j]) : 1.0) * xx * xx;
if (v) {
for (int64_t j = 0; j < n_per_row; ++j) { sum += std::max(0.0f, v[j]) * x[j] * x[j]; }
} else {
for (int64_t j = 0; j < n_per_row; ++j) { sum += x[j] * x[j]; }
}
local_row_sq_norm.push_back(sum);
@ -1061,15 +1063,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
// Compute Error Metrics: Entropy-Modulated Weighted Cosine Error (WCE) - Experimental
if (do_wce) {
float h_norm = 1.0f;
if (statistics_data) {
const std::string name = ggml_get_name(t);
const std::string key = remap_imatrix(name, mapped);
if (auto it = statistics_data->find(key); it != statistics_data->end() && !it->second.empty()) {
h_norm = it->second.size() > 3 ? it->second[1] : 1.0f;
}
}
double total_cos_error = 0.0;
size_t off = 0;
size_t sample_idx = 0;
@ -1093,44 +1086,24 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
const bool calc_nx = !cached_norm_x;
// SIMD-friendly loops
if (v) {
if (calc_nx) {
for (int64_t j = 0; j < n_per_row; ++j) {
const double w = std::max(0.0f, v[j]);
const double xj = wx[j];
const double yj = wy[j];
const double yw = yj * w;
dot += xj * yw;
ny += yj * yw;
nx += xj * xj * w;
}
} else {
nx = (* cached_norm_x)[sample_idx];
for (int64_t j = 0; j < n_per_row; ++j) {
const double w = std::max(0.0f, v[j]);
const double yj = wy[j];
const double yw = yj * w;
dot += (double) wx[j] * yw;
ny += yj * yw;
}
if (calc_nx) {
for (int64_t j = 0; j < n_per_row; ++j) {
const double w = std::max(0.0f, v[j]);
const double xj = wx[j];
const double yj = wy[j];
const double yw = yj * w;
dot += xj * yw;
ny += yj * yw;
nx += xj * xj * w;
}
} else {
if (calc_nx) {
for (int64_t j = 0; j < n_per_row; ++j) {
const double xj = wx[j];
const double yj = wy[j];
dot += xj * yj;
ny += yj * yj;
nx += xj * xj;
}
} else {
nx = (* cached_norm_x)[sample_idx];
for (int64_t j = 0; j < n_per_row; ++j) {
const double xj = wx[j];
const double yj = wy[j];
dot += xj * yj;
ny += yj * yj;
}
nx = (* cached_norm_x)[sample_idx];
for (int64_t j = 0; j < n_per_row; ++j) {
const double w = std::max(0.0f, v[j]);
const double yj = wy[j];
const double yw = yj * w;
dot += (double) wx[j] * yw;
ny += yj * yw;
}
}
@ -1184,14 +1157,35 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
double w_err = 0.0;
double bias_num = 0.0;
for (int64_t j = 0; j < n_per_row; ++j) {
const double w = val ? std::max(0.0f, val[j]) : 1.0;
const double e = y[j] - x[j];
w_err += w * e * e;
if (act) { bias_num += w * e * act[j]; }
if (val && act) {
for (int64_t j = 0; j < n_per_row; ++j) {
const double w = std::max(0.0f, val[j]);
const double e = y[j] - x[j];
const double we = w * e;
w_err += we * e;
bias_num += we * act[j];
}
} else if (val) {
for (int64_t j = 0; j < n_per_row; ++j) {
const double w = std::max(0.0f, val[j]);
const double e = y[j] - x[j];
w_err += w * e * e;
}
} else if (act) {
for (int64_t j = 0; j < n_per_row; ++j) {
const double e = y[j] - x[j];
w_err += e * e;
bias_num += e * act[j];
}
} else {
for (int64_t j = 0; j < n_per_row; ++j) {
const double e = y[j] - x[j];
w_err += e * e;
}
}
const double m_norm = w_err / ((* ptr_row_sq_norm)[row_idx] + EPSILON);
const double rsn = (* ptr_row_sq_norm)[row_idx];
const double m_norm = rsn > EPSILON ? w_err / rsn : 0.0;
slice_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : INFINITE);
if (act) {
@ -1319,7 +1313,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
const ggml_type src_type = tensor->type;
const size_t src_row_sz = ggml_row_size(src_type, n_per_row);
const ggml_type_traits * traits = ggml_get_type_traits(src_type);
std::vector<float> row_buf(n_per_row);
for (int64_t slice = 0; slice < ne2; ++slice) {
std::mt19937 rng(std::hash<std::string>{}(name) ^ HASH_MAGIC ^ slice);
@ -1330,18 +1323,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
int64_t count = 0;
for (int64_t r = offset; r < nrows_total && count < limit; r += stride) {
const uint8_t * src = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz;
if (src_type == GGML_TYPE_F32) {
f32_sample.insert(f32_sample.end(), (const float*)src, (const float*)src + n_per_row);
} else if (src_type == GGML_TYPE_F16 || src_type == GGML_TYPE_BF16) {
if (src_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t*)src, row_buf.data(), (int)n_per_row); }
else { ggml_bf16_to_fp32_row((const ggml_bf16_t*)src, row_buf.data(), (int)n_per_row); }
f32_sample.insert(f32_sample.end(), row_buf.begin(), row_buf.end());
} else if (traits && traits->to_float) {
traits->to_float(src, row_buf.data(), (int)n_per_row);
f32_sample.insert(f32_sample.end(), row_buf.begin(), row_buf.end());
} else {
throw std::runtime_error(format("unsupported source type %s for sampling", ggml_type_name(src_type)));
}
size_t cur_sz = f32_sample.size();
f32_sample.resize(cur_sz + n_per_row);
float * dst = f32_sample.data() + cur_sz;
if (src_type == GGML_TYPE_F32) { std::memcpy(dst, src, n_per_row * sizeof(float)); }
else if (src_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t*)src, dst, (int)n_per_row); }
else if (src_type == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((const ggml_bf16_t*)src, dst, (int)n_per_row); }
else if (traits && traits->to_float) { traits->to_float(src, dst, (int)n_per_row); }
else { throw std::runtime_error(format("unsupported source type %s for sampling", ggml_type_name(src_type))); }
++count;
}
@ -1360,6 +1350,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
auto [val_ptr, val_sz] = get_side_data(values_data);
auto [act_ptr, act_sz] = get_side_data(activations_data);
// Cache WCE stats once per tensor to avoid repeated map lookups/regex inside compute_quant_error
float h_norm = 1.0f;
if (valid_wce && statistics_data) {
const std::string key = remap_imatrix(name, mapped);
if (auto it = statistics_data->find(key); it != statistics_data->end() && !it->second.empty()) {
h_norm = it->second.size() > 3 ? it->second[1] : 1.0f;
}
}
std::vector<float> val_vec;
std::vector<float> act_vec;
auto prepare_broadcast = [&](const float* src, size_t sz, std::vector<float>& dst) {
@ -1378,7 +1377,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
prepare_broadcast(val_ptr, val_sz, val_vec);
prepare_broadcast(act_ptr, act_sz, act_vec);
// Precompute WCE reference stats (row_sq_norm) to avoid recalculation per candidate
// Precompute WCE reference stats
wce_cache ref_wce;
mse_cache ref_mse;
size_t total_rows_sampled = 0;
@ -1386,13 +1385,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
if (valid_wce && !val_vec.empty() && !act_vec.empty()) {
ref_wce.row_sq_norm.reserve(total_rows_sampled);
size_t off = 0;
for (int64_t s = 0; s < ne2; ++s) {
const int64_t rs = rows_sample[s];
if (rs == 0) { continue; }
const float * v = val_vec.data() + s * n_per_row;
for (int64_t r = 0; r < rs; ++r) {
const float * wx = f32_sample.data() + off;
double norm_x = 0.0;
@ -1405,43 +1402,45 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
}
}
} else {
// Precompute MSE reference stats (row_sq_norm and bias_denominator) to avoid recalculation per candidate
ref_mse.row_sq_norm.reserve(total_rows_sampled);
ref_mse.bias_denominator.assign(ne2, 0.0);
const bool has_acts = !act_vec.empty();
const bool has_vals = !val_vec.empty();
// Precompute MSE reference stats
ref_mse.row_sq_norm.reserve(total_rows_sampled);
ref_mse.bias_denominator.assign(ne2, 0.0);
const bool has_acts = !act_vec.empty();
const bool has_vals = !val_vec.empty();
// Bias Denominators
if (has_acts) {
for (int64_t s = 0; s < ne2; ++s) {
const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
const float * a = act_vec.data() + s * n_per_row;
double denom = 0.0;
for (int64_t j = 0; j < n_per_row; ++j) {
const double w = v ? std::max(0.0f, v[j]) : 1.0;
const double aj = a[j];
denom += w * aj * aj;
}
ref_mse.bias_denominator[s] = denom;
}
}
if (has_acts) {
for (int64_t s = 0; s < ne2; ++s) {
const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
const float * a = act_vec.data() + s * n_per_row;
double denom = 0.0;
if (v) {
for (int64_t j = 0; j < n_per_row; ++j) { denom += std::max(0.0f, v[j]) * a[j] * a[j]; }
} else {
for (int64_t j = 0; j < n_per_row; ++j) { denom += a[j] * a[j]; }
}
// Row Squared Norms
size_t off = 0;
for (int64_t s = 0; s < ne2; ++s) {
const int64_t rs = rows_sample[s];
const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
for (int64_t r = 0; r < rs; ++r) {
const float * x = f32_sample.data() + off;
double sum = 0.0;
for (int64_t j = 0; j < n_per_row; ++j) {
double xx = x[j];
sum += (v ? std::max(0.0f, v[j]) : 1.0) * xx * xx;
}
ref_mse.row_sq_norm.push_back(sum);
off += (size_t)n_per_row;
}
}
ref_mse.bias_denominator[s] = denom;
}
}
size_t off = 0;
for (int64_t s = 0; s < ne2; ++s) {
const int64_t rs = rows_sample[s];
const float * v = has_vals ? val_vec.data() + s * n_per_row : nullptr;
for (int64_t r = 0; r < rs; ++r) {
const float * x = f32_sample.data() + off;
double sum = 0.0;
if (v) {
for (int64_t j = 0; j < n_per_row; ++j) { sum += std::max(0.0f, v[j]) * x[j] * x[j]; }
}
else {
for (int64_t j = 0; j < n_per_row; ++j) { sum += x[j] * x[j]; }
}
ref_mse.row_sq_norm.push_back(sum);
off += (size_t)n_per_row;
}
}
}
// Build candidates
@ -1461,7 +1460,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
std::sort(valid_types.begin(), valid_types.end());
valid_types.erase(std::unique(valid_types.begin(), valid_types.end()), valid_types.end());
// Calculate bias lambda to adjust the trade-off between MSE and systematic bias
float tensor_lambda = 0.0f;
std::vector<float> slice_lambdas = estimate_lambda(val_vec.empty()?nullptr:val_vec.data(), act_vec.empty()?nullptr:act_vec.data(), n_per_row, ne2);
if (!slice_lambdas.empty()) {
@ -1492,6 +1490,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
dq_buf,
tensor_lambda,
slice_lambdas.data(),
h_norm,
ptr_ref_wce,
ptr_ref_mse
);
@ -1507,35 +1506,29 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
evaluations.push_back(candidate);
}
// Select final quality metric (MSE or MSE + bias) if not using WCE
type_choice ch;
ch.w = tw;
ch.n_elements = ggml_nelements(tensor);
bool bias_needed = false;
if (!valid_wce && !slice_lambdas.empty()) {
// Determine if bias correction is required
double best_mse = INFINITE;
double max_rel_bias = 0.0;
for (const auto& c : evaluations) {
if (c.bytes == 0) { continue; }
best_mse = std::min(best_mse, c.mse);
// Check penalty term contribution (error - mse)
if (c.mse > EPSILON) { max_rel_bias = std::max(max_rel_bias, std::max(0.0, c.error - c.mse) / c.mse); }
}
// If penalty/bias is significant (>= 50% of MSE), use combined error, else pure MSE
bias_needed = max_rel_bias >= 0.5;
}
for (const auto & ev : evaluations) {
if (ev.bytes == 0) { continue; }
type_scores ts = ev;
// If using WCE, c.error is already set
if (!valid_wce && !bias_needed) { ts.error = ts.mse; }
ch.candidates.push_back(ts);
}
// Fallback if empty
if (ch.candidates.empty()) {
type_scores fb;
fb.type = tensor->type;
@ -1544,15 +1537,13 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
ch.candidates.push_back(fb);
}
// Convex hull & Pareto Front simplification
auto simplify_pareto = [](std::vector<type_scores> & candidates) {
std::sort(candidates.begin(), candidates.end(), [](const auto& a, const auto& b) {
return a.bytes < b.bytes || (a.bytes == b.bytes && a.error < b.error);
});
candidates.erase(std::unique(candidates.begin(), candidates.end(),
[](const auto & a, const auto &b) { return a.bytes == b.bytes; }), candidates.end());
[](const auto & a, const auto &b) { return a.bytes == b.bytes; }), candidates.end());
// Lower envelope
std::vector<type_scores> hull;
double min_err = INFINITE;
for(const auto & c : candidates) {
@ -1563,12 +1554,12 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
}
candidates = std::move(hull);
// Convex hull
if (candidates.size() < 3) { return; }
std::vector<type_scores> convex;
// 2D cross product of the vectors (b - a) and (c - a), taking `bytes` as the
// horizontal coordinate and `error` as the vertical one. The sign of the result
// tells on which side of the line a -> b the point c lies; zero means the three
// points are collinear.
auto cross = [](const auto& a, const auto& b, const auto& c) {
    // Sizes are converted to double before subtracting, so the differences
    // stay signed even if the field's type is unsigned.
    const double bytes_ab = (double)b.bytes - (double)a.bytes;
    const double bytes_ac = (double)c.bytes - (double)a.bytes;
    const auto   error_ab = b.error - a.error;
    const auto   error_ac = c.error - a.error;
    return bytes_ab * error_ac - bytes_ac * error_ab;
};
for (const auto & c : candidates) {
while (convex.size() >= 2 && cross(convex[convex.size()-2], convex.back(), c) <= EPSILON) { convex.pop_back(); }
convex.push_back(c);