Optimise tensor sampling

Ed Addario 2025-08-20 20:58:26 +01:00
parent 3f0118d602
commit b0b33b7ccb
1 changed file with 119 additions and 78 deletions


@@ -609,7 +609,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     const std::unordered_map<std::string, std::vector<float>> * activations_data,
     const llama_model_quantize_params * params,
     int nthread,
-    int sample_rows_per_expert = 128,
+    int sample_rows_per_expert = 256,
     float bias_lambda = 1.0
 ) {
     struct candidate_types {
@@ -671,7 +671,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     auto can_quantize = [&](const ggml_tensor * t) -> bool {
         const std::string name = ggml_get_name(t);
         bool q = name.rfind("weight") == name.size() - 6;
-        q &= (ggml_n_dims(t) >= 2);
+        q &= ggml_n_dims(t) >= 2;
         q &= name.find("_norm.weight") == std::string::npos;
         q &= name.find("ffn_gate_inp.weight") == std::string::npos;
         q &= name.find("altup") == std::string::npos;
@@ -734,7 +734,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool {
         const int64_t n_per_row = t->ne[0];
         const int64_t blck = ggml_blck_size(typ);
-        if (blck <= 1) { return true; } // FP16/BF16/Q8_0 etc
+        if (blck <= 1) { return true; }
         return n_per_row % blck == 0;
     };
@@ -742,12 +742,17 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         if (is_compatible(t, typ)) { return typ; }
         ggml_type fb = fallback_type(typ);
         if (is_compatible(t, fb)) { return fb; }
-        return GGML_TYPE_F16; // final guard
+        return GGML_TYPE_F16;
     };

-    // Estimate error for a given type using a sampled subset of rows.
-    // Uses both imatrix (E[a^2]) and activations (E[a]) if available.
-    auto estimate_error = [&](const ggml_tensor * t, const float * f32_data, const ggml_type typ, const float * values_all, const float * activations_all) -> double {
+    // Estimate error for a given type using a sampled subset of rows
+    auto estimate_error = [&](const ggml_tensor * t,
+                              const ggml_type typ,
+                              const std::vector<float> & f32_sample,
+                              const std::vector<int64_t> & sample_rows_per_slice,
+                              const std::vector<float> & values_sample,
+                              const std::vector<float> & activations_sample) -> double
+    {
         const int64_t n_per_row = t->ne[0];
         const int64_t nrows = t->ne[1];
         const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
@@ -758,70 +763,73 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             return 1e35f;
         }

-        // Sampling plan: for each expert slice, take up to sample_rows rows spread uniformly
-        const int64_t rows_per_expert = nrows;
-        const int64_t sample_rows = std::max<int64_t>(1, std::min<int64_t>(rows_per_expert, sample_rows_per_expert));
-        const int64_t stride = std::max<int64_t>(1, rows_per_expert / sample_rows);
-        const size_t row_sz = ggml_row_size(typ, n_per_row);
-        std::vector<uint8_t> qbuf(row_sz * sample_rows);
-        std::vector<float> f32_sample(sample_rows * n_per_row);
-        std::vector<float> deq(sample_rows * n_per_row);
-        double total_err = 0.0;
+        const size_t total_sampled_rows = f32_sample.size() / n_per_row;
+        if (total_sampled_rows == 0) { return 0.0; }
+
+        const size_t qbuf_size = ggml_row_size(typ, n_per_row) * total_sampled_rows;
+        std::vector<uint8_t> qbuf(qbuf_size);
+        std::vector<float> deq(f32_sample.size());
+
+        // Quantize all sampled rows at once and dequantize back
+        size_t qbuf_offset = 0;
+        size_t f32_offset = 0;
         for (int64_t slice = 0; slice < ne2; ++slice) {
-            const float * value = values_all ? (values_all + slice * n_per_row) : nullptr;
-            const float * activation = activations_all ? (activations_all + slice * n_per_row) : nullptr;
-            int64_t rs = 0;
-            for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) {
-                const float * src = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row;
-                std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row);
-                ++rs;
-            }
+            const int64_t rs = sample_rows_per_slice[slice];
             if (rs == 0) { continue; }

-            // Quantize sample rows and dequantize back
-            (void)ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value);
-            traits->to_float(qbuf.data(), deq.data(), rs * n_per_row);
+            const float * value = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row;
+            (void)ggml_quantize_chunk(typ, f32_sample.data() + f32_offset, qbuf.data() + qbuf_offset, 0, rs, n_per_row, value);
+            qbuf_offset += ggml_row_size(typ, n_per_row) * rs;
+            f32_offset += rs * n_per_row;
+        }
+        traits->to_float(qbuf.data(), deq.data(), f32_sample.size());

+        double total_err = 0.0;
+        size_t sample_offset = 0;
+        for (int64_t slice = 0; slice < ne2; ++slice) {
+            const float * value_slice = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row;
+            const float * activation_slice = activations_sample.empty() ? nullptr : activations_sample.data() + slice * n_per_row;
+            const int64_t rs = sample_rows_per_slice[slice];
+            // Compute error proxy per sampled slice
             double slice_err = 0.0;
             for (int64_t s = 0; s < rs; ++s) {
-                const float * xs = f32_sample.data() + s * n_per_row;
-                const float * ys = deq.data() + s * n_per_row;
+                const float * xs = f32_sample.data() + sample_offset;
+                const float * ys = deq.data() + sample_offset;
                 double mse_w = 0.0;
                 double bias_sum = 0.0;
-                if (value) {
+                if (value_slice) {
                     for (int64_t j = 0; j < n_per_row; ++j) {
                         const float e = ys[j] - xs[j];
-                        mse_w += e * e * value[j];
-                        if (activation) { bias_sum += e * activation[j]; }
+                        mse_w += e * e * value_slice[j];
+                        if (activation_slice) { bias_sum += e * activation_slice[j]; }
                     }
                 } else {
                     for (int64_t j = 0; j < n_per_row; ++j) {
                         const float e = ys[j] - xs[j];
                         mse_w += e * e;
-                        if (activation) { bias_sum += e * activation[j]; }
+                        if (activation_slice) { bias_sum += e * activation_slice[j]; }
                     }
                 }

                 // Normalize by n_per_row to get a per-row average scale
                 double row_err = mse_w / std::max<int64_t>(1, n_per_row);
-                if (activation && bias_lambda != 0.0) {
+                if (activation_slice && bias_lambda != 0.0) {
                     // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] )
                     const double bias = std::abs(bias_sum) / std::max<int64_t>(1, n_per_row);
                     row_err += bias_lambda * bias;
                 }
                 slice_err += row_err;
+                sample_offset += n_per_row;
             }

             // Scale the slice contribution by the sampling factor
-            const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs);
+            const double rows_per_expert = (double) nrows;
+            const auto scale_rows = rows_per_expert / std::max(1.0, (double) rs);
             total_err += slice_err * scale_rows;
         }
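
For readers skimming the hunk above: the per-row proxy combines an (optionally imatrix-weighted) squared error with an activation-bias penalty, and each slice is then scaled by the inverse of its sampling fraction. Below is a minimal standalone sketch of that per-row proxy on toy data; the function and variable names are illustrative, not the ones in llama.cpp.

#include <cmath>
#include <cstdio>
#include <vector>

// Per-row proxy: mean weighted squared error plus bias_lambda * |signed error
// projected onto the mean activations| / n. values ~ E[a^2] (imatrix), activations ~ E[a].
static double row_error_proxy(const std::vector<float> & original,
                              const std::vector<float> & dequantized,
                              const std::vector<float> & values,      // may be empty
                              const std::vector<float> & activations, // may be empty
                              double bias_lambda) {
    const size_t n = original.size();
    double mse_w = 0.0, bias_sum = 0.0;
    for (size_t j = 0; j < n; ++j) {
        const double e = dequantized[j] - original[j];
        mse_w += e * e * (values.empty() ? 1.0 : values[j]);
        if (!activations.empty()) { bias_sum += e * activations[j]; }
    }
    double err = mse_w / (double) n;
    if (!activations.empty() && bias_lambda != 0.0) {
        err += bias_lambda * std::fabs(bias_sum) / (double) n;
    }
    return err;
}

int main() {
    std::vector<float> x   = { 0.10f, -0.20f, 0.30f, -0.40f };
    std::vector<float> xq  = { 0.12f, -0.18f, 0.28f, -0.41f }; // pretend quantize/dequantize round trip
    std::vector<float> im  = { 1.0f, 2.0f, 0.5f, 1.5f };        // E[a^2] proxy
    std::vector<float> act = { 0.3f, -0.1f, 0.2f, 0.0f };       // E[a] proxy
    std::printf("proxy = %g\n", row_error_proxy(x, xq, im, act, 1.0));
}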
@@ -858,8 +866,40 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             f32_data = (float *)f32_conv_buf.data();
         }

-        const float * values = get_values(name);
-        const float * activations = get_activations(name);
+        const float * values_all = get_values(name);
+        const float * activations_all = get_activations(name);
+
+        // Sample the tensor rows once, before looping through quantization candidates.
+        const int64_t n_per_row = t->ne[0];
+        const int64_t nrows_total = t->ne[1];
+        const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1;
+        const int64_t rows_per_expert = nrows_total;
+        const int64_t sample_rows_max = std::max<int64_t>(1, std::min<int64_t>(rows_per_expert, sample_rows_per_expert));
+        const int64_t stride = std::max<int64_t>(1, rows_per_expert / sample_rows_max);
+
+        std::vector<float> f32_sample;
+        std::vector<float> values_sample;
+        std::vector<float> activations_sample;
+        std::vector<int64_t> sample_rows_per_slice(ne2);
+
+        for (int64_t slice = 0; slice < ne2; ++slice) {
+            int64_t current_sampled_rows = 0;
+            for (int64_t r = 0; r < rows_per_expert && current_sampled_rows < sample_rows_max; r += stride) {
+                const float * src_row = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row;
+                f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row);
+                current_sampled_rows++;
+            }
+            sample_rows_per_slice[slice] = current_sampled_rows;
+        }
+
+        if (values_all) {
+            values_sample.resize(ne2 * n_per_row);
+            std::memcpy(values_sample.data(), values_all, ne2 * n_per_row * sizeof(float));
+        }
+        if (activations_all) {
+            activations_sample.resize(ne2 * n_per_row);
+            std::memcpy(activations_sample.data(), activations_all, ne2 * n_per_row * sizeof(float));
+        }

         tensor_info info;
         info.w = tw;
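
The block above is the core of the optimisation: rows are gathered once per tensor, with a fixed stride inside each expert slice, and the same sample is then reused for every candidate type instead of being rebuilt inside estimate_error. A small self-contained sketch of that stride-based selection follows; names are hypothetical, not the upstream helpers.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Pick up to max_rows row indices per slice, spread uniformly with a fixed stride.
static std::vector<int64_t> sample_row_indices(int64_t rows_per_slice, int64_t max_rows) {
    const int64_t take   = std::max<int64_t>(1, std::min<int64_t>(rows_per_slice, max_rows));
    const int64_t stride = std::max<int64_t>(1, rows_per_slice / take);
    std::vector<int64_t> idx;
    for (int64_t r = 0; r < rows_per_slice && (int64_t) idx.size() < take; r += stride) {
        idx.push_back(r);
    }
    return idx;
}

int main() {
    for (int64_t r : sample_row_indices(/*rows_per_slice=*/4096, /*max_rows=*/8)) {
        std::printf("%lld ", (long long) r);
    }
    std::printf("\n"); // prints: 0 512 1024 1536 2048 2560 3072 3584
}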
@@ -874,7 +914,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         // Build per-tensor candidate list
         for (ggml_type ts_type : quant_candidates) {
-            if (is_iq(ts_type) && !values) { continue; }
+            if (is_iq(ts_type) && !values_all) { continue; }
             ggml_type tt = make_compatible(t, ts_type);
             if (!is_compatible(t, tt)) { continue; }
@@ -882,9 +922,8 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             auto bpw = (float)tensor_bpw(t, tt);
             size_t bytes = total_bytes(t, tt);

-            // Estimate error
-            auto err = (float)estimate_error(t, f32_data, tt, values, activations);
+            // Estimate error using the pre-sampled data
+            auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values_sample, activations_sample);
             info.candidate.push_back(candidate_types{ tt, bpw, bytes, err });
         }
@@ -976,7 +1015,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         const auto & cand = ti.candidate;
         const auto & cur = cand[ti.choice];
         int j = ti.choice + 1;
-        while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j;
+        while (j < (int)cand.size() && cand[j].bytes == cur.bytes) {
+            ++j;
+        }
         return j < (int)cand.size() ? j : -1;
     };
@@ -987,16 +1029,16 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         const auto & ti = all[i];
         if (ti.choice >= (int)ti.candidate.size() - 1) { continue; }

-        int j = next_distinct_idx(ti);
+        const int j = next_distinct_idx(ti);
         if (j < 0) { continue; }

         const auto & cur = ti.candidate[ti.choice];
         const auto & nxt = ti.candidate[j];

-        size_t delta_bytes = nxt.bytes - cur.bytes;
+        const size_t delta_bytes = nxt.bytes - cur.bytes;
         if (delta_bytes == 0) { continue; }

-        double err = (double)cur.error - (double)nxt.error;
+        double err = cur.error - nxt.error;
         err = std::max(err, 0.0);
         double ratio = err / (double)(delta_bytes * 8ull);
@@ -1014,8 +1056,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         size_t now_bytes = current_total_bytes();
         size_t next_bytes = now_bytes + up.delta_bytes;
         double bpw_next = (double)next_bytes * 8.0 / (double)tw;
-
-        if (bpw_next <= (double)target_bpw + 1e-12) {
+        if (bpw_next <= target_bpw + 1e-12) {
             all[up.idx].choice = up.next;
             bpw_now = bpw_next;
         } else {
@@ -1026,7 +1067,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     // We might still be below target but taking any single upgrade overshoots.
     // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio.
     {
-        double under_gap = (double)target_bpw - bpw_now;
+        double under_gap = target_bpw - bpw_now;
         upgrade best_over{ -1, -1, 0.0, 0, -1.0 };
         double best_over_gap = 1e300;
@@ -1051,7 +1092,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
                 double over_gap = std::abs(bpw_over - (double)target_bpw);
-                double err = (double)cur.error - (double)nxt.error;
+                double err = cur.error - nxt.error;
                 if (err < 0.0) { err = 0.0; }
                 double ratio = err / (double)(delta_bytes * 8ull);
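
The hunks above touch a greedy pass that repeatedly takes the upgrade with the best error reduction per extra bit while staying under the byte budget implied by target_bpw. A toy, self-contained sketch of that idea follows; the structures and numbers are illustrative only, and the final "least overshoot" step shown above is omitted.

#include <algorithm>
#include <cstdio>
#include <vector>

// Each tensor has candidates ordered from smallest to largest; choice indexes the current pick.
struct Candidate { double error; size_t bytes; };
struct Tensor    { std::vector<Candidate> cand; int choice = 0; };

int main() {
    std::vector<Tensor> tensors = {
        { { { 9.0, 100 }, { 4.0, 150 }, { 1.0, 300 } } },
        { { { 5.0, 200 }, { 2.0, 260 } } },
    };
    const size_t budget_bytes = 500; // stand-in for the target-bpw limit

    auto total = [&]() { size_t b = 0; for (auto & t : tensors) b += t.cand[t.choice].bytes; return b; };

    for (;;) {
        int best = -1; double best_ratio = -1.0;
        for (int i = 0; i < (int) tensors.size(); ++i) {
            auto & t = tensors[i];
            if (t.choice + 1 >= (int) t.cand.size()) { continue; }
            const auto & cur = t.cand[t.choice];
            const auto & nxt = t.cand[t.choice + 1];
            const size_t delta = nxt.bytes - cur.bytes;
            if (total() + delta > budget_bytes) { continue; } // would overshoot the budget
            const double ratio = std::max(0.0, cur.error - nxt.error) / (double) (delta * 8);
            if (ratio > best_ratio) { best_ratio = ratio; best = i; }
        }
        if (best < 0) { break; } // no affordable upgrade left
        tensors[best].choice++;
    }
    std::printf("choices: %d %d, bytes: %zu\n", tensors[0].choice, tensors[1].choice, total());
}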