Change tensor importance score logic

This commit is contained in:
Ed Addario 2026-02-20 15:05:46 +00:00
parent 551463e2e8
commit f2a719b14a
No known key found for this signature in database
GPG Key ID: E7875815A3230993
3 changed files with 122 additions and 42 deletions

View File

@ -399,7 +399,7 @@ extern "C" {
int64_t target_size; // target file size in bytes
bool save_state; // keep bpw state file
void * state_file; // pointer to bpw state file
bool ignore_tensor_importance; // allocate target bpw budget equitably across all tensors
float importance_pct; // identify up to pct% of tensors as important
bool use_wce; // optimize for WCE instead of MSE
} llama_model_quantize_params;

View File

@ -587,6 +587,22 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
) {
bpw_stop.store(false, std::memory_order_relaxed);
// Slot positions of the individual metrics inside each statistics_data vector.
// Only the first enumerator is pinned; the rest follow by implicit increment (0..11).
enum {
    ENERGY = 0,
    MEAN,
    ELEMENTS,
    STDDEV,
    SKEWNESS,
    KURTOSIS,
    GAIN,
    H_NORM,
    L2_DIST,
    COSSIM,
    PCC,
    COVAR // last slot, index 11
};
// SIGINT/SIGTERM signal handlers
struct signal_scope_guard {
using handler_t = void (*)(int);
@ -621,6 +637,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
float min_bpw = 0.0;
float max_bpw = 0.0;
size_t n_elements = 0;
bool important = false;
};
// Quantization types
@ -901,7 +918,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
std::vector<float> & dequantized_buffer,
float tensor_bias,
const float * slice_bias,
float h_norm,
const wce_cache * ref_wce = nullptr,
const mse_cache * ref_mse = nullptr
) -> quant_error
@ -1078,8 +1094,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
total_cos_error += slice_sum / (double)rs * (double)nrows;
}
const double penalty = 2.0 - std::clamp((double) h_norm, 0.0, 1.0);
qe.wce = total_cos_error * penalty;
qe.wce = total_cos_error;
qe.error = qe.wce;
return qe;
}
@ -1306,13 +1321,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
auto [act_ptr, act_sz] = get_side_data(activations_data);
// Cache WCE stats once per tensor to avoid repeated map lookups/regex inside compute_quant_error
float h_norm = 1.0f;
if (valid_wce && statistics_data) {
if (auto it = statistics_data->find(remapped_name); it != statistics_data->end() && !it->second.empty()) {
h_norm = it->second.size() > 3 ? it->second[1] : 1.0f;
}
}
std::vector<float> val_storage;
std::vector<float> act_storage;
const float * val_vec_ptr = nullptr;
@ -1440,6 +1448,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
dq_buf.reserve(total_rows_sampled * n_per_row);
}
// Kurtosis-Gain error scaling factor:
//   factor = 1 + log1p(max(0, kurtosis)) * max(1, gain)
// Stays at 1.0 (no scaling) when the tensor has no statistics, the statistics vector is too
// short to hold both metrics, or a metric is not finite.
float scaling_factor = 1.0f;
if (statistics_data) {
    // GAIN is the highest slot read below; a non-empty but shorter vector would be indexed out of bounds
    if (auto it = statistics_data->find(remapped_name); it != statistics_data->end() && it->second.size() > static_cast<size_t>(GAIN)) {
        const auto & ts = it->second;
        // Non-finite inputs fall back to their neutral value so the factor can never become inf/NaN
        const float kurtosis = std::isfinite(ts[KURTOSIS]) ? std::max(0.0f, ts[KURTOSIS]) : 0.0f;
        const float gain     = std::isfinite(ts[GAIN])     ? std::max(1.0f, ts[GAIN])     : 1.0f;
        scaling_factor = 1.0f + std::log1p(kurtosis) * gain;
    }
}
for (ggml_type vt : valid_types) {
if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; }
const wce_cache * ptr_ref_wce = valid_wce && !ref_wce.row_sq_norm.empty() ? & ref_wce : nullptr;
@ -1455,8 +1472,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
q_buf,
dq_buf,
tensor_lambda,
slice_lambdas.data(),
h_norm,
slice_lambdas.empty() ? nullptr : slice_lambdas.data(),
ptr_ref_wce,
ptr_ref_mse
);
@ -1465,7 +1481,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
candidate.type = vt;
candidate.bpw = (float)tensor_bpw(tensor, vt);
candidate.bytes = tensor_bytes(tensor, vt);
candidate.error = qe.error;
candidate.error = qe.error * scaling_factor;
candidate.mse = qe.mse;
candidate.proj = qe.proj;
candidate.wce = qe.wce;
@ -1616,10 +1632,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
auto build_mix = [&]() -> std::unordered_map<std::string, ggml_type> {
std::unordered_map<std::string, ggml_type> mix;
LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func);
for (const auto & ti : all_tensors) {
LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n",
func, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidates[ti.choice].type), ti.candidates[ti.choice].bpw, ti.candidates[ti.choice].error);
mix[ggml_get_name(ti.w->tensor)] = ti.candidates[ti.choice].type;
for (const auto & tn : all_tensors) {
LLAMA_LOG_INFO("\t%s: %45s %s\t%8s, \t%1.4f bpw,\terror: %.4f\n",
func, ggml_get_name(tn.w->tensor), tn.important ? "⬆︎" : "-", ggml_type_name(tn.candidates[tn.choice].type), tn.candidates[tn.choice].bpw,
tn.candidates[tn.choice].error);
mix[ggml_get_name(tn.w->tensor)] = tn.candidates[tn.choice].type;
}
return mix;
@ -1634,23 +1651,62 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
return build_mix();
}
// Importance score of a tensor, computed from its statistics vector as the product of:
//   energy        = log1p(max(0, ENERGY))
//   range         = 1 + max(0, STDDEV)
//   magnitude     = 1 + L2_DIST            (1 when L2_DIST is not finite)
//   alignment     = 1 - COSSIM             (1 when COSSIM is not finite)
//   concentration = 1 - clamp(H_NORM, 0, 100) / 100 + EPSILON   (1 when H_NORM is NaN)
// Returns 0 when fewer than 12 metrics are available or when the product is not finite,
// so callers can safely sort and compare the result.
auto importance_score = [](const std::vector<float> & tstats) -> float {
    if (tstats.size() < 12) { return 0.0f; }

    const float energy    = std::log1p(std::max(0.0f, tstats[ENERGY]));
    const float range     = 1.0f + std::max(0.0f, tstats[STDDEV]);
    const float magnitude = std::isfinite(tstats[L2_DIST]) ? 1.0f + tstats[L2_DIST] : 1.0f;
    const float alignment = std::isfinite(tstats[COSSIM]) ? 1.0f - tstats[COSSIM] : 1.0f;
    // std::clamp hands a NaN straight back, so it has to be filtered before clamping
    const float concentration = std::isnan(tstats[H_NORM])
        ? 1.0f
        : 1.0f - std::clamp(tstats[H_NORM], 0.0f, 100.0f) / 100.0f + EPSILON;

    // An infinite ENERGY or STDDEV can still make the product inf (or NaN via inf * 0)
    const float score = energy * range * magnitude * alignment * concentration;
    return std::isfinite(score) ? score : 0.0f;
};
// Importance score cutoff: tensors scoring at or above the returned value make up roughly
// pct% of the entries in stats.
// Returns NaN when stats is empty, pct is outside [0, 100] (including NaN), or no entry
// produced a finite score.
auto threshold_score = [&](const std::unordered_map<std::string, std::vector<float>> & stats, const float pct) -> float {
    const float no_cutoff = std::numeric_limits<float>::quiet_NaN();

    // Negated range test: a NaN pct fails every comparison and must not reach the index computation
    if (stats.empty() || !(pct >= 0.0f && pct <= 100.0f)) { return no_cutoff; }

    std::vector<float> val;
    val.reserve(stats.size());
    for (const auto & ts : stats) {
        const float score = importance_score(ts.second);
        // nth_element needs a strict weak ordering, which NaN values would break
        if (std::isfinite(score)) { val.push_back(score); }
    }
    if (val.empty()) { return no_cutoff; }

    // Position of the (100 - pct) percentile in ascending order
    size_t idx = static_cast<size_t>(std::round((1.0f - pct / 100.0f) * (val.size() - 1)));
    if (idx >= val.size()) { idx = val.size() - 1; }

    std::nth_element(val.begin(), val.begin() + idx, val.end());
    return val[idx];
};
float cutoff = std::numeric_limits<float>::quiet_NaN();
if (statistics_data && !statistics_data->empty()) { cutoff = threshold_score(* statistics_data, params->importance_pct); }
LLAMA_LOG_INFO("%s: - importance score cutoff: %1.4f\n", func, cutoff);
// Certain tensors have a higher impact on model quality, so we apply a lower penalty to them
auto is_important = [&](const std::string & tensor_name) -> bool {
bool important = false;
if (params->ignore_tensor_importance) { return important; }
if (tensor_name == "output.weight") { return true; }
if (params->importance_pct == 0.0f) { return false; }
if (std::isfinite(cutoff)) {
if (auto it = statistics_data->find(remap_imatrix(tensor_name, mapped)); it != statistics_data->end() && !it->second.empty()) {
return importance_score(it->second) >= cutoff;
}
} else {
return tensor_name.find(".attn_output.weight") != std::string::npos ||
tensor_name.find(".attn_o.weight") != std::string::npos ||
tensor_name.find(".attn_v.weight") != std::string::npos ||
tensor_name.find(".ffn_down.weight") != std::string::npos ||
tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
tensor_name.find(".time_mix_output.weight") != std::string::npos ||
tensor_name.find(".time_mix_value.weight") != std::string::npos;
}
important = tensor_name == "output.weight" ||
tensor_name.find(".attn_output.weight") != std::string::npos ||
tensor_name.find(".attn_o.weight") != std::string::npos ||
tensor_name.find(".attn_v.weight") != std::string::npos ||
tensor_name.find(".ffn_down.weight") != std::string::npos ||
tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
tensor_name.find(".time_mix_output.weight") != std::string::npos ||
tensor_name.find(".time_mix_value.weight") != std::string::npos;
return important;
return false;
};
// Determine tensor importance
for (auto & tn : all_tensors) { tn.important = is_important(ggml_get_name(tn.w->tensor)); }
// Minimize error subject to a size target constraint
auto lagrangian_relaxation = [&](const double mu, std::vector<int> & choices, size_t & bytes, double & cost) {
choices.resize(all_tensors.size());
@ -1658,8 +1714,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
cost = 0.0;
for (size_t i = 0; i < all_tensors.size(); ++i) {
const auto & tn = all_tensors[i];
const bool imp = is_important(ggml_get_name(tn.w->tensor));
const double eff_mu = imp ? mu * 0.1 : mu; // important tensors get 10x lower penalty
const double eff_mu = tn.important ? mu / penalty : mu; // important tensors get a lower penalty
int best = 0;
double min = INFINITE;
@ -1764,7 +1819,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
auto bytes = (double)(tn.candidates[next].bytes - tn.candidates[tn.choice].bytes);
if (bytes > EPSILON) {
double ratio = err / bytes;
if (is_important(ggml_get_name(tn.w->tensor))) { ratio *= 5.0; } // important tensors get 5x boost
if (tn.important) { ratio *= penalty; } // important tensors get a higher priority
queue.push({i, next, ratio});
}
}
@ -2051,10 +2106,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
if (params->statistics) {
LLAMA_LOG_INFO("%s: imatrix has statistics\n", __func__);
}
if (params->ignore_tensor_importance) {
LLAMA_LOG_INFO("%s: distributing budget equitably across all tensors\n", __func__);
} else {
LLAMA_LOG_INFO("%s: assigning more budget to important tensors\n", __func__);
if (params->importance_pct != 0.0f) {
LLAMA_LOG_INFO("%s: marking up to %.2f%% of tensors as important\n", __func__, params->importance_pct);
}
if (params->use_wce) {
LLAMA_LOG_INFO("%s: using experimental Weighted Cosine Error (WCE) optimization\n", __func__);
@ -2426,7 +2479,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
/*.target_size =*/ -1,
/*.save_state =*/ false,
/*.state_file =*/ nullptr,
/*.ignore_tensor_importance =*/ false,
/*.importance_pct =*/ 0.0f,
/*.use_wce =*/ false
};

View File

@ -139,8 +139,8 @@ static void usage(const char * executable) {
printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
printf(" --target-size N[unit]: target a file size. N must be a positive number with an optional unit (b, kb, mb, gb, tb)\n");
printf(" Advanced option to automatically select quantization types to achieve a target file size\n");
printf(" --ignore-tensor-importance: distribute bpw budget equitably across all tensors\n");
printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n");
printf(" --importance-pct N: mark up to N%% of tensors as important. N must be a positive number between 0.0 and 100.0\n");
printf(" Advanced option to select up to N%% of important tensors to keep at a higher precision. It may increase quality for some models\n");
printf(" --save-state: save the bpw / file size computations to <model name>-<model hash>-mse.bpw_state\n");
printf(" --state-file file_name: file name to use instead of default\n");
printf(" --keep-split: will generate quantized model in the same shards as input\n");
@ -557,6 +557,27 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
return true;
}
// Parse the value given to --importance-pct.
//   data           - argument text; may be null
//   importance_pct - receives the parsed percentage; written only when parsing succeeds
// Returns true when data is a number in [0.0, 100.0]. Otherwise prints a diagnostic,
// leaves importance_pct untouched and returns false.
static bool parse_importance_pct(const char * data, float & importance_pct) {
    if (!data) {
        printf("\n%s: no tensor importance %% provided\n\n", __func__);
        return false;
    }

    float value = 0.0f;
    try {
        value = std::stof(data);
    } catch (const std::exception &) {
        // std::stof throws std::invalid_argument / std::out_of_range
        printf("\n%s: '%s' is not valid. Tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__, data);
        return false;
    }

    // Negated range test: std::stof accepts "nan", and NaN fails every comparison,
    // so a plain `< 0 || > 100` check would let it through
    if (!(value >= 0.0f && value <= 100.0f)) {
        printf("\n%s: tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__);
        return false;
    }

    importance_pct = value;
    return true;
}
static bool parse_target_size(const char * data, int64_t & target_size) {
if (!data) {
printf("\n%s: no target file size provided\n\n", __func__);
@ -633,6 +654,7 @@ int main(int argc, char ** argv) {
std::vector<int> prune_layers;
float target_bpw = -1.0f;
int64_t target_size = -1;
float importance_pct = 0.0f;
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@ -673,8 +695,10 @@ int main(int argc, char ** argv) {
}
} else if (strcmp(argv[arg_idx], "--use-wce") == 0) {
params.use_wce = true;
} else if (strcmp(argv[arg_idx], "--ignore-tensor-importance") == 0) {
params.ignore_tensor_importance = true;
} else if (strcmp(argv[arg_idx], "--importance-pct") == 0) {
if (arg_idx == argc-1 || !parse_importance_pct(argv[++arg_idx], importance_pct)) {
usage(argv[0]);
}
} else if (strcmp(argv[arg_idx], "--save-state") == 0) {
params.save_state = true;
} else if (strcmp(argv[arg_idx], "--state-file") == 0) {
@ -792,6 +816,9 @@ int main(int argc, char ** argv) {
if (target_size != -1) {
params.target_size = target_size;
}
if (importance_pct != 0.0f) {
params.importance_pct = importance_pct;
}
llama_backend_init();