Change tensor importance score logic
This commit is contained in:
parent
551463e2e8
commit
f2a719b14a
|
|
@ -399,7 +399,7 @@ extern "C" {
|
|||
int64_t target_size; // target file size in bytes
|
||||
bool save_state; // keep bpw state file
|
||||
void * state_file; // pointer to bpw state file
|
||||
bool ignore_tensor_importance; // allocate target bpw budget equitably across all tensors
|
||||
float importance_pct; // identify up to pct% of tensors as important
|
||||
bool use_wce; // optimize for WCE instead of MSE
|
||||
} llama_model_quantize_params;
|
||||
|
||||
|
|
|
|||
|
|
@ -587,6 +587,22 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
) {
|
||||
bpw_stop.store(false, std::memory_order_relaxed);
|
||||
|
||||
// Vector indices for statistics_data's metrics
|
||||
enum {
|
||||
ENERGY = 0,
|
||||
MEAN = 1,
|
||||
ELEMENTS = 2,
|
||||
STDDEV = 3,
|
||||
SKEWNESS = 4,
|
||||
KURTOSIS = 5,
|
||||
GAIN = 6,
|
||||
H_NORM = 7,
|
||||
L2_DIST = 8,
|
||||
COSSIM = 9,
|
||||
PCC = 10,
|
||||
COVAR = 11
|
||||
};
|
||||
|
||||
// SIGINT/SIGTERM signal handlers
|
||||
struct signal_scope_guard {
|
||||
using handler_t = void (*)(int);
|
||||
|
|
@ -621,6 +637,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
float min_bpw = 0.0;
|
||||
float max_bpw = 0.0;
|
||||
size_t n_elements = 0;
|
||||
bool important = false;
|
||||
};
|
||||
|
||||
// Quantization types
|
||||
|
|
@ -901,7 +918,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
std::vector<float> & dequantized_buffer,
|
||||
float tensor_bias,
|
||||
const float * slice_bias,
|
||||
float h_norm,
|
||||
const wce_cache * ref_wce = nullptr,
|
||||
const mse_cache * ref_mse = nullptr
|
||||
) -> quant_error
|
||||
|
|
@ -1078,8 +1094,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
total_cos_error += slice_sum / (double)rs * (double)nrows;
|
||||
}
|
||||
|
||||
const double penalty = 2.0 - std::clamp((double) h_norm, 0.0, 1.0);
|
||||
qe.wce = total_cos_error * penalty;
|
||||
qe.wce = total_cos_error;
|
||||
qe.error = qe.wce;
|
||||
return qe;
|
||||
}
|
||||
|
|
@ -1306,13 +1321,6 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
auto [act_ptr, act_sz] = get_side_data(activations_data);
|
||||
|
||||
// Cache WCE stats once per tensor to avoid repeated map lookups/regex inside compute_quant_error
|
||||
float h_norm = 1.0f;
|
||||
if (valid_wce && statistics_data) {
|
||||
if (auto it = statistics_data->find(remapped_name); it != statistics_data->end() && !it->second.empty()) {
|
||||
h_norm = it->second.size() > 3 ? it->second[1] : 1.0f;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<float> val_storage;
|
||||
std::vector<float> act_storage;
|
||||
const float * val_vec_ptr = nullptr;
|
||||
|
|
@ -1440,6 +1448,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
dq_buf.reserve(total_rows_sampled * n_per_row);
|
||||
}
|
||||
|
||||
// Kurtosis-Gain error scaling factor
|
||||
float scaling_factor = 1.0f;
|
||||
if (statistics_data) {
|
||||
if (auto it = statistics_data->find(remapped_name); it != statistics_data->end() && !it->second.empty()) {
|
||||
const auto & ts = it->second;
|
||||
scaling_factor = 1.0f + std::log1p(std::max(0.0f, ts[KURTOSIS])) * std::max(1.0f, std::isnan(ts[GAIN]) ? 1.0f : ts[GAIN]);
|
||||
}
|
||||
}
|
||||
|
||||
for (ggml_type vt : valid_types) {
|
||||
if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; }
|
||||
const wce_cache * ptr_ref_wce = valid_wce && !ref_wce.row_sq_norm.empty() ? & ref_wce : nullptr;
|
||||
|
|
@ -1455,8 +1472,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
q_buf,
|
||||
dq_buf,
|
||||
tensor_lambda,
|
||||
slice_lambdas.data(),
|
||||
h_norm,
|
||||
slice_lambdas.empty() ? nullptr : slice_lambdas.data(),
|
||||
ptr_ref_wce,
|
||||
ptr_ref_mse
|
||||
);
|
||||
|
|
@ -1465,7 +1481,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
candidate.type = vt;
|
||||
candidate.bpw = (float)tensor_bpw(tensor, vt);
|
||||
candidate.bytes = tensor_bytes(tensor, vt);
|
||||
candidate.error = qe.error;
|
||||
candidate.error = qe.error * scaling_factor;
|
||||
candidate.mse = qe.mse;
|
||||
candidate.proj = qe.proj;
|
||||
candidate.wce = qe.wce;
|
||||
|
|
@ -1616,10 +1632,11 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
auto build_mix = [&]() -> std::unordered_map<std::string, ggml_type> {
|
||||
std::unordered_map<std::string, ggml_type> mix;
|
||||
LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func);
|
||||
for (const auto & ti : all_tensors) {
|
||||
LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n",
|
||||
func, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidates[ti.choice].type), ti.candidates[ti.choice].bpw, ti.candidates[ti.choice].error);
|
||||
mix[ggml_get_name(ti.w->tensor)] = ti.candidates[ti.choice].type;
|
||||
for (const auto & tn : all_tensors) {
|
||||
LLAMA_LOG_INFO("\t%s: %45s %s\t%8s, \t%1.4f bpw,\terror: %.4f\n",
|
||||
func, ggml_get_name(tn.w->tensor), tn.important ? "⬆︎" : "-", ggml_type_name(tn.candidates[tn.choice].type), tn.candidates[tn.choice].bpw,
|
||||
tn.candidates[tn.choice].error);
|
||||
mix[ggml_get_name(tn.w->tensor)] = tn.candidates[tn.choice].type;
|
||||
}
|
||||
|
||||
return mix;
|
||||
|
|
@ -1634,23 +1651,62 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
return build_mix();
|
||||
}
|
||||
|
||||
auto importance_score = [](const std::vector<float> & tstats) -> float {
|
||||
if (tstats.size() < 12) { return 0.0f; }
|
||||
|
||||
const float energy = std::log1pf(std::max(0.0f, (float)tstats[ENERGY]));
|
||||
const float range = 1.0f + std::max(0.0f, tstats[STDDEV]);
|
||||
const float magnitude = std::isfinite(tstats[L2_DIST]) ? 1.0f + tstats[L2_DIST] : 1.0f;
|
||||
const float alignment = std::isfinite(tstats[COSSIM]) ? 1.0f - tstats[COSSIM] : 1.0f;
|
||||
const float concentration = 1.0f - std::clamp(tstats[H_NORM], 0.0f, 100.0f) / 100.0f + EPSILON;
|
||||
|
||||
return energy * range * magnitude * alignment * concentration;
|
||||
};
|
||||
|
||||
// Threshold at which pct of tensors will be marked as important
|
||||
auto threshold_score = [&](const std::unordered_map<std::string, std::vector<float>> & stats, const float pct) -> float {
|
||||
if (stats.empty() || pct < 0.0f || pct > 100.0f) { return std::numeric_limits<float>::quiet_NaN(); }
|
||||
|
||||
std::vector<float> val;
|
||||
val.reserve(stats.size());
|
||||
for (const auto & ts : stats) { val.push_back(importance_score(ts.second)); }
|
||||
if (val.empty()) { return std::numeric_limits<float>::quiet_NaN(); }
|
||||
|
||||
size_t idx = std::round((1.0f - pct / 100.0f) * (val.size() - 1));
|
||||
if (idx >= val.size()) { idx = val.size() - 1; }
|
||||
std::nth_element(val.begin(), val.begin() + idx, val.end());
|
||||
|
||||
return val[idx];
|
||||
};
|
||||
|
||||
float cutoff = std::numeric_limits<float>::quiet_NaN();
|
||||
if (statistics_data && !statistics_data->empty()) { cutoff = threshold_score(* statistics_data, params->importance_pct); }
|
||||
LLAMA_LOG_INFO("%s: - importance score cutoff: %1.4f\n", func, cutoff);
|
||||
|
||||
// Certain tensors have a higher impact on model quality, so we apply a lower penalty to them
|
||||
auto is_important = [&](const std::string & tensor_name) -> bool {
|
||||
bool important = false;
|
||||
if (params->ignore_tensor_importance) { return important; }
|
||||
if (tensor_name == "output.weight") { return true; }
|
||||
if (params->importance_pct == 0.0f) { return false; }
|
||||
if (std::isfinite(cutoff)) {
|
||||
if (auto it = statistics_data->find(remap_imatrix(tensor_name, mapped)); it != statistics_data->end() && !it->second.empty()) {
|
||||
return importance_score(it->second) >= cutoff;
|
||||
}
|
||||
} else {
|
||||
return tensor_name.find(".attn_output.weight") != std::string::npos ||
|
||||
tensor_name.find(".attn_o.weight") != std::string::npos ||
|
||||
tensor_name.find(".attn_v.weight") != std::string::npos ||
|
||||
tensor_name.find(".ffn_down.weight") != std::string::npos ||
|
||||
tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
|
||||
tensor_name.find(".time_mix_output.weight") != std::string::npos ||
|
||||
tensor_name.find(".time_mix_value.weight") != std::string::npos;
|
||||
}
|
||||
|
||||
important = tensor_name == "output.weight" ||
|
||||
tensor_name.find(".attn_output.weight") != std::string::npos ||
|
||||
tensor_name.find(".attn_o.weight") != std::string::npos ||
|
||||
tensor_name.find(".attn_v.weight") != std::string::npos ||
|
||||
tensor_name.find(".ffn_down.weight") != std::string::npos ||
|
||||
tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
|
||||
tensor_name.find(".time_mix_output.weight") != std::string::npos ||
|
||||
tensor_name.find(".time_mix_value.weight") != std::string::npos;
|
||||
|
||||
return important;
|
||||
return false;
|
||||
};
|
||||
|
||||
// Determine tensor importance
|
||||
for (auto & tn : all_tensors) { tn.important = is_important(ggml_get_name(tn.w->tensor)); }
|
||||
|
||||
// Minimize error subject to a size target constraint
|
||||
auto lagrangian_relaxation = [&](const double mu, std::vector<int> & choices, size_t & bytes, double & cost) {
|
||||
choices.resize(all_tensors.size());
|
||||
|
|
@ -1658,8 +1714,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
cost = 0.0;
|
||||
for (size_t i = 0; i < all_tensors.size(); ++i) {
|
||||
const auto & tn = all_tensors[i];
|
||||
const bool imp = is_important(ggml_get_name(tn.w->tensor));
|
||||
const double eff_mu = imp ? mu * 0.1 : mu; // important tensors get 10x lower penalty
|
||||
const double eff_mu = tn.important ? mu / penalty : mu; // important tensors get a lower penalty
|
||||
|
||||
int best = 0;
|
||||
double min = INFINITE;
|
||||
|
|
@ -1764,7 +1819,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
|||
auto bytes = (double)(tn.candidates[next].bytes - tn.candidates[tn.choice].bytes);
|
||||
if (bytes > EPSILON) {
|
||||
double ratio = err / bytes;
|
||||
if (is_important(ggml_get_name(tn.w->tensor))) { ratio *= 5.0; } // important tensors get 5x boost
|
||||
if (tn.important) { ratio *= penalty; } // important tensors get a higher priority
|
||||
queue.push({i, next, ratio});
|
||||
}
|
||||
}
|
||||
|
|
@ -2051,10 +2106,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|||
if (params->statistics) {
|
||||
LLAMA_LOG_INFO("%s: imatrix has statistics\n", __func__);
|
||||
}
|
||||
if (params->ignore_tensor_importance) {
|
||||
LLAMA_LOG_INFO("%s: distributing budget equitably across all tensors\n", __func__);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: assigning more budget to important tensors\n", __func__);
|
||||
if (params->importance_pct != 0.0f) {
|
||||
LLAMA_LOG_INFO("%s: marking up to %.2f%% of tensors as important\n", __func__, params->importance_pct);
|
||||
}
|
||||
if (params->use_wce) {
|
||||
LLAMA_LOG_INFO("%s: using experimental Weighted Cosine Error (WCE) optimization\n", __func__);
|
||||
|
|
@ -2426,7 +2479,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
|
|||
/*.target_size =*/ -1,
|
||||
/*.save_state =*/ false,
|
||||
/*.state_file =*/ nullptr,
|
||||
/*.ignore_tensor_importance =*/ false,
|
||||
/*.importance_pct =*/ 0.0f,
|
||||
/*.use_wce =*/ false
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -139,8 +139,8 @@ static void usage(const char * executable) {
|
|||
printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
|
||||
printf(" --target-size N[unit]: target a file size. N must be a positive number with an optional unit (b, kb, mb, gb, tb)\n");
|
||||
printf(" Advanced option to automatically select quantization types to achieve a target file size\n");
|
||||
printf(" --ignore-tensor-importance: distribute bpw budget equitably across all tensors\n");
|
||||
printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n");
|
||||
printf(" --importance-pct N: mark up to N%% of tensors as important. N must be a positive number between 0.0 and 100.0\n");
|
||||
printf(" Advanced option to select up to N%% of important tensors to keep at a higher precision. It may increase quality for some models\n");
|
||||
printf(" --save-state: save the bpw / file size computations to <model name>-<model hash>-mse.bpw_state\n");
|
||||
printf(" --state-file file_name: file name to use instead of default\n");
|
||||
printf(" --keep-split: will generate quantized model in the same shards as input\n");
|
||||
|
|
@ -557,6 +557,27 @@ static bool parse_target_bpw(const char * data, float & target_bpw) {
|
|||
return true;
|
||||
}
|
||||
|
||||
static bool parse_importance_pct(const char * data, float & importance_pct) {
|
||||
if (!data) {
|
||||
printf("\n%s: no tensor importance %% provided\n\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
importance_pct = std::stof(data);
|
||||
if (importance_pct < 0.0f || importance_pct > 100.0f) {
|
||||
printf("\n%s: tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
catch (const std::exception & e) {
|
||||
printf("\n%s: '%s' is not valid. Tensor importance %% must be a positive number between 0.0 and 100.0\n\n", __func__, data);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool parse_target_size(const char * data, int64_t & target_size) {
|
||||
if (!data) {
|
||||
printf("\n%s: no target file size provided\n\n", __func__);
|
||||
|
|
@ -633,6 +654,7 @@ int main(int argc, char ** argv) {
|
|||
std::vector<int> prune_layers;
|
||||
float target_bpw = -1.0f;
|
||||
int64_t target_size = -1;
|
||||
float importance_pct = 0.0f;
|
||||
|
||||
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
|
||||
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
|
||||
|
|
@ -673,8 +695,10 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
} else if (strcmp(argv[arg_idx], "--use-wce") == 0) {
|
||||
params.use_wce = true;
|
||||
} else if (strcmp(argv[arg_idx], "--ignore-tensor-importance") == 0) {
|
||||
params.ignore_tensor_importance = true;
|
||||
} else if (strcmp(argv[arg_idx], "--importance-pct") == 0) {
|
||||
if (arg_idx == argc-1 || !parse_importance_pct(argv[++arg_idx], importance_pct)) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--save-state") == 0) {
|
||||
params.save_state = true;
|
||||
} else if (strcmp(argv[arg_idx], "--state-file") == 0) {
|
||||
|
|
@ -792,6 +816,9 @@ int main(int argc, char ** argv) {
|
|||
if (target_size != -1) {
|
||||
params.target_size = target_size;
|
||||
}
|
||||
if (importance_pct != 0.0f) {
|
||||
params.importance_pct = importance_pct;
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue