From 1c9993e13198a28db1b5a8e7cd0fcb5d6bcf89eb Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 23 Nov 2025 17:51:04 +0000
Subject: [PATCH] Add --disable-tensor-importance option

---
 include/llama.h             |  1 +
 src/llama-quant.cpp         | 39 ++++++++++++++-----------------------
 tools/quantize/quantize.cpp |  4 ++++
 3 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/include/llama.h b/include/llama.h
index c82a4147f4..1f5b2e8a2b 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -369,6 +369,7 @@ extern "C" {
         float  target_bpw;                 // target bits per weight (bpw)
         bool   keep_bpw_state;             // keep bpw state file
         void * bpw_state;                  // pointer to bpw state file
+        bool   disable_tensor_importance;  // treat all tensors equally during quantization
     } llama_model_quantize_params;
 
     typedef struct llama_logit_bias {
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 99759a27c8..2b9aba091b 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1570,29 +1570,10 @@ static std::unordered_map target_bpw_type(
     // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them
     auto is_important = [&](const std::string & tensor_name) -> bool {
-        bool important = false;
-
-        if (statistics_data) {
-            const std::string key = remap_imatrix(tensor_name, mapped);
-            const auto tstats = statistics_data->find(key);
-            if (tstats != statistics_data->end() && !tstats->second.empty()) {
-                float ecs = 0.0f; // Euclidean-Cosine score
-                float l2  = 0.0f; // L2 Euclidean Distance
-                float cs  = 0.0f; // Cosine Similarity
-                try {
-                    // ecs = tstats->second.at(0);
-                    l2 = tstats->second.at(1);
-                    cs = tstats->second.at(2);
-                } catch (std::out_of_range &) {
-                    LLAMA_LOG_ERROR("\t%s: insufficient statistics for tensor %s\n", func, tensor_name.c_str());
-                    return false;
-                }
-                ecs = 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs)); // ecs = 100 - (100 / (1 + (L2 Dist/p)^2) * |Cos Sim|^q)
-                // LLAMA_LOG_INFO("\t%s: tensor %s has ECS score %.4f (L2 Distance %.4f and CosSim %.4f\n", func, tensor_name.c_str(), ecs, l2, cs);
-                important = ecs >= 99.99f; // mark as important if ecs is >= 99.99%
-            }
-        } else {
-            important = tensor_name == "output.weight" ||
+        bool important = tensor_name == "output.weight";
+        if (!important && !params->disable_tensor_importance) {
+            important = tensor_name.find(".attn_v.weight") != std::string::npos ||
+                tensor_name.find(".time_mix_value.weight") != std::string::npos ||
                 tensor_name.find(".ffn_down.weight") != std::string::npos ||
                 tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
                 tensor_name.find(".attn_output.weight") != std::string::npos ||
@@ -2023,7 +2004,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     std::unordered_map bpw_overrides = {};
     if (params->target_bpw != -1.0f && !params->only_copy) {
         if (params->imatrix) {
-
+            if (params->activations) {
+                LLAMA_LOG_INFO("%s: imatrix has activations, process will be more accurate\n", __func__);
+            } else {
+                LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__);
+            }
+            if (params->disable_tensor_importance) {
+                LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__);
+            } else {
+                LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__);
+            }
             LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
             bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread);
@@ -2291,6 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
         /*.target_bpw                  =*/ -1.0f,
         /*.keep_bpw_state              =*/ false,
         /*.bpw_state                   =*/ nullptr,
+        /*.disable_tensor_importance   =*/ false
     };
 
     return result;
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index aabcd73986..4fee8c91a1 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -134,6 +134,8 @@ static void usage(const char * executable) {
     printf("      Advanced option to remove all tensors from the given layers\n");
     printf("  --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n");
     printf("      Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
+    printf("  --disable-tensor-importance: treat all tensors equally during bpw quantization\n");
+    printf("      Advanced option to disable allocating more bpw budget to important tensors. It may increase quality for some models\n");
     printf("  --keep-bpw-state: save the bpw computations to -.bpw_state\n");
     printf("  --bpw-state: file name to use instead of default\n");
     printf("  --keep-split: will generate quantized model in the same shards as input\n");
@@ -560,6 +562,8 @@ int main(int argc, char ** argv) {
         if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
             usage(argv[0]);
         }
+    } else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) {
+        params.disable_tensor_importance = true;
     } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) {
         params.keep_bpw_state = true;
     } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) {
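
Usage note (not part of the patch): a minimal sketch of setting the new field through the C API rather than the quantize tool. The file names and the ftype/target_bpw values below are illustrative assumptions; only llama_model_quantize_default_params(), llama_model_quantize(), and the disable_tensor_importance field come from llama.h and this patch.

    // Sketch only: paths and quantization targets are made-up examples.
    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();

        params.ftype                     = LLAMA_FTYPE_MOSTLY_Q4_K_M; // base quantization mix (example)
        params.target_bpw                = 4.5f;                      // overall bits-per-weight target (example)
        params.disable_tensor_importance = true;                      // new option: spread the bpw budget equally across tensors

        // roughly what passing --target-bpw and --disable-tensor-importance to the quantize tool would do
        const uint32_t rc = llama_model_quantize("in-f16.gguf", "out-quant.gguf", &params);
        return rc == 0 ? 0 : 1;
    }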