Add --disable-tensor-importance option
This commit is contained in:
parent
9ec3e6e262
commit
1c9993e131
|
|
@ -369,6 +369,7 @@ extern "C" {
|
||||||
float target_bpw; // target bits per weight (bpw)
|
float target_bpw; // target bits per weight (bpw)
|
||||||
bool keep_bpw_state; // keep bpw state file
|
bool keep_bpw_state; // keep bpw state file
|
||||||
void * bpw_state; // pointer to bpw state file
|
void * bpw_state; // pointer to bpw state file
|
||||||
|
bool disable_tensor_importance; // treat all tensors equally during quantization
|
||||||
} llama_model_quantize_params;
|
} llama_model_quantize_params;
|
||||||
|
|
||||||
typedef struct llama_logit_bias {
|
typedef struct llama_logit_bias {
|
||||||
|
|
|
||||||
|
|
@ -1570,29 +1570,10 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
|
||||||
|
|
||||||
// Certain tensors have a higher impact on model quality, so we apply a lower penalty to them
|
// Certain tensors have a higher impact on model quality, so we apply a lower penalty to them
|
||||||
auto is_important = [&](const std::string & tensor_name) -> bool {
|
auto is_important = [&](const std::string & tensor_name) -> bool {
|
||||||
bool important = false;
|
bool important = tensor_name == "output.weight";
|
||||||
|
if (!important && !params->disable_tensor_importance) {
|
||||||
if (statistics_data) {
|
important = tensor_name.find(".attn_v.weight") != std::string::npos ||
|
||||||
const std::string key = remap_imatrix(tensor_name, mapped);
|
tensor_name.find(".time_mix_value.weight") != std::string::npos ||
|
||||||
const auto tstats = statistics_data->find(key);
|
|
||||||
if (tstats != statistics_data->end() && !tstats->second.empty()) {
|
|
||||||
float ecs = 0.0f; // Euclidean-Cosine score
|
|
||||||
float l2 = 0.0f; // L2 Euclidean Distance
|
|
||||||
float cs = 0.0f; // Cosine Similarity
|
|
||||||
try {
|
|
||||||
// ecs = tstats->second.at(0);
|
|
||||||
l2 = tstats->second.at(1);
|
|
||||||
cs = tstats->second.at(2);
|
|
||||||
} catch (std::out_of_range &) {
|
|
||||||
LLAMA_LOG_ERROR("\t%s: insufficient statistics for tensor %s\n", func, tensor_name.c_str());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
ecs = 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs)); // ecs = 100 - (100 / (1 + (L2 Dist/p)^2) * |Cos Sim|^q)
|
|
||||||
// LLAMA_LOG_INFO("\t%s: tensor %s has ECS score %.4f (L2 Distance %.4f and CosSim %.4f\n", func, tensor_name.c_str(), ecs, l2, cs);
|
|
||||||
important = ecs >= 99.99f; // mark as important if ecs is >= 99.99%
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
important = tensor_name == "output.weight" ||
|
|
||||||
tensor_name.find(".ffn_down.weight") != std::string::npos ||
|
tensor_name.find(".ffn_down.weight") != std::string::npos ||
|
||||||
tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
|
tensor_name.find(".ffn_down_exps.weight") != std::string::npos ||
|
||||||
tensor_name.find(".attn_output.weight") != std::string::npos ||
|
tensor_name.find(".attn_output.weight") != std::string::npos ||
|
||||||
|
|
@ -2023,7 +2004,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||||
std::unordered_map<std::string, ggml_type> bpw_overrides = {};
|
std::unordered_map<std::string, ggml_type> bpw_overrides = {};
|
||||||
if (params->target_bpw != -1.0f && !params->only_copy) {
|
if (params->target_bpw != -1.0f && !params->only_copy) {
|
||||||
if (params->imatrix) {
|
if (params->imatrix) {
|
||||||
|
if (params->activations) {
|
||||||
|
LLAMA_LOG_INFO("%s: imatrix has activations, process will be more accurate\n", __func__);
|
||||||
|
} else {
|
||||||
|
LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__);
|
||||||
|
}
|
||||||
|
if (params->disable_tensor_importance) {
|
||||||
|
LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__);
|
||||||
|
} else {
|
||||||
|
LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__);
|
||||||
|
}
|
||||||
LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
|
LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw);
|
||||||
|
|
||||||
bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread);
|
bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread);
|
||||||
|
|
@ -2291,6 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
|
||||||
/*.target_bpw =*/ -1.0f,
|
/*.target_bpw =*/ -1.0f,
|
||||||
/*.keep_bpw_state =*/ false,
|
/*.keep_bpw_state =*/ false,
|
||||||
/*.bpw_state =*/ nullptr,
|
/*.bpw_state =*/ nullptr,
|
||||||
|
/*.disable_tensor_importance =*/ false
|
||||||
};
|
};
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
|
|
||||||
|
|
@ -134,6 +134,8 @@ static void usage(const char * executable) {
|
||||||
printf(" Advanced option to remove all tensors from the given layers\n");
|
printf(" Advanced option to remove all tensors from the given layers\n");
|
||||||
printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n");
|
printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n");
|
||||||
printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
|
printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n");
|
||||||
|
printf(" --disable-tensor-importance: treat all tensors equally during bpw quantization\n");
|
||||||
|
printf(" Advanced option to disable allocating more bpw budget to important tensors. It may increase quality for some models\n");
|
||||||
printf(" --keep-bpw-state: save the bpw computations to <architecture>-<model hash>.bpw_state\n");
|
printf(" --keep-bpw-state: save the bpw computations to <architecture>-<model hash>.bpw_state\n");
|
||||||
printf(" --bpw-state: file name to use instead of default\n");
|
printf(" --bpw-state: file name to use instead of default\n");
|
||||||
printf(" --keep-split: will generate quantized model in the same shards as input\n");
|
printf(" --keep-split: will generate quantized model in the same shards as input\n");
|
||||||
|
|
@ -560,6 +562,8 @@ int main(int argc, char ** argv) {
|
||||||
if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
|
if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) {
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
}
|
}
|
||||||
|
} else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) {
|
||||||
|
params.disable_tensor_importance = true;
|
||||||
} else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) {
|
} else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) {
|
||||||
params.keep_bpw_state = true;
|
params.keep_bpw_state = true;
|
||||||
} else if (strcmp(argv[arg_idx], "--bpw-state") == 0) {
|
} else if (strcmp(argv[arg_idx], "--bpw-state") == 0) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue