diff --git a/include/llama.h b/include/llama.h index 1f5b2e8a2b..50e61d4976 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,7 +369,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file - bool disable_tensor_importance; // treat all tensors equally during quantization + bool no_importance; // allocate target bpw budget equitably across all tensors } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2b9aba091b..c468a3e4fc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1571,7 +1571,7 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { bool important = tensor_name == "output.weight"; - if (!important && !params->disable_tensor_importance) { + if (!important && !params->no_importance) { important = tensor_name.find(".attn_v.weight") != std::string::npos || tensor_name.find(".time_mix_value.weight") != std::string::npos || tensor_name.find(".ffn_down.weight") != std::string::npos || @@ -2009,10 +2009,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); } - if (params->disable_tensor_importance) { - LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__); + if (params->no_importance) { + LLAMA_LOG_INFO("%s: distributing bpw budget equitably across all tensors\n", __func__); } else { - LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__); + LLAMA_LOG_INFO("%s: assigning more bpw budget to important tensors\n", __func__); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); @@ -2281,7 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, - /*.disable_tensor_importance =*/ false + /*.no_importance =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 4fee8c91a1..dd4b860e1b 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,9 +117,9 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); - printf(" [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); - printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); + printf(" [--target-bpw n] [--no-importance] [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type]\n"); + printf(" [--prune-layers] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -134,8 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --disable-tensor-importance: treat all tensors equally during bpw quantization\n"); - printf(" Advanced option to disable allocating more bpw budget to important tensors. It may increase quality for some models\n"); + printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); + printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); @@ -562,8 +562,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) { - params.disable_tensor_importance = true; + } else if (strcmp(argv[arg_idx], "--no-importance") == 0) { + params.no_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) {