From a26db356c9ce52ca8b770de47613cfaa3344bf99 Mon Sep 17 00:00:00 2001
From: ddh0
Date: Mon, 16 Feb 2026 13:43:22 -0600
Subject: [PATCH] refactor

---
 src/llama-quant.cpp | 164 +++++++++++++++++++++++---------------------
 1 file changed, 85 insertions(+), 79 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index afec667dc1..5cc538caf0 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <memory>
 #include

 // Quantization types. Changes to this struct must be replicated in quantize.cpp
@@ -19,6 +20,35 @@ struct tensor_quantization {
     ggml_type quant = GGML_TYPE_COUNT;
 };

+struct quantize_state_impl {
+    const llama_model & model;
+    const llama_model_quantize_params * params;
+
+    int n_attention_wv = 0;
+    int n_ffn_down = 0;
+    int n_ffn_gate = 0;
+    int n_ffn_up = 0;
+    int i_attention_wv = 0;
+    int i_ffn_down = 0;
+    int i_ffn_gate = 0;
+    int i_ffn_up = 0;
+
+    int n_fallback = 0;
+
+    bool has_imatrix = false;
+
+    // used to figure out if a model shares tok_embd with the output weight
+    bool has_output = false;
+
+    // if this flag is false, the code will skip updating this struct
+    bool do_count = false;
+
+    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+    {}
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -76,32 +106,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map<
     return orig_name;
 }

-struct quantize_state_impl {
-    const llama_model & model;
-    const llama_model_quantize_params * params;
-
-    int n_attention_wv = 0;
-    int n_ffn_down = 0;
-    int n_ffn_gate = 0;
-    int n_ffn_up = 0;
-    int i_attention_wv = 0;
-    int i_ffn_down = 0;
-    int i_ffn_gate = 0;
-    int i_ffn_up = 0;
-
-    int n_fallback = 0;
-
-    bool has_imatrix = false;
-
-    // used to figure out if a model shares tok_embd with the output weight
-    bool has_output = false;
-
-    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
-        : model(model)
-        , params(params)
-    {}
-};
-
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector> & output, std::vector & workers,
     const size_t nelements, const int nthread
@@ -177,7 +181,7 @@ static void llama_tensor_dequantize_impl(
 // internal standard logic for selecting the target tensor type for a specific
 // quantization mixture & model architecture
 static ggml_type llama_tensor_get_type_impl(
-    quantize_state_impl & qs,
+    quantize_state_impl * qs,
     ggml_type new_type,
     const ggml_tensor * tensor,
     const llama_ftype ftype
@@ -185,13 +189,13 @@
     const std::string name = ggml_get_name(tensor);

     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const llm_arch arch = qs.model.arch;
+    const llm_arch arch = qs->model.arch;
     const auto tn = LLM_TN(arch);

     auto use_more_bits = [](int i_layer, int n_layers) -> bool {
         return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
     };
-    const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+    const int n_expert = std::max(1, (int)qs->model.hparams.n_expert);
     auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
         if (n_expert > 1) {
             // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
@@ -210,9 +214,9 @@

     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
-        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->output_tensor_type;
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs->has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
+        if (qs->params->output_tensor_type < GGML_TYPE_COUNT) {
+            new_type = qs->params->output_tensor_type;
         } else {
             const int64_t nx = tensor->ne[0];
             const int64_t qk_k = ggml_blck_size(new_type);
@@ -241,8 +245,8 @@
             new_type = GGML_TYPE_Q8_0;
         }
     } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
-        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->token_embedding_type;
+        if (qs->params->token_embedding_type < GGML_TYPE_COUNT) {
+            new_type = qs->params->token_embedding_type;
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -261,19 +265,19 @@
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+            if (qs->model.hparams.n_gqa() >= 4 || qs->model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
         }
-        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+        else if (qs->model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_ffn_down < qs.n_ffn_down/8) {
+            if (qs->i_ffn_down < qs->n_ffn_down/8) {
                 new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs->model.hparams.n_expert == 8) {
                 new_type = GGML_TYPE_Q5_K;
             } else {
                 if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
@@ -282,43 +286,43 @@
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs->model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs->has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs->model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs->i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs->model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        if (qs.model.type == LLM_TYPE_70B) {
+                use_more_bits(qs->i_attention_wv, qs->n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs->i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        if (qs->model.type == LLM_TYPE_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs->model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs->model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
@@ -337,13 +341,13 @@
             new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+        auto info = layer_info(qs->i_ffn_down, qs->n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs->has_imatrix) {
             new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -352,7 +356,7 @@
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
-                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+                    (qs->model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -366,7 +370,7 @@
                 if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs->has_imatrix) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
@@ -374,7 +378,7 @@
             new_type = GGML_TYPE_Q5_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
-                && qs.has_imatrix && i_layer < n_layer/8) {
+                && qs->has_imatrix && i_layer < n_layer/8) {
             // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
             // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
@@ -382,7 +386,7 @@
         }
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs->model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
@@ -408,14 +412,14 @@
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     } else if (name.find("ffn_gate") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
+        auto info = layer_info(qs->i_ffn_gate, qs->n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
     } else if (name.find("ffn_up") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
+        auto info = layer_info(qs->i_ffn_up, qs->n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
@@ -426,7 +430,7 @@

 // determine the ggml_type that this tensor should be quantized to
 static ggml_type llama_tensor_get_type(
-    quantize_state_impl & qs,
+    quantize_state_impl * qs,
     const llama_model_quantize_params * params,
     const ggml_tensor * tensor,
     const ggml_type default_type
@@ -492,7 +496,9 @@ static ggml_type llama_tensor_get_type(
             if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
                 new_type = GGML_TYPE_F16;
             }
-            ++qs.n_fallback;
+            if (qs->do_count) {
+                ++qs->n_fallback;
+            }
         }
     }
 }
@@ -708,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);

-    quantize_state_impl qs(model, params);
+    auto qs = std::make_unique<quantize_state_impl>(model, params);

     if (params->only_copy) {
         ftype = ml.ftype;
@@ -718,7 +724,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         imatrix_data = static_cast>*>(params->imatrix);
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
-            qs.has_imatrix = true;
+            qs->has_imatrix = true;
             // check imatrix for nans or infs
             for (const auto & kv : *imatrix_data) {
                 for (float f : kv.second) {
@@ -830,15 +836,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         const ggml_tensor * tensor = it->tensor;
         const std::string name = tensor->name;

-        // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos ||
-            name.find("attn_qkv.weight") != std::string::npos ||
-            name.find("attn_kv_b.weight")!= std::string::npos) {
-            ++qs.n_attention_wv;
-        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
-            qs.has_output = true;
-        }
-
         // populate the original tensors so we get an initial meta data
         uint16_t i_split = params->keep_split ? it->idx : 0;
         if (!ctx_outs[i_split]) {
@@ -867,9 +864,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
     }

-    qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
-    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
-
     // Set split info if needed
     if (n_split > 1) {
         for (size_t i = 0; i < ctx_outs.size(); ++i) {
@@ -918,6 +912,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     size_t total_size_org = 0;
     size_t total_size_new = 0;

+    qs->n_ffn_down = qs->n_ffn_gate = qs->n_ffn_up = (int)model.hparams.n_layer;
+    qs->do_count = true; // we start counting stats for the main loop
+
     // iterate over all weights (main loop)
     for (const auto * it : weights) {
         const auto & weight = *it;
@@ -931,6 +928,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         const std::string name = ggml_get_name(tensor);
         const size_t tensor_size = ggml_nbytes(tensor);

+        // TODO: avoid hardcoded tensor names - use the TN_* constants
+        if (name.find("attn_v.weight") != std::string::npos ||
+            name.find("attn_qkv.weight") != std::string::npos ||
+            name.find("attn_kv_b.weight")!= std::string::npos) {
+            ++qs->n_attention_wv;
+        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+            qs->has_output = true;
+        }
+
         if (!params->dry_run) {
             if (!ml.use_mmap) {
                 if (read_data.size() < tensor_size) {
@@ -962,13 +968,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

             // count stats for this tensor based on its name
             if (name.find("attn_v.weight") != std::string::npos || name.find("attn_kv_b.weight") != std::string::npos) {
-                ++qs.i_attention_wv;
+                ++qs->i_attention_wv;
             } else if (name.find("ffn_down") != std::string::npos) {
-                ++qs.i_ffn_down;
+                ++qs->i_ffn_down;
             } else if (name.find("ffn_gate") != std::string::npos) {
-                ++qs.i_ffn_gate;
+                ++qs->i_ffn_gate;
             } else if (name.find("ffn_up") != std::string::npos) {
-                ++qs.i_ffn_up;
+                ++qs->i_ffn_up;
             }
         }

@@ -1128,9 +1134,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         );
     }

-    if (qs.n_fallback > 0) {
+    if (qs->n_fallback > 0) {
         LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
-            __func__, qs.n_fallback);
+            __func__, qs->n_fallback);
     }
 }