From 444f00b0ec814a071ce1b9dc0de5ea4b4850bd1b Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Sat, 6 Dec 2025 12:26:20 +0100
Subject: [PATCH] llama : remove quantization sanity check (#17788)

* llama : remove quantization sanity check

This commit removes the quantization sanity check for attention layers.

The motivation for this is that there are hybrid models that have
recurrent layers, expert layers, and attention layers. For these models
the current check fails because the expert layers are not taken into
account. After consideration, it was decided that this check is not
strictly necessary and can be removed to allow for more flexible model
architectures.

* llama : remove unused pruned_attention_w and is_clip_model vars
---
 src/llama-quant.cpp | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 764833749e..351dcb7baa 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::map mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector tensors;
@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -717,30 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        int32_t n_layer_all = model.hparams.n_layer;
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_all is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_all += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_all = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_all, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_all - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;