From 10e5b148b061569aaee8ae0cf72a703129df0eab Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 10 Mar 2026 14:43:29 -0500 Subject: [PATCH] llama-quant : correct `n_attention_wv` usage (#20357) * llama-quant : correct `n_attention_wv` usage In #19770, I introduced a regression in the way the `quantize_state_impl` counter values were initialized. I was incrementing and using `n_attention_wv` in the same loop, when it should have been fixed by the time we're deciding tensor types in `llama_tensor_get_type_impl` (for `use_more_bits`). I never observed a difference in any of [my tests](https://github.com/ggml-org/llama.cpp/pull/19770#issuecomment-4000424712) - it was only after @bartowski kindly pointed this out that I realized it was incorrect. (Thanks!) * simplify --- src/llama-quant.cpp | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3b0d234fbe..8e8ce23124 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -870,9 +870,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize_state_impl qs(model, params); - // these need to be set to n_layer by default - qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; - if (params->only_copy) { ftype = ml.ftype; } @@ -979,6 +976,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // compute tensor metadata once and cache it std::vector metadata(tensors.size()); + // initialize quantization state before preliminary loop (counters for use_more_bits) + { + for (size_t i = 0; i < tensors.size(); ++i) { + const auto cat = tensor_get_category(tensors[i]->tensor->name); + if (category_is_attn_v(cat)) { + ++qs.n_attention_wv; + } + if (cat == tensor_category::OUTPUT) { + qs.has_tied_embeddings = false; + } + metadata[i].category = cat; // save and re-use the category while we're at it + } + // these also need to be set to n_layer by default + qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer; + } + // flag for --dry-run bool will_require_imatrix = false; @@ -991,16 +1004,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const struct ggml_tensor * tensor = it->tensor; const std::string name = ggml_get_name(tensor); - metadata[i].category = tensor_get_category(name); - - if (category_is_attn_v(metadata[i].category)) { - ++qs.n_attention_wv; - } - - if (tensor_name_match_output_weight(name.c_str())) { - qs.has_tied_embeddings = false; - } - uint16_t i_split = params->keep_split ? it->idx : 0; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty());