From bf34e75799ec65bcaeeb452bdb875ccd49dc65cc Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Mon, 9 Mar 2026 10:53:56 -0400 Subject: [PATCH] Add specific attn_qkv logic --- src/llama-quant.cpp | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c2ab1b80e8..3a07bbf001 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -81,6 +81,7 @@ struct quantize_state_impl { const llama_model_quantize_params * params; int n_attention_wv = 0; + int n_attn_qkv = 0; int n_ffn_down = 0; int n_ffn_gate = 0; int n_ffn_up = 0; @@ -92,6 +93,7 @@ struct quantize_state_impl { int n_ffn_up_shexp = 0; int n_ssm_out = 0; int n_attn_q = 0; + int i_attn_qkv = 0; int i_attention_wv = 0; int i_ffn_down = 0; int i_ffn_gate = 0; @@ -333,7 +335,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { + if (name.find("attn_v.weight") != std::string::npos) { if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { new_type = GGML_TYPE_Q6_K; } @@ -342,6 +344,20 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } ++qs.i_attention_wv; } + else if (name.find("attn_qkv.weight") != std::string::npos) { + if (qs.model.hparams.n_expert >= 8) { + if (use_more_bits(qs.i_attn_qkv, qs.n_attn_qkv)) { + new_type = GGML_TYPE_Q6_K; + } + else { + new_type = GGML_TYPE_Q4_K; + } + } + else if (use_more_bits(qs.i_attn_qkv, qs.n_attn_qkv)) { + new_type = GGML_TYPE_Q4_K; + } + ++qs.i_attn_qkv; + } else if (qs.model.hparams.n_expert >= 8 && name.find("ssm_out.weight") != std::string::npos) { if (use_more_bits(qs.i_ssm_out, qs.n_ssm_out)) { new_type = GGML_TYPE_Q4_K; @@ -640,7 +656,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) { @@ -673,7 +689,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){ + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { @@ -705,7 +721,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){ + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { @@ -720,7 +736,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){ + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { @@ -973,11 +989,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // TODO: avoid hardcoded tensor names - use the TN_* constants if (name.find("attn_v.weight") != std::string::npos || - name.find("attn_qkv.weight") != std::string::npos || name.find("attn_kv_b.weight")!= std::string::npos) { ++qs.n_attention_wv; } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { qs.has_output = true; + } else if (name.find("attn_qkv.weight") != std::string::npos) { + ++qs.n_attn_qkv; } else if (name.find("ffn_gate_exps.weight") != std::string::npos) { ++qs.n_ffn_gate_exps; } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {