From cb496c4e8a63a4ebe6362f587099a89488631048 Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Mon, 31 Mar 2025 13:21:07 -0400 Subject: [PATCH 01/13] Update llama-quant.cpp llama_tensor_get_type with DeepSeek friendly modifications --- src/llama-quant.cpp | 68 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0b23eaef3a..6d67448fda 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -259,9 +259,39 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; ++qs.i_attention_wv; } - else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) { new_type = GGML_TYPE_Q4_K; } + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) { + new_type = GGML_TYPE_Q4_K; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down") != std::string::npos) { + if (qs.i_ffn_down < qs.n_ffn_down/16) { + new_type = GGML_TYPE_Q4_K; + } + else if (qs.i_ffn_down < qs.n_ffn_down/8) { + new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + } + ++qs.i_ffn_down; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate") != std::string::npos) { + if (qs.i_ffn_gate < qs.n_ffn_gate/16) { + new_type = GGML_TYPE_Q4_K; + } + else if (qs.i_ffn_gate < qs.n_ffn_gate/8 || qs.i_ffn_gate >= 7*qs.n_ffn_gate/8) { + new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + } + ++qs.i_ffn_gate; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up") != std::string::npos) { + if (qs.i_ffn_up < qs.n_ffn_up/16) { + new_type = GGML_TYPE_Q4_K; + } + else if (qs.i_ffn_up < qs.n_ffn_up/8) { + new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + } + ++qs.i_ffn_up; + } else if (name.find("ffn_down") != std::string::npos) { if (qs.i_ffn_down < qs.n_ffn_down/8) { new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; @@ -269,7 +299,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { - if (qs.model.hparams.n_expert == 8) { + if (qs.model.hparams.n_expert >= 8) { new_type = GGML_TYPE_Q5_K; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; @@ -380,7 +410,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { - if (qs.model.hparams.n_expert == 8) { + if (qs.model.hparams.n_expert >= 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || @@ -420,6 +450,38 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_IQ3_XXS; } ++qs.i_ffn_up; + } else if (name.find("attn_kv_a_mqa") != std::string::npos) { + if (qs.model.hparams.n_expert >= 8) { + new_type = GGML_TYPE_Q8_0; + } + } else if (name.find("attn_kv_b.weight") != 
std::string::npos) { + if (qs.model.hparams.n_expert >= 8) { + new_type = GGML_TYPE_Q4_K; + if (qs.i_attention_wv < qs.n_attention_wv/16) { + new_type = GGML_TYPE_Q8_0; + } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { + new_type = GGML_TYPE_Q6_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + } + ++qs.i_attention_wv; + } else if (name.find("attn_q_b.weight") != std::string::npos) { + if (qs.model.hparams.n_expert >= 8) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + } + } else if (name.find("attn_q_a.weight") != std::string::npos) { + if (qs.model.hparams.n_expert >= 8) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + } } // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; From 3f8d7a258286fdeeb4a9b4851fbc8a737d237023 Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Tue, 1 Apr 2025 19:01:40 -0400 Subject: [PATCH 02/13] Claw back a few of the changes for less dramatic file size increase --- src/llama-quant.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6d67448fda..7f854d0cb0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -259,12 +259,27 @@ static ggml_type 
llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; ++qs.i_attention_wv; } - else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) { + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; } - else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) { + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; } + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) { + if (qs.i_attention_wv < qs.n_attention_wv/16) { + new_type = GGML_TYPE_Q4_K; + } + else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { + new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + } + ++qs.i_attention_wv; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) { + new_type = GGML_TYPE_Q4_K; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) { + new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down") != std::string::npos) { if (qs.i_ffn_down < qs.n_ffn_down/16) { new_type = GGML_TYPE_Q4_K; From 105261d2baef50ea65c48b49ba43d692b1d19ed9 Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Wed, 2 Apr 2025 22:32:59 -0400 Subject: [PATCH 03/13] Few more changes and tweaks --- ggml/src/ggml-common.h | 2 +- src/llama-quant.cpp | 157 +++++++++++++++++++++++++++++------------ 2 files changed, 114 insertions(+), 45 deletions(-) diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 93ab7ea446..5e61c44f34 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -378,8 +378,8 @@ typedef struct { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); -// 3.4375 bpw #define IQ3S_N_SCALE QK_K/64 +// 3.4375 bpw typedef struct { ggml_half d; uint8_t qs[QK_K/4]; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7f854d0cb0..3e56b2b86c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -84,10 +84,22 @@ struct quantize_state_impl { int n_ffn_down = 0; int n_ffn_gate = 0; int n_ffn_up = 0; + int n_ffn_down_exp = 0; + int n_ffn_gate_exp = 0; + int n_ffn_up_exp = 0; + int n_ffn_down_shexp = 0; + int n_ffn_gate_shexp = 0; + int n_ffn_up_shexp = 0; int i_attention_wv = 0; int i_ffn_down = 0; int i_ffn_gate = 0; int i_ffn_up = 0; + int i_ffn_down_exp = 0; + int i_ffn_gate_exp = 0; + int i_ffn_up_exp = 0; + int i_ffn_down_shexp = 0; + int i_ffn_gate_shexp = 0; + int i_ffn_up_shexp = 0; int n_k_quantized = 0; int n_fallback = 0; @@ -175,6 +187,23 @@ static void llama_tensor_dequantize_impl( workers.clear(); } +// Check if ftype is specifically IQ2_S or IQ2_M +static inline bool is_iq2s_or_iq2m(llama_ftype ftype) { + return ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M; +} + +// Check if ftype belongs to 
the IQ1 group +static inline bool is_iq1_group(llama_ftype ftype) { + return ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M; +} + +// Returns the appropriate type for expert _exps tensors based on ftype +static inline ggml_type get_expert_exps_type(llama_ftype ftype) { + if (is_iq1_group(ftype)) return GGML_TYPE_IQ2_XXS; + if (is_iq2s_or_iq2m(ftype)) return GGML_TYPE_IQ3_XXS; + /* otherwise */ return GGML_TYPE_IQ2_XS; +} + static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { const std::string name = ggml_get_name(tensor); @@ -242,7 +271,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = GGML_TYPE_Q2_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + else if (is_iq2s_or_iq2m(ftype)) { new_type = GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { @@ -256,7 +285,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; - else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + else new_type = is_iq2s_or_iq2m(ftype) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; ++qs.i_attention_wv; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) { @@ -266,11 +295,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q4_K; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) { - if (qs.i_attention_wv < qs.n_attention_wv/16) { + if (qs.i_attention_wv < qs.n_attention_wv/8) { new_type = GGML_TYPE_Q4_K; } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { - new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_attention_wv; } @@ -278,47 +307,83 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q4_K; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) { - new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } - else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down") != std::string::npos) { + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down.weight") != std::string::npos) { if (qs.i_ffn_down < qs.n_ffn_down/16) { new_type = GGML_TYPE_Q4_K; } else if (qs.i_ffn_down < qs.n_ffn_down/8) { - new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = is_iq2s_or_iq2m(ftype) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_ffn_down; } - else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate") != std::string::npos) { + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate.weight") != std::string::npos) { if (qs.i_ffn_gate < qs.n_ffn_gate/16) { new_type = GGML_TYPE_Q4_K; } - else if (qs.i_ffn_gate < qs.n_ffn_gate/8 || qs.i_ffn_gate >= 7*qs.n_ffn_gate/8) { - new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + else if (qs.i_ffn_gate < qs.n_ffn_gate/8) { + new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_ffn_gate; } - else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up") != std::string::npos) { + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up.weight") != std::string::npos) { if (qs.i_ffn_up < qs.n_ffn_up/16) { new_type = GGML_TYPE_Q4_K; } else if (qs.i_ffn_up < qs.n_ffn_up/8) { - new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = is_iq2s_or_iq2m(ftype) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_ffn_up; } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) { + if (qs.i_ffn_down_exp < qs.n_ffn_down_exp/8) { + new_type = get_expert_exps_type(ftype); + } + ++qs.i_ffn_down_exp; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_exps.weight") != std::string::npos) { + if (qs.i_ffn_gate_exp < qs.n_ffn_gate_exp/8) { + new_type = get_expert_exps_type(ftype); + } + ++qs.i_ffn_gate_exp; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_exps.weight") != std::string::npos) { + if (qs.i_ffn_up_exp < qs.n_ffn_up_exp/8) { + new_type = get_expert_exps_type(ftype); + } + ++qs.i_ffn_up_exp; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) { + if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) { + new_type = GGML_TYPE_Q4_K; + } + ++qs.i_ffn_down_shexp; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) { + if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) { + new_type = GGML_TYPE_Q4_K; + } + ++qs.i_ffn_gate_shexp; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) { + if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) { + new_type = GGML_TYPE_Q4_K; + } + ++qs.i_ffn_up_shexp; + } else if (name.find("ffn_down") != std::string::npos) { if (qs.i_ffn_down < qs.n_ffn_down/8) { - new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 8) { - new_type = GGML_TYPE_Q5_K; + new_type = is_iq2s_or_iq2m(ftype) ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; + if (is_iq1_group(ftype)) new_type = GGML_TYPE_IQ2_XXS; + else if (is_iq2s_or_iq2m(ftype)) new_type = GGML_TYPE_IQ3_S; } } } else if (name.find("attn_v.weight") != std::string::npos) { @@ -465,38 +530,28 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_IQ3_XXS; } ++qs.i_ffn_up; - } else if (name.find("attn_kv_a_mqa") != std::string::npos) { - if (qs.model.hparams.n_expert >= 8) { + } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) { + new_type = GGML_TYPE_Q8_0; + } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) { + new_type = GGML_TYPE_Q4_K; + if (qs.i_attention_wv < qs.n_attention_wv/16) { new_type = GGML_TYPE_Q8_0; + } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { + new_type = GGML_TYPE_Q6_K; } - } else if (name.find("attn_kv_b.weight") != std::string::npos) { - if (qs.model.hparams.n_expert >= 8) { - new_type = GGML_TYPE_Q4_K; - if (qs.i_attention_wv < qs.n_attention_wv/16) { - new_type = GGML_TYPE_Q8_0; - } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { - new_type = GGML_TYPE_Q6_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; - } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; 
++qs.i_attention_wv; - } else if (name.find("attn_q_b.weight") != std::string::npos) { - if (qs.model.hparams.n_expert >= 8) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = GGML_TYPE_Q4_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; - } - } else if (name.find("attn_q_a.weight") != std::string::npos) { - if (qs.model.hparams.n_expert >= 8) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = GGML_TYPE_Q4_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + } else if (qs.model.hparams.n_expert >= 8 &&name.find("attn_q_b.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + new_type = GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) { + new_type = GGML_TYPE_Q5_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; } // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; @@ -793,11 +848,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ++qs.n_attention_wv; } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { qs.has_output = true; + } else if (name.find("ffn_gate_exps.weight") != std::string::npos) { + ++qs.n_ffn_gate_exp; + } else if 
(name.find("ffn_gate_shexp.weight") != std::string::npos) { + ++qs.n_ffn_gate_shexp; + } else if (name.find("ffn_down_exps.weight") != std::string::npos) { + ++qs.n_ffn_down_exp; + } else if (name.find("ffn_down_shexp.weight") != std::string::npos) { + ++qs.n_ffn_down_shexp; + } else if (name.find("ffn_up_exps.weight") != std::string::npos) { + ++qs.n_ffn_up_exp; + } else if (name.find("ffn_up_shexp.weight") != std::string::npos) { + ++qs.n_ffn_up_shexp; } is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix } + GGML_ASSERT(qs.n_ffn_down_exp != 0); + qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; // sanity checks for models that have attention layers From 502812b2b16c0f8388e0741424f23f45426eeda6 Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Wed, 2 Apr 2025 22:52:13 -0400 Subject: [PATCH 04/13] Remove debug assert --- src/llama-quant.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3e56b2b86c..e108a82f37 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -865,8 +865,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." 
prefix } - GGML_ASSERT(qs.n_ffn_down_exp != 0); - qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; // sanity checks for models that have attention layers From a5c7f9e749230aecfb1ea455cb2d4c6d3155d7c9 Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Wed, 2 Apr 2025 23:02:06 -0400 Subject: [PATCH 05/13] Remove trailing whitespaces --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e108a82f37..aa5f64e250 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -318,7 +318,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } ++qs.i_ffn_down; } - else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate.weight") != std::string::npos) { + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate.weight") != std::string::npos) { if (qs.i_ffn_gate < qs.n_ffn_gate/16) { new_type = GGML_TYPE_Q4_K; } @@ -327,7 +327,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } ++qs.i_ffn_gate; } - else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up.weight") != std::string::npos) { + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up.weight") != std::string::npos) { if (qs.i_ffn_up < qs.n_ffn_up/16) { new_type = GGML_TYPE_Q4_K; } From 7889d1b81b80b0b98d6458a1fb6b6dca0b61c01e Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Thu, 3 Apr 2025 19:45:46 -0400 Subject: [PATCH 06/13] A bit more weight to shared experts for larger sizes --- src/llama-quant.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index aa5f64e250..1b2e7632cf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -443,6 +443,30 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == 
LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } + } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) { + new_type = GGML_TYPE_Q5_K; + if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) { + new_type = GGML_TYPE_Q8_0; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + ++qs.i_ffn_down_shexp; + } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) { + new_type = GGML_TYPE_Q5_K; + if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) { + new_type = GGML_TYPE_Q8_0; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + ++qs.i_ffn_gate_shexp; + } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) { + new_type = GGML_TYPE_Q5_K; + if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) { + new_type = GGML_TYPE_Q8_0; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + ++qs.i_ffn_up_shexp; } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; From f609de5a2c36387aacad7279a2bb48506332564f Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Thu, 8 May 2025 13:58:11 -0400 Subject: [PATCH 07/13] Update some of the weightings, remove some complication --- src/llama-quant.cpp | 175 +++++++++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 61 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 
1b2e7632cf..344796814a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -84,9 +84,9 @@ struct quantize_state_impl { int n_ffn_down = 0; int n_ffn_gate = 0; int n_ffn_up = 0; - int n_ffn_down_exp = 0; - int n_ffn_gate_exp = 0; - int n_ffn_up_exp = 0; + int n_ffn_down_exps = 0; + int n_ffn_gate_exps = 0; + int n_ffn_up_exps = 0; int n_ffn_down_shexp = 0; int n_ffn_gate_shexp = 0; int n_ffn_up_shexp = 0; @@ -94,9 +94,9 @@ struct quantize_state_impl { int i_ffn_down = 0; int i_ffn_gate = 0; int i_ffn_up = 0; - int i_ffn_down_exp = 0; - int i_ffn_gate_exp = 0; - int i_ffn_up_exp = 0; + int i_ffn_down_exps = 0; + int i_ffn_gate_exps = 0; + int i_ffn_up_exps = 0; int i_ffn_down_shexp = 0; int i_ffn_gate_shexp = 0; int i_ffn_up_shexp = 0; @@ -187,21 +187,54 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -// Check if ftype is specifically IQ2_S or IQ2_M -static inline bool is_iq2s_or_iq2m(llama_ftype ftype) { - return ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M; -} - -// Check if ftype belongs to the IQ1 group -static inline bool is_iq1_group(llama_ftype ftype) { - return ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M; -} // Returns the appropriate type for expert _exps tensors based on ftype -static inline ggml_type get_expert_exps_type(llama_ftype ftype) { - if (is_iq1_group(ftype)) return GGML_TYPE_IQ2_XXS; - if (is_iq2s_or_iq2m(ftype)) return GGML_TYPE_IQ3_XXS; - /* otherwise */ return GGML_TYPE_IQ2_XS; +static inline ggml_type get_exps_type_low_bpw_bump(llama_ftype ftype, ggml_type new_type) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == 
LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_M; + return new_type; +} + +static inline ggml_type get_exps_type_low_bpw_squash(llama_ftype ftype, ggml_type new_type) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ1_M; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ1_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_S; + return new_type; +} + +static inline ggml_type get_exps_type_high_bpw_bump(llama_ftype ftype, ggml_type new_type, bool has_imatrix) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + // Bump I-quants + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !has_imatrix) new_type = GGML_TYPE_Q5_K; + + return new_type; +} + +static inline ggml_type get_exps_type_high_bpw_squash(llama_ftype ftype, ggml_type new_type, bool has_imatrix) { + // Squash K-quants + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q2_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K; + // Squash 
I-quants + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = has_imatrix ? GGML_TYPE_IQ2_S : GGML_TYPE_Q2_K; + } + return new_type; } static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { @@ -271,7 +304,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = GGML_TYPE_Q2_K; } - else if (is_iq2s_or_iq2m(ftype)) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { new_type = GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { @@ -285,7 +318,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; - else new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + else new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; ++qs.i_attention_wv; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) { @@ -299,7 +332,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q4_K; } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { - new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_attention_wv; } @@ -307,14 +340,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q4_K; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) { - new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down.weight") != std::string::npos) { if (qs.i_ffn_down < qs.n_ffn_down/16) { - new_type = GGML_TYPE_Q4_K; + new_type = GGML_TYPE_Q6_K; } else if (qs.i_ffn_down < qs.n_ffn_down/8) { - new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_ffn_down; } @@ -323,7 +356,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q4_K; } else if (qs.i_ffn_gate < qs.n_ffn_gate/8) { - new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_ffn_gate; } @@ -332,58 +365,64 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q4_K; } else if (qs.i_ffn_up < qs.n_ffn_up/8) { - new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_ffn_up; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) { - if (qs.i_ffn_down_exp < qs.n_ffn_down_exp/8) { - new_type = get_expert_exps_type(ftype); + if (qs.i_ffn_down_exps < qs.n_ffn_down_exps/8 || qs.i_ffn_down_exps > 7*qs.n_ffn_down_exps/8) { + new_type = get_exps_type_low_bpw_bump(ftype, new_type); + } else { + new_type = get_exps_type_low_bpw_squash(ftype, new_type); } - ++qs.i_ffn_down_exp; + ++qs.i_ffn_down_exps; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_exps.weight") != std::string::npos) { - if (qs.i_ffn_gate_exp < qs.n_ffn_gate_exp/8) { - new_type = get_expert_exps_type(ftype); + if (qs.i_ffn_gate_exps < qs.n_ffn_gate_exps/8 || qs.i_ffn_gate_exps > 7*qs.n_ffn_gate_exps/8) { + new_type = get_exps_type_low_bpw_bump(ftype, new_type); + } else { + new_type = get_exps_type_low_bpw_squash(ftype, new_type); } - ++qs.i_ffn_gate_exp; + ++qs.i_ffn_gate_exps; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_exps.weight") != std::string::npos) { - if (qs.i_ffn_up_exp < qs.n_ffn_up_exp/8) { - new_type = get_expert_exps_type(ftype); + if (qs.i_ffn_up_exps < qs.n_ffn_up_exps/8 || qs.i_ffn_up_exps > 7*qs.n_ffn_up_exps/8) { + new_type = get_exps_type_low_bpw_bump(ftype, new_type); + } else { + new_type = get_exps_type_low_bpw_squash(ftype, new_type); } - ++qs.i_ffn_up_exp; + ++qs.i_ffn_up_exps; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) { if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) { - new_type = GGML_TYPE_Q4_K; + new_type = GGML_TYPE_Q6_K; } ++qs.i_ffn_down_shexp; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) { if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) { - new_type = GGML_TYPE_Q4_K; + new_type = GGML_TYPE_Q6_K; } ++qs.i_ffn_gate_shexp; } else if (qs.model.hparams.n_expert >= 8 && 
name.find("ffn_up_shexp.weight") != std::string::npos) { if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) { - new_type = GGML_TYPE_Q4_K; + new_type = GGML_TYPE_Q6_K; } ++qs.i_ffn_up_shexp; } else if (name.find("ffn_down") != std::string::npos) { if (qs.i_ffn_down < qs.n_ffn_down/8) { - new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 8) { - new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else { - if (is_iq1_group(ftype)) new_type = GGML_TYPE_IQ2_XXS; - else if (is_iq2s_or_iq2m(ftype)) new_type = GGML_TYPE_IQ3_S; + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; } } } else if (name.find("attn_v.weight") != std::string::npos) { @@ -425,7 +464,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } ++qs.i_attention_wv; } else if (name.find("attn_k.weight") != std::string::npos) { - if (qs.model.hparams.n_expert == 8) { + if (qs.model.hparams.n_expert >= 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; @@ -445,28 +484,46 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) { new_type = GGML_TYPE_Q5_K; + //if (qs.i_ffn_down_shexp < qs.n_ffn_down_shexp/8 || qs.i_ffn_down_shexp > 7*qs.n_ffn_down_shexp/8) { if (use_more_bits(qs.i_ffn_down_shexp, 
qs.n_ffn_down_shexp)) { new_type = GGML_TYPE_Q8_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; ++qs.i_ffn_down_shexp; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) { new_type = GGML_TYPE_Q5_K; + //if (qs.i_ffn_gate_shexp < qs.n_ffn_gate_shexp/8 || qs.i_ffn_gate_shexp > 7*qs.n_ffn_gate_shexp/8) { if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) { new_type = GGML_TYPE_Q8_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; ++qs.i_ffn_gate_shexp; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) { new_type = GGML_TYPE_Q5_K; + //if (qs.i_ffn_up_shexp < qs.n_ffn_up_shexp/8 || qs.i_ffn_up_shexp > 7*qs.n_ffn_up_shexp/8) { if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) { new_type = GGML_TYPE_Q8_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; ++qs.i_ffn_up_shexp; + } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) { + if (use_more_bits(qs.i_ffn_down_exps, qs.n_ffn_down_exps)) { + if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) && qs.has_imatrix) { + // Guard against craziness in the first few ffn_down layers that can happen even with 
imatrix for Q4_0/Q5_0. + // We only do it when an imatrix is provided because a) we want to make sure that one can always get the + // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. + new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; + } else { + new_type = get_exps_type_high_bpw_bump(ftype, new_type, qs.has_imatrix); + } + } + ++qs.i_ffn_down_exps; } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; @@ -556,21 +613,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ++qs.i_ffn_up; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) { new_type = GGML_TYPE_Q8_0; - } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) { - new_type = GGML_TYPE_Q4_K; - if (qs.i_attention_wv < qs.n_attention_wv/16) { - new_type = GGML_TYPE_Q8_0; - } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { - new_type = GGML_TYPE_Q6_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; - ++qs.i_attention_wv; + } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k_b.weight") != std::string::npos) { + new_type = GGML_TYPE_Q5_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_v_b.weight") != std::string::npos) { + new_type = GGML_TYPE_Q5_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = 
GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; } else if (qs.model.hparams.n_expert >= 8 &&name.find("attn_q_b.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = GGML_TYPE_Q4_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K; + new_type = GGML_TYPE_Q4_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) { new_type = GGML_TYPE_Q5_K; @@ -873,15 +926,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { qs.has_output = true; } else if (name.find("ffn_gate_exps.weight") != std::string::npos) { - ++qs.n_ffn_gate_exp; + ++qs.n_ffn_gate_exps; } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) { ++qs.n_ffn_gate_shexp; } else if (name.find("ffn_down_exps.weight") != std::string::npos) { - ++qs.n_ffn_down_exp; + ++qs.n_ffn_down_exps; } else if (name.find("ffn_down_shexp.weight") != std::string::npos) { ++qs.n_ffn_down_shexp; } else if (name.find("ffn_up_exps.weight") != std::string::npos) { - ++qs.n_ffn_up_exp; + ++qs.n_ffn_up_exps; } else if (name.find("ffn_up_shexp.weight") != std::string::npos) { ++qs.n_ffn_up_shexp; } From a312ac950d214036cfda549b8b96cb92fb776b5c Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Mon, 15 Dec 2025 12:38:26 -0500 Subject: [PATCH 08/13] Update to latest changes --- src/llama-quant.cpp | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 
344796814a..a0d9265e96 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -805,7 +805,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); + llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); @@ -875,7 +875,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::map mapped; int blk_id = 0; - int pruned_attention_w = 0; // make a list of weights std::vector tensors; @@ -883,11 +882,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: for (const auto & it : ml.weights_map) { const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id)); if (remapped_name.empty()) { - if (it.first.find("attn_v.weight") != std::string::npos || - it.first.find("attn_qkv.weight") != std::string::npos || - it.first.find("attn_kv_b.weight") != std::string::npos) { - pruned_attention_w++; - } LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str()); continue; } @@ -912,7 +906,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: }); } - bool is_clip_model = false; for (const auto * it : tensors) { const struct ggml_tensor * tensor = it->tensor; @@ -938,32 +931,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else if (name.find("ffn_up_shexp.weight") != std::string::npos) { ++qs.n_ffn_up_shexp; } - - is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." 
prefix } qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; - // sanity checks for models that have attention layers - if (qs.n_attention_wv != 0 && !is_clip_model) - { - const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin(); - // attention layers have a non-zero number of kv heads - int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0); - if (llama_model_has_encoder(&model)) { - // now n_layer_attn is the number of attention layers in the encoder - // for each decoder block, there are 2 attention layers - n_layer_attn += 2 * model.hparams.dec_n_layer; - } - - // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers - const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true); - - LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w); - - GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected"); - } - size_t total_size_org = 0; size_t total_size_new = 0; From d28595e5475f368074e1e8434f67f321858444c2 Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Sun, 1 Mar 2026 18:50:44 -0500 Subject: [PATCH 09/13] Update recipe --- src/llama-quant.cpp | 150 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 116 insertions(+), 34 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7a64c3dec0..a2c6745480 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -187,11 +187,10 @@ static void llama_tensor_dequantize_impl( workers.clear(); } - // Returns the appropriate type for expert _exps tensors based on ftype static inline ggml_type get_exps_type_low_bpw_bump(llama_ftype ftype, ggml_type new_type) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) 
new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; @@ -237,6 +236,30 @@ static inline ggml_type get_exps_type_high_bpw_squash(llama_ftype ftype, ggml_ty return new_type; } +static inline ggml_type get_ffn_shexp_ggml_type(llama_ftype ftype, ggml_type new_type) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) new_type = GGML_TYPE_Q5_0; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_1) new_type = GGML_TYPE_Q5_1; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { + new_type = GGML_TYPE_Q6_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K || + ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { + new_type = GGML_TYPE_Q8_0; + } + return new_type; +} + static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { const std::string name = ggml_get_name(tensor); @@ -288,6 +311,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q6_K; } } + } else if (name.find("ssm_ba.weight") != 
std::string::npos || + name.find("ssm_alpha.weight") != std::string::npos || + name.find("ssm_beta.weight") != std::string::npos) { + new_type = GGML_TYPE_Q8_0; } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) { // MoE tensors -> MXFP4 // other tensors -> Q8_0 @@ -321,6 +348,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; ++qs.i_attention_wv; } + else if (qs.model.hparams.n_expert >= 8 && name.find("ssm_out.weight") != std::string::npos) { + new_type = GGML_TYPE_Q4_K; + } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; } @@ -425,6 +455,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; } } + } else if (name.find("ssm_out.weight") != std::string::npos) { + new_type = GGML_TYPE_Q8_0; } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; @@ -433,7 +465,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? 
GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = GGML_TYPE_IQ3_S; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; @@ -451,13 +483,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q8_0; if (qs.model.type == LLM_TYPE_70B) { // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with // nearly negligible increase in model size by quantizing this tensor with more bits: if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; } - if (qs.model.hparams.n_expert == 8) { + if (qs.model.hparams.n_expert >= 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; @@ -483,34 +516,29 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_IQ2_S; } } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) { - new_type = GGML_TYPE_Q5_K; - //if (qs.i_ffn_down_shexp < qs.n_ffn_down_shexp/8 || qs.i_ffn_down_shexp > 7*qs.n_ffn_down_shexp/8) { if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) { new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == 
LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + else { + new_type = get_ffn_shexp_ggml_type(ftype, new_type); + } ++qs.i_ffn_down_shexp; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) { - new_type = GGML_TYPE_Q5_K; //if (qs.i_ffn_gate_shexp < qs.n_ffn_gate_shexp/8 || qs.i_ffn_gate_shexp > 7*qs.n_ffn_gate_shexp/8) { if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) { new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + else { + new_type = get_ffn_shexp_ggml_type(ftype, new_type); + } ++qs.i_ffn_gate_shexp; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) { - new_type = GGML_TYPE_Q5_K; - //if (qs.i_ffn_up_shexp < qs.n_ffn_up_shexp/8 || qs.i_ffn_up_shexp > 7*qs.n_ffn_up_shexp/8) { if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) { new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + else { + new_type = get_ffn_shexp_ggml_type(ftype, new_type); + } ++qs.i_ffn_up_shexp; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) { if (use_more_bits(qs.i_ffn_down_exps, qs.n_ffn_down_exps)) { @@ -572,11 +600,12 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) 
{ if (qs.model.hparams.n_expert >= 8) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || - ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { - new_type = GGML_TYPE_Q5_K; + ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || + ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { + new_type = GGML_TYPE_Q6_K; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; @@ -584,6 +613,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M ) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S ) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M ) new_type = GGML_TYPE_Q8_0; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S ) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K ) new_type = GGML_TYPE_Q8_0; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; @@ -595,6 +629,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = 
GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K ) new_type = GGML_TYPE_Q8_0; } else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); @@ -614,21 +649,68 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) { new_type = GGML_TYPE_Q8_0; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k_b.weight") != std::string::npos) { - new_type = GGML_TYPE_Q5_K; - if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) { + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){ + new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { + new_type = GGML_TYPE_Q6_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K) { + new_type = GGML_TYPE_Q8_0; + } } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_v_b.weight") != std::string::npos) { - new_type = GGML_TYPE_Q5_K; - if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == 
LLAMA_FTYPE_MOSTLY_Q4_K_S) { + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){ + new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { + new_type = GGML_TYPE_Q6_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K) { + new_type = GGML_TYPE_Q8_0; + } } else if (qs.model.hparams.n_expert >= 8 &&name.find("attn_q_b.weight") != std::string::npos) { - new_type = GGML_TYPE_Q4_K; - if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){ + new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { + new_type = GGML_TYPE_Q5_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K) new_type = GGML_TYPE_Q8_0; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) { - new_type = GGML_TYPE_Q5_K; - if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == 
LLAMA_FTYPE_MOSTLY_Q4_K_S) { + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){ + new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { + new_type = GGML_TYPE_Q6_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K) { + new_type = GGML_TYPE_Q8_0; + } } return new_type; From fc0a02df272469da253e227fadefe56864e3c3d1 Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Mon, 9 Mar 2026 10:52:14 -0400 Subject: [PATCH 10/13] Changes for attn_q, ssm_out, other tweaks --- src/llama-quant.cpp | 182 +++++++++++++++++++++++++------------------- 1 file changed, 102 insertions(+), 80 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a2c6745480..c2ab1b80e8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -80,26 +80,30 @@ struct quantize_state_impl { const llama_model & model; const llama_model_quantize_params * params; - int n_attention_wv = 0; - int n_ffn_down = 0; - int n_ffn_gate = 0; - int n_ffn_up = 0; - int n_ffn_down_exps = 0; - int n_ffn_gate_exps = 0; - int n_ffn_up_exps = 0; + int n_attention_wv = 0; + int n_ffn_down = 0; + int n_ffn_gate = 0; + int n_ffn_up = 0; + int n_ffn_down_exps = 0; + int n_ffn_gate_exps = 0; + int n_ffn_up_exps = 0; int n_ffn_down_shexp = 0; int n_ffn_gate_shexp = 0; int n_ffn_up_shexp = 0; - int i_attention_wv = 0; - int i_ffn_down = 0; - int i_ffn_gate = 0; - int i_ffn_up = 0; - int i_ffn_down_exps = 0; - int i_ffn_gate_exps = 0; - int i_ffn_up_exps = 0; + int n_ssm_out = 0; + int n_attn_q = 0; + int i_attention_wv = 0; + int i_ffn_down = 0; + int i_ffn_gate = 0; + int i_ffn_up = 0; + int i_ffn_down_exps = 0; + int i_ffn_gate_exps = 0; + int i_ffn_up_exps = 0; int i_ffn_down_shexp = 0; int i_ffn_gate_shexp = 0; int i_ffn_up_shexp = 0; + int i_ssm_out = 0; + int
i_ffn_up_shexp = 0; + int i_ssm_out = 0; + int i_attn_q = 0; int n_k_quantized = 0; int n_fallback = 0; @@ -209,39 +213,25 @@ static inline ggml_type get_exps_type_low_bpw_squash(llama_ftype ftype, ggml_typ } static inline ggml_type get_exps_type_high_bpw_bump(llama_ftype ftype, ggml_type new_type, bool has_imatrix) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K) new_type = GGML_TYPE_Q8_0; // Bump I-quants - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !has_imatrix) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q4_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !has_imatrix) new_type = GGML_TYPE_Q6_K; return new_type; } -static inline ggml_type get_exps_type_high_bpw_squash(llama_ftype ftype, ggml_type new_type, bool has_imatrix) { - // Squash K-quants - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q2_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) 
new_type = GGML_TYPE_Q4_K; - // Squash I-quants - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = has_imatrix ? GGML_TYPE_IQ2_S : GGML_TYPE_Q2_K; - } - return new_type; -} - static inline ggml_type get_ffn_shexp_ggml_type(llama_ftype ftype, ggml_type new_type) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = GGML_TYPE_IQ4_XS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || + ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) new_type = GGML_TYPE_Q5_0; @@ -314,7 +304,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } else if (name.find("ssm_ba.weight") != std::string::npos || name.find("ssm_alpha.weight") != std::string::npos || name.find("ssm_beta.weight") != std::string::npos) { - new_type = GGML_TYPE_Q8_0; + new_type = GGML_TYPE_F32; } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) { // MoE tensors -> MXFP4 // other tensors -> Q8_0 @@ -329,13 +319,13 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - new_type = GGML_TYPE_Q2_K; + new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - new_type = GGML_TYPE_IQ3_S; + new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = 
GGML_TYPE_IQ3_S; + new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; @@ -343,17 +333,27 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (name.find("attn_v.weight") != std::string::npos) { - if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; - else new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { + if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { + new_type = GGML_TYPE_Q6_K; + } + else { + new_type = GGML_TYPE_Q4_K; + } ++qs.i_attention_wv; } else if (qs.model.hparams.n_expert >= 8 && name.find("ssm_out.weight") != std::string::npos) { - new_type = GGML_TYPE_Q4_K; + if (use_more_bits(qs.i_ssm_out, qs.n_ssm_out)) { + new_type = GGML_TYPE_Q4_K; + } + ++qs.i_ssm_out; } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; } + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q.weight") != std::string::npos) { + new_type = GGML_TYPE_Q4_K; + } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; } @@ -402,42 +402,47 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) { if (qs.i_ffn_down_exps < qs.n_ffn_down_exps/8 || qs.i_ffn_down_exps > 7*qs.n_ffn_down_exps/8) { new_type = 
get_exps_type_low_bpw_bump(ftype, new_type); - } else { - new_type = get_exps_type_low_bpw_squash(ftype, new_type); } ++qs.i_ffn_down_exps; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_exps.weight") != std::string::npos) { - if (qs.i_ffn_gate_exps < qs.n_ffn_gate_exps/8 || qs.i_ffn_gate_exps > 7*qs.n_ffn_gate_exps/8) { - new_type = get_exps_type_low_bpw_bump(ftype, new_type); - } else { - new_type = get_exps_type_low_bpw_squash(ftype, new_type); - } + // if (qs.i_ffn_gate_exps < qs.n_ffn_gate_exps/8 || qs.i_ffn_gate_exps > 7*qs.n_ffn_gate_exps/8) { + // new_type = get_exps_type_low_bpw_bump(ftype, new_type); + // } else { + // new_type = get_exps_type_low_bpw_squash(ftype, new_type); + // } ++qs.i_ffn_gate_exps; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_exps.weight") != std::string::npos) { - if (qs.i_ffn_up_exps < qs.n_ffn_up_exps/8 || qs.i_ffn_up_exps > 7*qs.n_ffn_up_exps/8) { - new_type = get_exps_type_low_bpw_bump(ftype, new_type); - } else { - new_type = get_exps_type_low_bpw_squash(ftype, new_type); - } + // if (use_more_bits(qs.i_ffn_up_exps, qs.n_ffn_up_exps)) { + // new_type = get_exps_type_low_bpw_bump(ftype, new_type); + // } ++qs.i_ffn_up_exps; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) { if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) { - new_type = GGML_TYPE_Q6_K; + new_type = GGML_TYPE_Q8_0; + } + else { + new_type = GGML_TYPE_Q4_K; } ++qs.i_ffn_down_shexp; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) { if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) { - new_type = GGML_TYPE_Q6_K; + new_type = GGML_TYPE_Q8_0; + } + else { + new_type = GGML_TYPE_Q4_K; } ++qs.i_ffn_gate_shexp; } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) { if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) { - new_type = GGML_TYPE_Q6_K; + 
new_type = GGML_TYPE_Q8_0; + } + else { + new_type = GGML_TYPE_Q4_K; } ++qs.i_ffn_up_shexp; } @@ -456,33 +461,37 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } } } else if (name.find("ssm_out.weight") != std::string::npos) { - new_type = GGML_TYPE_Q8_0; + if (use_more_bits(qs.i_ssm_out, qs.n_ssm_out)) { + new_type = GGML_TYPE_Q8_0; + } + ++qs.i_ssm_out; } else if (name.find("attn_v.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { - new_type = GGML_TYPE_Q4_K; - } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = GGML_TYPE_IQ3_S; + new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = GGML_TYPE_Q4_K; + new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = qs.i_attention_wv < 2 ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { + new_type = GGML_TYPE_Q5_K; + } else { + new_type = GGML_TYPE_Q4_K; + } } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) { - new_type = GGML_TYPE_Q5_K; + new_type = GGML_TYPE_Q6_K; } else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q8_0; if (qs.model.type == LLM_TYPE_70B) { // In the 70B model we have 8 heads sharing the same attn_v weights. 
As a result, the attn_v.weight tensor is @@ -496,6 +505,16 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q8_0; } ++qs.i_attention_wv; + } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q.weight") != std::string::npos) { + if (use_more_bits(qs.i_attn_q, qs.n_attn_q)) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K || ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { + new_type = GGML_TYPE_Q8_0; + } + else { + new_type = GGML_TYPE_Q6_K; + } + } + ++qs.i_attn_q; } else if (name.find("attn_k.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB @@ -508,13 +527,6 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } - } else if (name.find("attn_q.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - new_type = GGML_TYPE_IQ3_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = GGML_TYPE_IQ2_S; - } } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) { if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) { new_type = GGML_TYPE_Q8_0; @@ -610,6 +622,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS ) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K; @@ -624,12 +637,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } } else if (name.find("attn_qkv.weight") != std::string::npos) { - 
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K ) new_type = GGML_TYPE_Q8_0; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) { + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K ) new_type = GGML_TYPE_Q8_0; } else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); @@ -972,6 +990,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ++qs.n_ffn_up_exps; } else if (name.find("ffn_up_shexp.weight") != std::string::npos) { ++qs.n_ffn_up_shexp; + } else if (name.find("ssm_out.weight") != std::string::npos) { + ++qs.n_ssm_out; + } else if (name.find("attn_q.weight") != std::string::npos) { + ++qs.n_attn_q; } } From bf34e75799ec65bcaeeb452bdb875ccd49dc65cc Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Mon, 9 Mar 2026 10:53:56 -0400 Subject: [PATCH 11/13] Add specific attn_qkv logic --- src/llama-quant.cpp | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c2ab1b80e8..3a07bbf001 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -81,6 +81,7 @@ struct 
quantize_state_impl { const llama_model_quantize_params * params; int n_attention_wv = 0; + int n_attn_qkv = 0; int n_ffn_down = 0; int n_ffn_gate = 0; int n_ffn_up = 0; @@ -92,6 +93,7 @@ struct quantize_state_impl { int n_ffn_up_shexp = 0; int n_ssm_out = 0; int n_attn_q = 0; + int i_attn_qkv = 0; int i_attention_wv = 0; int i_ffn_down = 0; int i_ffn_gate = 0; @@ -333,7 +335,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { + if (name.find("attn_v.weight") != std::string::npos) { if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { new_type = GGML_TYPE_Q6_K; } @@ -342,6 +344,20 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } ++qs.i_attention_wv; } + else if (name.find("attn_qkv.weight") != std::string::npos) { + if (qs.model.hparams.n_expert >= 8) { + if (use_more_bits(qs.i_attn_qkv, qs.n_attn_qkv)) { + new_type = GGML_TYPE_Q6_K; + } + else { + new_type = GGML_TYPE_Q4_K; + } + } + else if (use_more_bits(qs.i_attn_qkv, qs.n_attn_qkv)) { + new_type = GGML_TYPE_Q4_K; + } + ++qs.i_attn_qkv; + } else if (qs.model.hparams.n_expert >= 8 && name.find("ssm_out.weight") != std::string::npos) { if (use_more_bits(qs.i_ssm_out, qs.n_ssm_out)) { new_type = GGML_TYPE_Q4_K; @@ -640,7 +656,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == 
LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) { @@ -673,7 +689,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){ + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { @@ -705,7 +721,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){ + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { @@ -720,7 +736,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){ + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) { @@ -973,11 +989,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // TODO: avoid hardcoded tensor names - use the TN_* constants if (name.find("attn_v.weight") != std::string::npos || - name.find("attn_qkv.weight") != std::string::npos || name.find("attn_kv_b.weight")!= std::string::npos) { ++qs.n_attention_wv; } else if 
(name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { qs.has_output = true; + } else if (name.find("attn_qkv.weight") != std::string::npos) { + ++qs.n_attn_qkv; } else if (name.find("ffn_gate_exps.weight") != std::string::npos) { ++qs.n_ffn_gate_exps; } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) { From 12a850132c54c480d72d30abde2c30dcf7bd2c2a Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:36:06 -0400 Subject: [PATCH 12/13] Clean up a merge conflict --- src/llama-quant.cpp | 78 +++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9da2f88f4f..3a6feb1388 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1238,9 +1238,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize_state_impl qs(model, params); - // these need to be set to n_layer by default - qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; - if (params->only_copy) { ftype = ml.ftype; } @@ -1347,6 +1344,49 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // compute tensor metadata once and cache it std::vector metadata(tensors.size()); + // initialize quantization state before preliminary loop (counters for use_more_bits) + { + for (size_t i = 0; i < tensors.size(); ++i) { + const auto cat = tensor_get_category(tensors[i]->tensor->name); + if (category_is_attn_v(cat)) { + ++qs.n_attention_wv; + } + else if (cat == tensor_category::ATTENTION_QKV) { + ++qs.n_attn_qkv; + } + else if (cat == tensor_category::FFN_GATE_EXPS) { + ++qs.n_ffn_gate_exps; + } + else if (cat == tensor_category::FFN_GATE_SHEXP) { + ++qs.n_ffn_gate_shexp; + } + else if (cat == tensor_category::FFN_DOWN_EXPS) { + ++qs.n_ffn_down_exps; + } + else if (cat == tensor_category::FFN_DOWN_SHEXP) { + ++qs.n_ffn_down_shexp; + } + else if 
(cat == tensor_category::FFN_UP_EXPS) { + ++qs.n_ffn_up_exps; + } + else if (cat == tensor_category::FFN_UP_SHEXP) { + ++qs.n_ffn_up_shexp; + } + else if (cat == tensor_category::SSM_OUT) { + ++qs.n_ssm_out; + } + else if (cat == tensor_category::ATTENTION_Q) { + ++qs.n_attn_q; + } + if (cat == tensor_category::OUTPUT) { + qs.has_tied_embeddings = false; + } + metadata[i].category = cat; // save and re-use the category while we're at it + } + // these also need to be set to n_layer by default + qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer; + } + // flag for --dry-run bool will_require_imatrix = false; @@ -1363,38 +1403,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (category_is_attn_v(metadata[i].category)) { ++qs.n_attention_wv; - } - else if (metadata[i].category == tensor_category::ATTENTION_QKV) { - ++qs.n_attn_qkv; - } - else if (metadata[i].category == tensor_category::FFN_GATE_EXPS) { - ++qs.n_ffn_gate_exps; - } - else if (metadata[i].category == tensor_category::FFN_GATE_SHEXP) { - ++qs.n_ffn_gate_shexp; - } - else if (metadata[i].category == tensor_category::FFN_DOWN_EXPS) { - ++qs.n_ffn_down_exps; - } - else if (metadata[i].category == tensor_category::FFN_DOWN_SHEXP) { - ++qs.n_ffn_down_shexp; - } - else if (metadata[i].category == tensor_category::FFN_UP_EXPS) { - ++qs.n_ffn_up_exps; - } - else if (metadata[i].category == tensor_category::FFN_UP_SHEXP) { - ++qs.n_ffn_up_shexp; - } - else if (metadata[i].category == tensor_category::SSM_OUT) { - ++qs.n_ssm_out; - } - else if (metadata[i].category == tensor_category::ATTENTION_Q) { - ++qs.n_attn_q; - } - - if (tensor_name_match_output_weight(name.c_str())) { - qs.has_tied_embeddings = false; - } uint16_t i_split = params->keep_split ? 
it->idx : 0; if (!ctx_outs[i_split]) { From 0af41bf152ee6ddb6e44bd39a3288cce7162f916 Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Wed, 11 Mar 2026 13:36:32 -0400 Subject: [PATCH 13/13] Clean up another merge conflict --- src/llama-quant.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3a6feb1388..872f60059e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1399,11 +1399,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const struct ggml_tensor * tensor = it->tensor; const std::string name = ggml_get_name(tensor); - metadata[i].category = tensor_get_category(name); - - if (category_is_attn_v(metadata[i].category)) { - ++qs.n_attention_wv; - uint16_t i_split = params->keep_split ? it->idx : 0; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty());