From cb496c4e8a63a4ebe6362f587099a89488631048 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Mon, 31 Mar 2025 13:21:07 -0400
Subject: [PATCH 01/13] Update llama-quant.cpp llama_tensor_get_type with
 DeepSeek-friendly modifications

---
 src/llama-quant.cpp | 68 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 0b23eaef3a..6d67448fda 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -259,9 +259,39 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
-        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down") != std::string::npos) {
+            if (qs.i_ffn_down < qs.n_ffn_down/16) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (qs.i_ffn_down < qs.n_ffn_down/8) {
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
+            ++qs.i_ffn_down;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate") != std::string::npos) {           
+            if (qs.i_ffn_gate < qs.n_ffn_gate/16) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (qs.i_ffn_gate < qs.n_ffn_gate/8 || qs.i_ffn_gate >= 7*qs.n_ffn_gate/8) {
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
+            ++qs.i_ffn_gate;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up") != std::string::npos) {     
+            if (qs.i_ffn_up < qs.n_ffn_up/16) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (qs.i_ffn_up < qs.n_ffn_up/8) {
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
+            ++qs.i_ffn_up;
+        }
         else if (name.find("ffn_down") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/8) {
                 new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
@@ -269,7 +299,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs.model.hparams.n_expert >= 8) {
                 new_type = GGML_TYPE_Q5_K;
             } else {
                 if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
@@ -380,7 +410,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs.model.hparams.n_expert >= 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL  ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
@@ -420,6 +450,38 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_up;
+    } else if (name.find("attn_kv_a_mqa") != std::string::npos) {
+        if (qs.model.hparams.n_expert >= 8) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+    } else if (name.find("attn_kv_b.weight") != std::string::npos) {
+        if (qs.model.hparams.n_expert >= 8) {
+            new_type = GGML_TYPE_Q4_K;
+            if (qs.i_attention_wv < qs.n_attention_wv/16) {
+                new_type = GGML_TYPE_Q8_0;
+            } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+        }
+        ++qs.i_attention_wv;
+    } else if (name.find("attn_q_b.weight") != std::string::npos) {
+        if (qs.model.hparams.n_expert >= 8) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+        }
+    } else if (name.find("attn_q_a.weight") != std::string::npos) {
+        if (qs.model.hparams.n_expert >= 8) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+        }
     }
 
     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;

From 3f8d7a258286fdeeb4a9b4851fbc8a737d237023 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Tue, 1 Apr 2025 19:01:40 -0400
Subject: [PATCH 02/13] Claw back a few of the changes for a less dramatic
 file size increase

---
 src/llama-quant.cpp | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 6d67448fda..7f854d0cb0 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -259,12 +259,27 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
-        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) {
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
-        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) {
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) {
+            if (qs.i_attention_wv < qs.n_attention_wv/16) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
+            ++qs.i_attention_wv;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) {
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+        }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/16) {
                 new_type = GGML_TYPE_Q4_K;

From 105261d2baef50ea65c48b49ba43d692b1d19ed9 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Wed, 2 Apr 2025 22:32:59 -0400
Subject: [PATCH 03/13] Few more changes and tweaks

---
 ggml/src/ggml-common.h |   2 +-
 src/llama-quant.cpp    | 157 +++++++++++++++++++++++++++++------------
 2 files changed, 114 insertions(+), 45 deletions(-)

diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 93ab7ea446..5e61c44f34 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -378,8 +378,8 @@ typedef struct {
 } block_iq3_xxs;
 static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
 
-// 3.4375 bpw
 #define IQ3S_N_SCALE QK_K/64
+// 3.4375 bpw
 typedef struct {
     ggml_half d;
     uint8_t qs[QK_K/4];
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7f854d0cb0..3e56b2b86c 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -84,10 +84,22 @@ struct quantize_state_impl {
     int n_ffn_down     = 0;
     int n_ffn_gate     = 0;
     int n_ffn_up       = 0;
+    int n_ffn_down_exp = 0;
+    int n_ffn_gate_exp = 0;
+    int n_ffn_up_exp   = 0;
+    int n_ffn_down_shexp = 0;
+    int n_ffn_gate_shexp = 0;
+    int n_ffn_up_shexp   = 0;
     int i_attention_wv = 0;
     int i_ffn_down     = 0;
     int i_ffn_gate     = 0;
     int i_ffn_up       = 0;
+    int i_ffn_down_exp = 0;
+    int i_ffn_gate_exp = 0;
+    int i_ffn_up_exp   = 0;
+    int i_ffn_down_shexp = 0;
+    int i_ffn_gate_shexp = 0;
+    int i_ffn_up_shexp   = 0;
 
     int n_k_quantized = 0;
     int n_fallback    = 0;
@@ -175,6 +187,23 @@ static void llama_tensor_dequantize_impl(
     workers.clear();
 }
 
+// Check if ftype is specifically IQ2_S or IQ2_M
+static inline bool is_iq2s_or_iq2m(llama_ftype ftype) {
+    return ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M;
+}
+
+// Check if ftype belongs to the IQ1 group
+static inline bool is_iq1_group(llama_ftype ftype) {
+    return ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M;
+}
+
+// Returns the appropriate type for expert _exps tensors based on ftype
+static inline ggml_type get_expert_exps_type(llama_ftype ftype) {
+    if (is_iq1_group(ftype))         return GGML_TYPE_IQ2_XXS;
+    if (is_iq2s_or_iq2m(ftype))      return GGML_TYPE_IQ3_XXS;
+    /* otherwise */                  return GGML_TYPE_IQ2_XS;
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
@@ -242,7 +271,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                 new_type = GGML_TYPE_Q2_K;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+            else if (is_iq2s_or_iq2m(ftype)) {
                 new_type = GGML_TYPE_IQ3_S;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
@@ -256,7 +285,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M    || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            else new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
@@ -266,11 +295,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q4_K;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) {
-            if (qs.i_attention_wv < qs.n_attention_wv/16) {
+            if (qs.i_attention_wv < qs.n_attention_wv/8) {
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_attention_wv;
         }
@@ -278,47 +307,83 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q4_K;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) {
-            new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
         }
-        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down") != std::string::npos) {
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down.weight") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/16) {
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
-        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate") != std::string::npos) {           
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate.weight") != std::string::npos) {           
             if (qs.i_ffn_gate < qs.n_ffn_gate/16) {
                 new_type = GGML_TYPE_Q4_K;
             }
-            else if (qs.i_ffn_gate < qs.n_ffn_gate/8 || qs.i_ffn_gate >= 7*qs.n_ffn_gate/8) {
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            else if (qs.i_ffn_gate < qs.n_ffn_gate/8) {
+                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_gate;
         }
-        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up") != std::string::npos) {     
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up.weight") != std::string::npos) {     
             if (qs.i_ffn_up < qs.n_ffn_up/16) {
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (qs.i_ffn_up < qs.n_ffn_up/8) {
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_up;
         }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
+            if (qs.i_ffn_down_exp < qs.n_ffn_down_exp/8) {
+                new_type = get_expert_exps_type(ftype);
+            }
+            ++qs.i_ffn_down_exp;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_exps.weight") != std::string::npos) {
+            if (qs.i_ffn_gate_exp < qs.n_ffn_gate_exp/8) {
+                new_type = get_expert_exps_type(ftype);
+            }
+            ++qs.i_ffn_gate_exp;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_exps.weight") != std::string::npos) {
+            if (qs.i_ffn_up_exp < qs.n_ffn_up_exp/8) {
+                new_type = get_expert_exps_type(ftype);
+            }
+            ++qs.i_ffn_up_exp;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
+            if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            ++qs.i_ffn_down_shexp;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
+            if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            ++qs.i_ffn_gate_shexp;
+        }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
+            if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            ++qs.i_ffn_up_shexp;
+        }
         else if (name.find("ffn_down") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
             if (qs.model.hparams.n_expert >= 8) {
-                new_type = GGML_TYPE_Q5_K;
+                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+                if (is_iq1_group(ftype)) new_type = GGML_TYPE_IQ2_XXS;
+                else if (is_iq2s_or_iq2m(ftype)) new_type = GGML_TYPE_IQ3_S;
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -465,38 +530,28 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_up;
-    } else if (name.find("attn_kv_a_mqa") != std::string::npos) {
-        if (qs.model.hparams.n_expert >= 8) {
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q8_0;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q4_K;
+        if (qs.i_attention_wv < qs.n_attention_wv/16) {
             new_type = GGML_TYPE_Q8_0;
+        } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
+            new_type = GGML_TYPE_Q6_K;
         }
-    } else if (name.find("attn_kv_b.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert >= 8) {
-            new_type = GGML_TYPE_Q4_K;
-            if (qs.i_attention_wv < qs.n_attention_wv/16) {
-                new_type = GGML_TYPE_Q8_0;
-            } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
-                new_type = GGML_TYPE_Q6_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
         ++qs.i_attention_wv;
-    } else if (name.find("attn_q_b.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert >= 8) {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-        }
-    } else if (name.find("attn_q_a.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert >= 8) {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-                new_type = GGML_TYPE_Q4_K;
-            }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    } else if (qs.model.hparams.n_expert >= 8 &&name.find("attn_q_b.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
     }
 
     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@@ -793,11 +848,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             ++qs.n_attention_wv;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
+        } else if (name.find("ffn_gate_exps.weight") != std::string::npos) {
+            ++qs.n_ffn_gate_exp;
+        } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {
+            ++qs.n_ffn_gate_shexp;
+        } else if (name.find("ffn_down_exps.weight") != std::string::npos) {
+            ++qs.n_ffn_down_exp;
+        } else if (name.find("ffn_down_shexp.weight") != std::string::npos) {
+            ++qs.n_ffn_down_shexp;
+        } else if (name.find("ffn_up_exps.weight") != std::string::npos) {
+            ++qs.n_ffn_up_exp;
+        } else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
+            ++qs.n_ffn_up_shexp;
         }
 
         is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
+    GGML_ASSERT(qs.n_ffn_down_exp != 0);
+
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers

From 502812b2b16c0f8388e0741424f23f45426eeda6 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Wed, 2 Apr 2025 22:52:13 -0400
Subject: [PATCH 04/13] Remove debug assert

---
 src/llama-quant.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 3e56b2b86c..e108a82f37 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -865,8 +865,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
-    GGML_ASSERT(qs.n_ffn_down_exp != 0);
-
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers

From a5c7f9e749230aecfb1ea455cb2d4c6d3155d7c9 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Wed, 2 Apr 2025 23:02:06 -0400
Subject: [PATCH 05/13] Remove trailing whitespace

---
 src/llama-quant.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index e108a82f37..aa5f64e250 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -318,7 +318,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             }
             ++qs.i_ffn_down;
         }
-        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate.weight") != std::string::npos) {           
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate.weight") != std::string::npos) {
             if (qs.i_ffn_gate < qs.n_ffn_gate/16) {
                 new_type = GGML_TYPE_Q4_K;
             }
@@ -327,7 +327,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             }
             ++qs.i_ffn_gate;
         }
-        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up.weight") != std::string::npos) {     
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up.weight") != std::string::npos) {
             if (qs.i_ffn_up < qs.n_ffn_up/16) {
                 new_type = GGML_TYPE_Q4_K;
             }

From 7889d1b81b80b0b98d6458a1fb6b6dca0b61c01e Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Thu, 3 Apr 2025 19:45:46 -0400
Subject: [PATCH 06/13] A bit more weight to shared experts for larger sizes

---
 src/llama-quant.cpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index aa5f64e250..1b2e7632cf 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -443,6 +443,30 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_down_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_gate_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        ++qs.i_ffn_up_shexp;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;

From f609de5a2c36387aacad7279a2bb48506332564f Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Thu, 8 May 2025 13:58:11 -0400
Subject: [PATCH 07/13] Update some of the weightings, remove some complication

---
 src/llama-quant.cpp | 175 +++++++++++++++++++++++++++++---------------
 1 file changed, 114 insertions(+), 61 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 1b2e7632cf..344796814a 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -84,9 +84,9 @@ struct quantize_state_impl {
     int n_ffn_down     = 0;
     int n_ffn_gate     = 0;
     int n_ffn_up       = 0;
-    int n_ffn_down_exp = 0;
-    int n_ffn_gate_exp = 0;
-    int n_ffn_up_exp   = 0;
+    int n_ffn_down_exps = 0;
+    int n_ffn_gate_exps = 0;
+    int n_ffn_up_exps   = 0;
     int n_ffn_down_shexp = 0;
     int n_ffn_gate_shexp = 0;
     int n_ffn_up_shexp   = 0;
@@ -94,9 +94,9 @@ struct quantize_state_impl {
     int i_ffn_down     = 0;
     int i_ffn_gate     = 0;
     int i_ffn_up       = 0;
-    int i_ffn_down_exp = 0;
-    int i_ffn_gate_exp = 0;
-    int i_ffn_up_exp   = 0;
+    int i_ffn_down_exps = 0;
+    int i_ffn_gate_exps = 0;
+    int i_ffn_up_exps   = 0;
     int i_ffn_down_shexp = 0;
     int i_ffn_gate_shexp = 0;
     int i_ffn_up_shexp   = 0;
@@ -187,21 +187,54 @@ static void llama_tensor_dequantize_impl(
     workers.clear();
 }
 
-// Check if ftype is specifically IQ2_S or IQ2_M
-static inline bool is_iq2s_or_iq2m(llama_ftype ftype) {
-    return ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M;
-}
-
-// Check if ftype belongs to the IQ1 group
-static inline bool is_iq1_group(llama_ftype ftype) {
-    return ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M;
-}
 
 // Returns the appropriate type for expert _exps tensors based on ftype
-static inline ggml_type get_expert_exps_type(llama_ftype ftype) {
-    if (is_iq1_group(ftype))         return GGML_TYPE_IQ2_XXS;
-    if (is_iq2s_or_iq2m(ftype))      return GGML_TYPE_IQ3_XXS;
-    /* otherwise */                  return GGML_TYPE_IQ2_XS;
+static inline ggml_type get_exps_type_low_bpw_bump(llama_ftype ftype, ggml_type new_type) {
+    if      (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_M;
+    return new_type;
+}
+
+static inline ggml_type get_exps_type_low_bpw_squash(llama_ftype ftype, ggml_type new_type) {
+    if      (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ2_XS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ1_M;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ1_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_S;
+    return new_type;
+}
+
+static inline ggml_type get_exps_type_high_bpw_bump(llama_ftype ftype, ggml_type new_type, bool has_imatrix) {
+    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q5_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q6_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    // Bump I-quants
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q4_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !has_imatrix) new_type = GGML_TYPE_Q5_K;
+
+    return new_type;
+}
+
+static inline ggml_type get_exps_type_high_bpw_squash(llama_ftype ftype, ggml_type new_type, bool has_imatrix) {
+    // Squash K-quants
+    if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q2_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q3_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    // Squash I-quants
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+        new_type = has_imatrix ? GGML_TYPE_IQ2_S : GGML_TYPE_Q2_K;
+    }
+    return new_type;
 }
 
 static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
@@ -271,7 +304,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                 new_type = GGML_TYPE_Q2_K;
             }
-            else if (is_iq2s_or_iq2m(ftype)) {
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
                 new_type = GGML_TYPE_IQ3_S;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
@@ -285,7 +318,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M    || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            else new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
@@ -299,7 +332,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_attention_wv;
         }
@@ -307,14 +340,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q4_K;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_b.weight") != std::string::npos) {
-            new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down.weight") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/16) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             else if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
@@ -323,7 +356,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (qs.i_ffn_gate < qs.n_ffn_gate/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_gate;
         }
@@ -332,58 +365,64 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q4_K;
             }
             else if (qs.i_ffn_up < qs.n_ffn_up/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_up;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_down_exp < qs.n_ffn_down_exp/8) {
-                new_type = get_expert_exps_type(ftype);
+            if (qs.i_ffn_down_exps < qs.n_ffn_down_exps/8 || qs.i_ffn_down_exps > 7*qs.n_ffn_down_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
             }
-            ++qs.i_ffn_down_exp;
+            ++qs.i_ffn_down_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_gate_exp < qs.n_ffn_gate_exp/8) {
-                new_type = get_expert_exps_type(ftype);
+            if (qs.i_ffn_gate_exps < qs.n_ffn_gate_exps/8 || qs.i_ffn_gate_exps > 7*qs.n_ffn_gate_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
             }
-            ++qs.i_ffn_gate_exp;
+            ++qs.i_ffn_gate_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_up_exp < qs.n_ffn_up_exp/8) {
-                new_type = get_expert_exps_type(ftype);
+            if (qs.i_ffn_up_exps < qs.n_ffn_up_exps/8 || qs.i_ffn_up_exps > 7*qs.n_ffn_up_exps/8) {
+                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            } else {
+                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
             }
-            ++qs.i_ffn_up_exp;
+            ++qs.i_ffn_up_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             ++qs.i_ffn_down_shexp;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             ++qs.i_ffn_gate_shexp;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
-                new_type = GGML_TYPE_Q4_K;
+                new_type = GGML_TYPE_Q6_K;
             }
             ++qs.i_ffn_up_shexp;
         }
         else if (name.find("ffn_down") != std::string::npos) {
             if (qs.i_ffn_down < qs.n_ffn_down/8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
             if (qs.model.hparams.n_expert >= 8) {
-                new_type = is_iq2s_or_iq2m(ftype) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+                new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (is_iq1_group(ftype)) new_type = GGML_TYPE_IQ2_XXS;
-                else if (is_iq2s_or_iq2m(ftype)) new_type = GGML_TYPE_IQ3_S;
+                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
             }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -425,7 +464,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
         ++qs.i_attention_wv;
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -445,28 +484,46 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
+        //if (qs.i_ffn_down_shexp < qs.n_ffn_down_shexp/8 || qs.i_ffn_down_shexp > 7*qs.n_ffn_down_shexp/8) {
         if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
         ++qs.i_ffn_down_shexp;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
+        //if (qs.i_ffn_gate_shexp < qs.n_ffn_gate_shexp/8 || qs.i_ffn_gate_shexp > 7*qs.n_ffn_gate_shexp/8) {
         if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
         ++qs.i_ffn_gate_shexp;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
+        //if (qs.i_ffn_up_shexp < qs.n_ffn_up_shexp/8 || qs.i_ffn_up_shexp > 7*qs.n_ffn_up_shexp/8) {
         if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
         ++qs.i_ffn_up_shexp;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
+        if (use_more_bits(qs.i_ffn_down_exps, qs.n_ffn_down_exps)) {
+            if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) && qs.has_imatrix) {
+                // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+                // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+                // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+            } else {
+                new_type = get_exps_type_high_bpw_bump(ftype, new_type, qs.has_imatrix);
+            }
+        }
+        ++qs.i_ffn_down_exps;
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
@@ -556,21 +613,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         ++qs.i_ffn_up;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q8_0;
-    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_b.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q4_K;
-        if (qs.i_attention_wv < qs.n_attention_wv/16) {
-            new_type = GGML_TYPE_Q8_0;
-        } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
-            new_type = GGML_TYPE_Q6_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-        ++qs.i_attention_wv;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k_b.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_v_b.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
     } else if (qs.model.hparams.n_expert >= 8 &&name.find("attn_q_b.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K;
+        new_type = GGML_TYPE_Q4_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q5_K;
@@ -873,15 +926,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         } else if (name.find("ffn_gate_exps.weight") != std::string::npos) {
-            ++qs.n_ffn_gate_exp;
+            ++qs.n_ffn_gate_exps;
         } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_gate_shexp;
         } else if (name.find("ffn_down_exps.weight") != std::string::npos) {
-            ++qs.n_ffn_down_exp;
+            ++qs.n_ffn_down_exps;
         } else if (name.find("ffn_down_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_down_shexp;
         } else if (name.find("ffn_up_exps.weight") != std::string::npos) {
-            ++qs.n_ffn_up_exp;
+            ++qs.n_ffn_up_exps;
         } else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_up_shexp;
         }

From a312ac950d214036cfda549b8b96cb92fb776b5c Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Mon, 15 Dec 2025 12:38:26 -0500
Subject: [PATCH 08/13] Update to latest changes

---
 src/llama-quant.cpp | 31 +------------------------------
 1 file changed, 1 insertion(+), 30 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 344796814a..a0d9265e96 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -805,7 +805,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
@@ -875,7 +875,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@@ -883,11 +882,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                    pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -912,7 +906,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -938,32 +931,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_up_shexp;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
-        // attention layers have a non-zero number of kv heads
-        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_attn is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_attn += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 

From d28595e5475f368074e1e8434f67f321858444c2 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Sun, 1 Mar 2026 18:50:44 -0500
Subject: [PATCH 09/13] Update recipe

---
 src/llama-quant.cpp | 150 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 116 insertions(+), 34 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 7a64c3dec0..a2c6745480 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -187,11 +187,10 @@ static void llama_tensor_dequantize_impl(
     workers.clear();
 }
 
-
 // Returns the appropriate type for expert _exps tensors based on ftype
 static inline ggml_type get_exps_type_low_bpw_bump(llama_ftype ftype, ggml_type new_type) {
-    if      (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
+    if      (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ3_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
@@ -237,6 +236,30 @@ static inline ggml_type get_exps_type_high_bpw_squash(llama_ftype ftype, ggml_ty
     return new_type;
 }
 
+static inline ggml_type get_ffn_shexp_ggml_type(llama_ftype ftype, ggml_type new_type) { // returns the type mapped to ftype below; any ftype without an entry returns new_type as passed in
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+        new_type = GGML_TYPE_IQ4_XS; // all four IQ3 variants share one target
+    }
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+        new_type = GGML_TYPE_Q4_K;
+    }
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) new_type = GGML_TYPE_Q5_0; // legacy quants: Q4_0 -> Q5_0, Q4_1 -> Q5_1
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_1) new_type = GGML_TYPE_Q5_1;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ||
+             ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+        new_type = GGML_TYPE_Q5_K;
+    }
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
+        new_type = GGML_TYPE_Q6_K;
+    }
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K   ||
+        ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+        new_type = GGML_TYPE_Q8_0; // Q5_K_M, Q6_K and Q8_0 all map to Q8_0
+    }
+    return new_type;
+}
+
 static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
@@ -288,6 +311,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q6_K;
             }
         }
+    } else if (name.find("ssm_ba.weight")    != std::string::npos ||
+               name.find("ssm_alpha.weight") != std::string::npos ||
+               name.find("ssm_beta.weight")  != std::string::npos) {
+        new_type = GGML_TYPE_Q8_0;
     } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
         // MoE   tensors -> MXFP4
         // other tensors -> Q8_0
@@ -321,6 +348,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             else new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("ssm_out.weight") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
@@ -425,6 +455,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
             }
         }
+    } else if (name.find("ssm_out.weight") != std::string::npos) {
+        new_type = GGML_TYPE_Q8_0;
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -433,7 +465,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            new_type = GGML_TYPE_IQ3_S;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q4_K;
@@ -451,13 +483,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q8_0;
         if (qs.model.type == LLM_TYPE_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -483,34 +516,29 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_IQ2_S;
         }
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q5_K;
-        //if (qs.i_ffn_down_shexp < qs.n_ffn_down_shexp/8 || qs.i_ffn_down_shexp > 7*qs.n_ffn_down_shexp/8) {
         if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        else {
+            new_type = get_ffn_shexp_ggml_type(ftype, new_type);
+        }
         ++qs.i_ffn_down_shexp;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q5_K;
         //if (qs.i_ffn_gate_shexp < qs.n_ffn_gate_shexp/8 || qs.i_ffn_gate_shexp > 7*qs.n_ffn_gate_shexp/8) {
         if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        else {
+            new_type = get_ffn_shexp_ggml_type(ftype, new_type);
+        }
         ++qs.i_ffn_gate_shexp;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q5_K;
-        //if (qs.i_ffn_up_shexp < qs.n_ffn_up_shexp/8 || qs.i_ffn_up_shexp > 7*qs.n_ffn_up_shexp/8) {
         if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        else {
+            new_type = get_ffn_shexp_ggml_type(ftype, new_type);
+        }
         ++qs.i_ffn_up_shexp;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
         if (use_more_bits(qs.i_ffn_down_exps, qs.n_ffn_down_exps)) {
@@ -572,11 +600,12 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert >= 8) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL  ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
-                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
-                    new_type = GGML_TYPE_Q5_K;
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S   ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS  || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M  ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
+                    new_type = GGML_TYPE_Q6_K;
                 }
             } else {
                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
@@ -584,6 +613,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M ) new_type = GGML_TYPE_Q6_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S ) new_type = GGML_TYPE_Q5_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M ) new_type = GGML_TYPE_Q8_0;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S ) new_type = GGML_TYPE_Q6_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K   ) new_type = GGML_TYPE_Q8_0;
             }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
@@ -595,6 +629,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K  ) new_type = GGML_TYPE_Q8_0;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
@@ -614,21 +649,68 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
         new_type = GGML_TYPE_Q8_0;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k_b.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q5_K;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+            ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+            ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){
+            new_type = GGML_TYPE_IQ4_XS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K) {
+            new_type = GGML_TYPE_Q8_0;
+        }
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_v_b.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q5_K;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+            ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+            ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){
+            new_type = GGML_TYPE_IQ4_XS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K) {
+            new_type = GGML_TYPE_Q8_0;
+        }
     } else if (qs.model.hparams.n_expert >= 8 &&name.find("attn_q_b.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q4_K;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q5_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+            ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+            ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){
+            new_type = GGML_TYPE_IQ4_XS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
+            new_type = GGML_TYPE_Q5_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K)   new_type = GGML_TYPE_Q8_0;
     } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q_a.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q5_K;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+            ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+            ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){
+            new_type = GGML_TYPE_IQ4_XS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
+            new_type = GGML_TYPE_Q6_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K) {
+            new_type = GGML_TYPE_Q8_0;
+        }
     }
 
     return new_type;

From fc0a02df272469da253e227fadefe56864e3c3d1 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Mon, 9 Mar 2026 10:52:14 -0400
Subject: [PATCH 10/13] Changes for attn_q, ssm_out, other tweaks

---
 src/llama-quant.cpp | 182 +++++++++++++++++++++++++-------------------
 1 file changed, 102 insertions(+), 80 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index a2c6745480..c2ab1b80e8 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -80,26 +80,30 @@ struct quantize_state_impl {
     const llama_model                 & model;
     const llama_model_quantize_params * params;
 
-    int n_attention_wv = 0;
-    int n_ffn_down     = 0;
-    int n_ffn_gate     = 0;
-    int n_ffn_up       = 0;
-    int n_ffn_down_exps = 0;
-    int n_ffn_gate_exps = 0;
-    int n_ffn_up_exps   = 0;
+    int n_attention_wv   = 0;
+    int n_ffn_down       = 0;
+    int n_ffn_gate       = 0;
+    int n_ffn_up         = 0;
+    int n_ffn_down_exps  = 0;
+    int n_ffn_gate_exps  = 0;
+    int n_ffn_up_exps    = 0;
     int n_ffn_down_shexp = 0;
     int n_ffn_gate_shexp = 0;
     int n_ffn_up_shexp   = 0;
-    int i_attention_wv = 0;
-    int i_ffn_down     = 0;
-    int i_ffn_gate     = 0;
-    int i_ffn_up       = 0;
-    int i_ffn_down_exps = 0;
-    int i_ffn_gate_exps = 0;
-    int i_ffn_up_exps   = 0;
+    int n_ssm_out        = 0;
+    int n_attn_q         = 0;
+    int i_attention_wv   = 0;
+    int i_ffn_down       = 0;
+    int i_ffn_gate       = 0;
+    int i_ffn_up         = 0;
+    int i_ffn_down_exps  = 0;
+    int i_ffn_gate_exps  = 0;
+    int i_ffn_up_exps    = 0;
     int i_ffn_down_shexp = 0;
     int i_ffn_gate_shexp = 0;
     int i_ffn_up_shexp   = 0;
+    int i_ssm_out        = 0;
+    int i_attn_q         = 0;
 
     int n_k_quantized = 0;
     int n_fallback    = 0;
@@ -209,39 +213,25 @@ static inline ggml_type get_exps_type_low_bpw_squash(llama_ftype ftype, ggml_typ
 }
 
 static inline ggml_type get_exps_type_high_bpw_bump(llama_ftype ftype, ggml_type new_type, bool has_imatrix) {
-    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K)   new_type = GGML_TYPE_Q3_K;
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q5_K;
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q6_K;
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K)   new_type = GGML_TYPE_Q8_0;
     // Bump I-quants
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q4_K;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
-    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !has_imatrix) new_type = GGML_TYPE_Q5_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q5_K;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q4_K;
+    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !has_imatrix) new_type = GGML_TYPE_Q6_K;
 
     return new_type;
 }
 
-static inline ggml_type get_exps_type_high_bpw_squash(llama_ftype ftype, ggml_type new_type, bool has_imatrix) {
-    // Squash K-quants
-    if      (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q2_K;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) new_type = GGML_TYPE_Q3_K;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-    // Squash I-quants
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ3_XXS;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-        new_type = has_imatrix ? GGML_TYPE_IQ2_S : GGML_TYPE_Q2_K;
-    }
-    return new_type;
-}
-
 static inline ggml_type get_ffn_shexp_ggml_type(llama_ftype ftype, ggml_type new_type) {
-    if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
-        ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-        new_type = GGML_TYPE_IQ4_XS;
-    }
-    else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+    if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S   ||
+        ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+        ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
         new_type = GGML_TYPE_Q4_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0) new_type = GGML_TYPE_Q5_0;
@@ -314,7 +304,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     } else if (name.find("ssm_ba.weight")    != std::string::npos ||
                name.find("ssm_alpha.weight") != std::string::npos ||
                name.find("ssm_beta.weight")  != std::string::npos) {
-        new_type = GGML_TYPE_Q8_0;
+        new_type = GGML_TYPE_F32;
     } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
         // MoE   tensors -> MXFP4
         // other tensors -> Q8_0
@@ -329,13 +319,13 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
-                new_type = GGML_TYPE_Q2_K;
+                new_type = GGML_TYPE_Q4_K;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-                new_type = GGML_TYPE_IQ3_S;
+                new_type = GGML_TYPE_Q4_K;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-                new_type = GGML_TYPE_IQ3_S;
+                new_type = GGML_TYPE_Q4_K;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
                 new_type = GGML_TYPE_Q4_K;
@@ -343,17 +333,27 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M    || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
-        if (name.find("attn_v.weight") != std::string::npos) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else new_type = (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
+            if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
+                new_type = GGML_TYPE_Q6_K;
+            }
+            else {
+                new_type = GGML_TYPE_Q4_K;
+            }
             ++qs.i_attention_wv;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ssm_out.weight") != std::string::npos) {
-            new_type = GGML_TYPE_Q4_K;
+            if (use_more_bits(qs.i_ssm_out, qs.n_ssm_out)) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            ++qs.i_ssm_out;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
+        else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q.weight") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (qs.model.hparams.n_expert >= 8 && name.find("attn_kv_a_mqa.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
@@ -402,42 +402,47 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_exps.weight") != std::string::npos) {
             if (qs.i_ffn_down_exps < qs.n_ffn_down_exps/8 || qs.i_ffn_down_exps > 7*qs.n_ffn_down_exps/8) {
                 new_type = get_exps_type_low_bpw_bump(ftype, new_type);
-            } else {
-                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
             }
             ++qs.i_ffn_down_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_gate_exps < qs.n_ffn_gate_exps/8 || qs.i_ffn_gate_exps > 7*qs.n_ffn_gate_exps/8) {
-                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
-            } else {
-                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
-            }
+            // if (qs.i_ffn_gate_exps < qs.n_ffn_gate_exps/8 || qs.i_ffn_gate_exps > 7*qs.n_ffn_gate_exps/8) {
+            //     new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            // } else {
+            //     new_type = get_exps_type_low_bpw_squash(ftype, new_type);
+            // }
             ++qs.i_ffn_gate_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_exps.weight") != std::string::npos) {
-            if (qs.i_ffn_up_exps < qs.n_ffn_up_exps/8 || qs.i_ffn_up_exps > 7*qs.n_ffn_up_exps/8) {
-                new_type = get_exps_type_low_bpw_bump(ftype, new_type);
-            } else {
-                new_type = get_exps_type_low_bpw_squash(ftype, new_type);
-            }
+            // if (use_more_bits(qs.i_ffn_up_exps, qs.n_ffn_up_exps)) {
+            //     new_type = get_exps_type_low_bpw_bump(ftype, new_type);
+            // }
             ++qs.i_ffn_up_exps;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
-                new_type = GGML_TYPE_Q6_K;
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else {
+                new_type = GGML_TYPE_Q4_K;
             }
             ++qs.i_ffn_down_shexp;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) {
-                new_type = GGML_TYPE_Q6_K;
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else {
+                new_type = GGML_TYPE_Q4_K;
             }
             ++qs.i_ffn_gate_shexp;
         }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) {
-                new_type = GGML_TYPE_Q6_K;
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else {
+                new_type = GGML_TYPE_Q4_K;
             }
             ++qs.i_ffn_up_shexp;
         }
@@ -456,33 +461,37 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             }
         }
     } else if (name.find("ssm_out.weight") != std::string::npos) {
-        new_type = GGML_TYPE_Q8_0;
+        if (use_more_bits(qs.i_ssm_out, qs.n_ssm_out)) {
+            new_type = GGML_TYPE_Q8_0;
+        }
+        ++qs.i_ssm_out;
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
-            new_type = GGML_TYPE_Q4_K;
-        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ3_S;
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = GGML_TYPE_Q4_K;
+            new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
+                new_type = GGML_TYPE_Q5_K;
+            } else {
+                new_type = GGML_TYPE_Q4_K;
+            }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
-            new_type = GGML_TYPE_Q5_K;
+            new_type = GGML_TYPE_Q6_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q8_0;
         if (qs.model.type == LLM_TYPE_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
@@ -496,6 +505,16 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q8_0;
         }
         ++qs.i_attention_wv;
+    } else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q.weight") != std::string::npos) {
+        if (use_more_bits(qs.i_attn_q, qs.n_attn_q)) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K || ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else {
+                new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        ++qs.i_attn_q;
     } else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
@@ -508,13 +527,6 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
-    } else if (name.find("attn_q.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            new_type = GGML_TYPE_IQ3_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ2_S;
-        }
     } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) {
         if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) {
             new_type = GGML_TYPE_Q8_0;
@@ -610,6 +622,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             } else {
                 if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS ) new_type = GGML_TYPE_IQ3_S;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
@@ -624,12 +637,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q6_K  ) new_type = GGML_TYPE_Q8_0;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || 
+                 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ||
+                 ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q6_K  ) new_type = GGML_TYPE_Q8_0;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
@@ -972,6 +990,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             ++qs.n_ffn_up_exps;
         } else if (name.find("ffn_up_shexp.weight") != std::string::npos) {
             ++qs.n_ffn_up_shexp;
+        } else if (name.find("ssm_out.weight") != std::string::npos) {
+            ++qs.n_ssm_out;
+        } else if (name.find("attn_q.weight") != std::string::npos) {
+            ++qs.n_attn_q;
         }
     }
 

From bf34e75799ec65bcaeeb452bdb875ccd49dc65cc Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Mon, 9 Mar 2026 10:53:56 -0400
Subject: [PATCH 11/13] Add specific attn_qkv logic

---
 src/llama-quant.cpp | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index c2ab1b80e8..3a07bbf001 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -81,6 +81,7 @@ struct quantize_state_impl {
     const llama_model_quantize_params * params;
 
     int n_attention_wv   = 0;
+    int n_attn_qkv       = 0;
     int n_ffn_down       = 0;
     int n_ffn_gate       = 0;
     int n_ffn_up         = 0;
@@ -92,6 +93,7 @@ struct quantize_state_impl {
     int n_ffn_up_shexp   = 0;
     int n_ssm_out        = 0;
     int n_attn_q         = 0;
+    int i_attn_qkv       = 0;
     int i_attention_wv   = 0;
     int i_ffn_down       = 0;
     int i_ffn_gate       = 0;
@@ -333,7 +335,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M    || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
-        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
+        if (name.find("attn_v.weight") != std::string::npos) {
             if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
                 new_type = GGML_TYPE_Q6_K;
             }
@@ -342,6 +344,20 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             }
             ++qs.i_attention_wv;
         }
+        else if (name.find("attn_qkv.weight") != std::string::npos) {
+            if (qs.model.hparams.n_expert >= 8) {
+                if (use_more_bits(qs.i_attn_qkv, qs.n_attn_qkv)) {
+                    new_type = GGML_TYPE_Q6_K;
+                }
+                else {
+                    new_type = GGML_TYPE_Q4_K;
+                }
+            }
+            else if (use_more_bits(qs.i_attn_qkv, qs.n_attn_qkv)) {
+                new_type = GGML_TYPE_Q4_K;
+            }
+            ++qs.i_attn_qkv;
+        }
         else if (qs.model.hparams.n_expert >= 8 && name.find("ssm_out.weight") != std::string::npos) {
             if (use_more_bits(qs.i_ssm_out, qs.n_ssm_out)) {
                 new_type = GGML_TYPE_Q4_K;
@@ -640,7 +656,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || 
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S) {
@@ -673,7 +689,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
@@ -705,7 +721,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
@@ -720,7 +736,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS){
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) {
@@ -973,11 +989,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         // TODO: avoid hardcoded tensor names - use the TN_* constants
         if (name.find("attn_v.weight")   != std::string::npos ||
-            name.find("attn_qkv.weight") != std::string::npos ||
             name.find("attn_kv_b.weight")!= std::string::npos) {
             ++qs.n_attention_wv;
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
+        } else if (name.find("attn_qkv.weight") != std::string::npos) {
+            ++qs.n_attn_qkv;
         } else if (name.find("ffn_gate_exps.weight") != std::string::npos) {
             ++qs.n_ffn_gate_exps;
         } else if (name.find("ffn_gate_shexp.weight") != std::string::npos) {

From 12a850132c54c480d72d30abde2c30dcf7bd2c2a Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Wed, 11 Mar 2026 13:36:06 -0400
Subject: [PATCH 12/13] Clean up a merge conflict

---
 src/llama-quant.cpp | 78 +++++++++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 35 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 9da2f88f4f..3a6feb1388 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1238,9 +1238,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     quantize_state_impl qs(model, params);
 
-    // these need to be set to n_layer by default
-    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
-
     if (params->only_copy) {
         ftype = ml.ftype;
     }
@@ -1347,6 +1344,49 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // compute tensor metadata once and cache it
     std::vector<tensor_metadata> metadata(tensors.size());
 
+    // initialize quantization state before preliminary loop (counters for use_more_bits)
+    {
+        for (size_t i = 0; i < tensors.size(); ++i) {
+            const auto cat = tensor_get_category(tensors[i]->tensor->name);
+            if (category_is_attn_v(cat)) {
+                ++qs.n_attention_wv;
+            }
+            else if (cat == tensor_category::ATTENTION_QKV) {
+                ++qs.n_attn_qkv;
+            }
+            else if (cat == tensor_category::FFN_GATE_EXPS) {
+                ++qs.n_ffn_gate_exps;
+            }
+            else if (cat == tensor_category::FFN_GATE_SHEXP) {
+                ++qs.n_ffn_gate_shexp;
+            }
+            else if (cat == tensor_category::FFN_DOWN_EXPS) {
+                ++qs.n_ffn_down_exps;
+            }
+            else if (cat == tensor_category::FFN_DOWN_SHEXP) {
+                ++qs.n_ffn_down_shexp;
+            }
+            else if (cat == tensor_category::FFN_UP_EXPS) {
+                ++qs.n_ffn_up_exps;
+            }
+            else if (cat == tensor_category::FFN_UP_SHEXP) {
+                ++qs.n_ffn_up_shexp;
+            }
+            else if (cat == tensor_category::SSM_OUT) {
+                ++qs.n_ssm_out;
+            }
+            else if (cat == tensor_category::ATTENTION_Q) {
+                ++qs.n_attn_q;
+            }
+            if (cat == tensor_category::OUTPUT) {
+                qs.has_tied_embeddings = false;
+            }
+            metadata[i].category = cat; // save and re-use the category while we're at it
+        }
+        // these also need to be set to n_layer by default
+        qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)qs.model.hparams.n_layer;
+    }
+
     // flag for --dry-run
     bool will_require_imatrix = false;
 
@@ -1363,38 +1403,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         if (category_is_attn_v(metadata[i].category)) {
             ++qs.n_attention_wv;
-        }
-        else if (metadata[i].category == tensor_category::ATTENTION_QKV) {
-            ++qs.n_attn_qkv;
-        }
-        else if (metadata[i].category == tensor_category::FFN_GATE_EXPS) {
-            ++qs.n_ffn_gate_exps;
-        }
-        else if (metadata[i].category == tensor_category::FFN_GATE_SHEXP) {
-            ++qs.n_ffn_gate_shexp;
-        }
-        else if (metadata[i].category == tensor_category::FFN_DOWN_EXPS) {
-            ++qs.n_ffn_down_exps;
-        }
-        else if (metadata[i].category == tensor_category::FFN_DOWN_SHEXP) {
-            ++qs.n_ffn_down_shexp;
-        }
-        else if (metadata[i].category == tensor_category::FFN_UP_EXPS) {
-            ++qs.n_ffn_up_exps;
-        }
-        else if (metadata[i].category == tensor_category::FFN_UP_SHEXP) {
-            ++qs.n_ffn_up_shexp;
-        }
-        else if (metadata[i].category == tensor_category::SSM_OUT) {
-            ++qs.n_ssm_out;
-        }
-        else if (metadata[i].category == tensor_category::ATTENTION_Q) {
-            ++qs.n_attn_q;
-        }
-
-        if (tensor_name_match_output_weight(name.c_str())) {
-            qs.has_tied_embeddings = false;
-        }
 
         uint16_t i_split = params->keep_split ? it->idx : 0;
         if (!ctx_outs[i_split]) {

From 0af41bf152ee6ddb6e44bd39a3288cce7162f916 Mon Sep 17 00:00:00 2001
From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com>
Date: Wed, 11 Mar 2026 13:36:32 -0400
Subject: [PATCH 13/13] Clean up another merge conflict

---
 src/llama-quant.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 3a6feb1388..872f60059e 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -1399,11 +1399,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         const struct ggml_tensor * tensor = it->tensor;
         const std::string name = ggml_get_name(tensor);
 
-        metadata[i].category = tensor_get_category(name);
-
-        if (category_is_attn_v(metadata[i].category)) {
-            ++qs.n_attention_wv;
-
         uint16_t i_split = params->keep_split ? it->idx : 0;
         if (!ctx_outs[i_split]) {
             ctx_outs[i_split].reset(gguf_init_empty());