From a26db356c9ce52ca8b770de47613cfaa3344bf99 Mon Sep 17 00:00:00 2001
From: ddh0
Date: Mon, 16 Feb 2026 13:43:22 -0600
Subject: [PATCH] refactor

---
 src/llama-quant.cpp | 164 +++++++++++++++++++++++---------------------
 1 file changed, 85 insertions(+), 79 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index afec667dc1..5cc538caf0 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <memory>
 #include

 // Quantization types. Changes to this struct must be replicated in quantize.cpp
@@ -19,6 +20,35 @@ struct tensor_quantization {
     ggml_type quant = GGML_TYPE_COUNT;
 };

+struct quantize_state_impl {
+    const llama_model & model;
+    const llama_model_quantize_params * params;
+
+    int n_attention_wv = 0;
+    int n_ffn_down = 0;
+    int n_ffn_gate = 0;
+    int n_ffn_up = 0;
+    int i_attention_wv = 0;
+    int i_ffn_down = 0;
+    int i_ffn_gate = 0;
+    int i_ffn_up = 0;
+
+    int n_fallback = 0;
+
+    bool has_imatrix = false;
+
+    // used to figure out if a model shares tok_embd with the output weight
+    bool has_output = false;
+
+    // if this flag is false, the code will skip updating this struct
+    bool do_count = false;
+
+    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
+        : model(model)
+        , params(params)
+    {}
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -76,32 +106,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map<
     return orig_name;
 }

-struct quantize_state_impl {
-    const llama_model & model;
-    const llama_model_quantize_params * params;
-
-    int n_attention_wv = 0;
-    int n_ffn_down = 0;
-    int n_ffn_gate = 0;
-    int n_ffn_up = 0;
-    int i_attention_wv = 0;
-    int i_ffn_down = 0;
-    int i_ffn_gate = 0;
-    int i_ffn_up = 0;
-
-    int n_fallback = 0;
-
-    bool has_imatrix = false;
-
-    // used to figure out if a model shares tok_embd with the output weight
-    bool has_output = false;
-
-    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
-        : model(model)
-        , params(params)
-    {}
-};
-
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector> & output, std::vector & workers,
     const size_t nelements, const int nthread
@@ -177,7 +181,7 @@ static void llama_tensor_dequantize_impl(
 // internal standard logic for selecting the target tensor type for a specific
 // quantization mixture & model architecture
 static ggml_type llama_tensor_get_type_impl(
-    quantize_state_impl & qs,
+    quantize_state_impl * qs,
     ggml_type new_type,
     const ggml_tensor * tensor,
     const llama_ftype ftype
@@ -185,13 +189,13 @@
     const std::string name = ggml_get_name(tensor);

     // TODO: avoid hardcoded tensor names - use the TN_* constants
-    const llm_arch arch = qs.model.arch;
+    const llm_arch arch = qs->model.arch;
     const auto tn = LLM_TN(arch);

     auto use_more_bits = [](int i_layer, int n_layers) -> bool {
         return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
     };
-    const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
+    const int n_expert = std::max(1, (int)qs->model.hparams.n_expert);
     auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
         if (n_expert > 1) {
             // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
@@ -210,9 +214,9 @@

     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
-        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->output_tensor_type;
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs->has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
+        if (qs->params->output_tensor_type < GGML_TYPE_COUNT) {
+            new_type = qs->params->output_tensor_type;
         } else {
             const int64_t nx = tensor->ne[0];
             const int64_t qk_k = ggml_blck_size(new_type);
@@ -241,8 +245,8 @@
             new_type = GGML_TYPE_Q8_0;
         }
     } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
-        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
-            new_type = qs.params->token_embedding_type;
+        if (qs->params->token_embedding_type < GGML_TYPE_COUNT) {
+            new_type = qs->params->token_embedding_type;
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -261,19 +265,19 @@
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
-            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+            if (qs->model.hparams.n_gqa() >= 4 || qs->model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
         }
-        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+        else if (qs->model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_ffn_down < qs.n_ffn_down/8) {
+            if (qs->i_ffn_down < qs->n_ffn_down/8) {
                 new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs->model.hparams.n_expert == 8) {
                 new_type = GGML_TYPE_Q5_K;
             } else {
                 if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
@@ -282,43 +286,43 @@
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs->model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs->has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs->model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = qs->i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs->model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        if (qs.model.type == LLM_TYPE_70B) {
+                use_more_bits(qs->i_attention_wv, qs->n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs->i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        if (qs->model.type == LLM_TYPE_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs->model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
     } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs->model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
@@ -337,13 +341,13 @@
             new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+        auto info = layer_info(qs->i_ffn_down, qs->n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs->has_imatrix) {
             new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -352,7 +356,7 @@
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
-                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+                    (qs->model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -366,7 +370,7 @@
                 if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs->has_imatrix) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
@@ -374,7 +378,7 @@
             new_type = GGML_TYPE_Q5_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
-                && qs.has_imatrix && i_layer < n_layer/8) {
+                && qs->has_imatrix && i_layer < n_layer/8) {
             // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
             // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
@@ -382,7 +386,7 @@
         }
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs->model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
@@ -408,14 +412,14 @@
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     } else if (name.find("ffn_gate") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
+        auto info = layer_info(qs->i_ffn_gate, qs->n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
     } else if (name.find("ffn_up") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
+        auto info = layer_info(qs->i_ffn_up, qs->n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
@@ -426,7 +430,7 @@

 // determine the ggml_type that this tensor should be quantized to
 static ggml_type llama_tensor_get_type(
-    quantize_state_impl & qs,
+    quantize_state_impl * qs,
     const llama_model_quantize_params * params,
     const ggml_tensor * tensor,
     const ggml_type default_type
@@ -492,7 +496,9 @@ static ggml_type llama_tensor_get_type(
             if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
                 new_type = GGML_TYPE_F16;
             }
-            ++qs.n_fallback;
+            if (qs->do_count) {
+                ++qs->n_fallback;
+            }
         }
     }
 }
@@ -708,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);

-    quantize_state_impl qs(model, params);
+    auto qs = std::make_unique<quantize_state_impl>(model, params);

     if (params->only_copy) {
         ftype = ml.ftype;
@@ -718,7 +724,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         imatrix_data = static_cast>*>(params->imatrix);
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
-            qs.has_imatrix = true;
+            qs->has_imatrix = true;
             // check imatrix for nans or infs
             for (const auto & kv : *imatrix_data) {
                 for (float f : kv.second) {
@@ -830,15 +836,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         const ggml_tensor * tensor = it->tensor;
         const std::string name = tensor->name;

-        // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos ||
-            name.find("attn_qkv.weight") != std::string::npos ||
-            name.find("attn_kv_b.weight")!= std::string::npos) {
-            ++qs.n_attention_wv;
-        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
-            qs.has_output = true;
-        }
-
         // populate the original tensors so we get an initial meta data
         uint16_t i_split = params->keep_split ? it->idx : 0;
         if (!ctx_outs[i_split]) {
@@ -867,9 +864,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
     }

-    qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
-    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
-
     // Set split info if needed
     if (n_split > 1) {
         for (size_t i = 0; i < ctx_outs.size(); ++i) {
@@ -918,6 +912,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     size_t total_size_org = 0;
     size_t total_size_new = 0;

+    qs->n_ffn_down = qs->n_ffn_gate = qs->n_ffn_up = (int)model.hparams.n_layer;
+    qs->do_count = true; // we start counting stats for the main loop
+
     // iterate over all weights (main loop)
     for (const auto * it : weights) {
         const auto & weight = *it;
@@ -931,6 +928,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         const std::string name = ggml_get_name(tensor);
         const size_t tensor_size = ggml_nbytes(tensor);

+        // TODO: avoid hardcoded tensor names - use the TN_* constants
+        if (name.find("attn_v.weight") != std::string::npos ||
+            name.find("attn_qkv.weight") != std::string::npos ||
+            name.find("attn_kv_b.weight")!= std::string::npos) {
+            ++qs->n_attention_wv;
+        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+            qs->has_output = true;
+        }
+
         if (!params->dry_run) {
             if (!ml.use_mmap) {
                 if (read_data.size() < tensor_size) {
@@ -962,13 +968,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::

             // count stats for this tensor based on its name
             if (name.find("attn_v.weight") != std::string::npos || name.find("attn_kv_b.weight") != std::string::npos) {
-                ++qs.i_attention_wv;
+                ++qs->i_attention_wv;
             } else if (name.find("ffn_down") != std::string::npos) {
-                ++qs.i_ffn_down;
+                ++qs->i_ffn_down;
             } else if (name.find("ffn_gate") != std::string::npos) {
-                ++qs.i_ffn_gate;
+                ++qs->i_ffn_gate;
             } else if (name.find("ffn_up") != std::string::npos) {
-                ++qs.i_ffn_up;
+                ++qs->i_ffn_up;
             }
         }

@@ -1128,9 +1134,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         );
     }

-    if (qs.n_fallback > 0) {
+    if (qs->n_fallback > 0) {
         LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
-            __func__, qs.n_fallback);
+            __func__, qs->n_fallback);
     }
 }