From 3c1f94a49d331d96ce7f2469fb901eebc10803bf Mon Sep 17 00:00:00 2001
From: ddh0
Date: Mon, 16 Feb 2026 13:13:44 -0600
Subject: [PATCH] correct fallback logic

Count fallbacks inside llama_tensor_get_type(), where the
incompatible-tensor fallback is actually applied, and reset
qs.n_fallback before the main quantization loop, since it may have been
falsely incremented by the preliminary loop over the weights. Fold the
per-tensor stat counting into the do_quantize branch so stats are only
updated for tensors that are actually being quantized, drop the
n_k_quantized counter and simplify the fallback warning accordingly,
and change the default ftype from Q5_1 to Q8_0.
---
 src/llama-quant.cpp | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5bce2bf221..afec667dc1 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -89,7 +89,6 @@ struct quantize_state_impl {
     int i_ffn_gate = 0;
     int i_ffn_up = 0;
 
-    int n_k_quantized = 0;
     int n_fallback = 0;
 
     bool has_imatrix = false;
@@ -493,6 +492,7 @@ static ggml_type llama_tensor_get_type(
                 if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
                     new_type = GGML_TYPE_F16;
                 }
+                ++qs.n_fallback;
             }
         }
     }
@@ -505,20 +505,6 @@ static ggml_type llama_tensor_get_type(
     return new_type;
 }
 
-// update internal quantization state statistics based on the tensor name
-static void llama_tensor_update_stats(quantize_state_impl & qs, const std::string & name) {
-    if (name.find("attn_v.weight") != std::string::npos ||
-        name.find("attn_kv_b.weight") != std::string::npos) {
-        ++qs.i_attention_wv;
-    } else if (name.find("ffn_down") != std::string::npos) {
-        ++qs.i_ffn_down;
-    } else if (name.find("ffn_gate") != std::string::npos) {
-        ++qs.i_ffn_gate;
-    } else if (name.find("ffn_up") != std::string::npos) {
-        ++qs.i_ffn_up;
-    }
-}
-
 static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     if (nthread < 2) {
         // single-thread
@@ -867,7 +853,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             tensor_requires_imatrix(tensor, target_type)
         ) {
             if (params->dry_run) {
-                will_require_imatrix = true; // set flag for warning later, but continue with dry run
+                // set flag for warning later, but continue with dry run
+                will_require_imatrix = true;
             } else {
                 LLAMA_LOG_ERROR("\n============================================================================\n"
                                 " ERROR: this quantization requires an importance matrix!\n"
@@ -880,6 +867,7 @@
         }
     }
 
+    qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // Set split info if needed
@@ -964,15 +952,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         ggml_type new_type = default_type;
 
-        // if so, what will be the target type?
+        // if so, what will be the new type?
         if (do_quantize) {
            new_type = llama_tensor_get_type(qs, params, tensor, default_type);
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             do_quantize = tensor->type != new_type;
-        }
-        llama_tensor_update_stats(qs, name);
+            // count stats for this tensor based on its name
+            if (name.find("attn_v.weight") != std::string::npos ||
+                name.find("attn_kv_b.weight") != std::string::npos) {
+                ++qs.i_attention_wv;
+            } else if (name.find("ffn_down") != std::string::npos) {
+                ++qs.i_ffn_down;
+            } else if (name.find("ffn_gate") != std::string::npos) {
+                ++qs.i_ffn_gate;
+            } else if (name.find("ffn_up") != std::string::npos) {
+                ++qs.i_ffn_up;
+            }
+        }
 
         void * new_data;
         size_t new_size;
@@ -1131,8 +1129,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     if (qs.n_fallback > 0) {
-        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
-                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+        LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
+                __func__, qs.n_fallback);
     }
 }
 
@@ -1143,7 +1141,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 llama_model_quantize_params llama_model_quantize_default_params() {
     llama_model_quantize_params result = {
         /*.nthread =*/ 0,
-        /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q8_0,
         /*.output_tensor_type =*/ GGML_TYPE_COUNT,
         /*.token_embedding_type =*/ GGML_TYPE_COUNT,
         /*.allow_requantize =*/ false,
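
Note (not part of the patch): a minimal standalone sketch of why the
qs.n_fallback reset is needed. All names below (quantize_state,
get_type, and the hard-coded tensor list) are illustrative stand-ins,
not llama.cpp code. The patch makes llama_tensor_get_type() increment
n_fallback as a side effect, and its own comment notes the counter "may
have been falsely incremented by the preliminary loop over weights", so
without the reset each fallback tensor could be counted twice:

    // illustrative sketch only -- simplified stand-ins for llama.cpp types
    #include <cstdio>
    #include <vector>

    struct quantize_state { int n_fallback = 0; };

    // stand-in for llama_tensor_get_type(): choosing a type may record a fallback
    static int get_type(quantize_state & qs, bool incompatible) {
        if (incompatible) {
            ++qs.n_fallback; // side effect: counts toward the fallback total
            return 1;        // e.g. GGML_TYPE_F16
        }
        return 0;            // the requested quantized type
    }

    int main() {
        quantize_state qs;
        std::vector<bool> tensors = { false, true, false, true }; // two need fallback

        // preliminary pass (e.g. the imatrix requirement check) also picks types
        for (bool t : tensors) { (void) get_type(qs, t); }

        qs.n_fallback = 0; // the patch's fix: discard counts from the preliminary pass

        // main quantization pass
        for (bool t : tensors) { (void) get_type(qs, t); }

        // without the reset this would print 4 instead of 2
        std::printf("%d tensor(s) required fallback quantization\n", qs.n_fallback);
        return 0;
    }

An alternative would be to increment the counter only in the main loop,
but since the type decision runs in both passes, keeping the increment
inside llama_tensor_get_type() and resetting once is the smaller change.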