correct fallback logic

ddh0 2026-02-16 13:13:44 -06:00
parent 521a13e6c6
commit 3c1f94a49d
1 changed file with 20 additions and 22 deletions

@@ -89,7 +89,6 @@ struct quantize_state_impl {
int i_ffn_gate = 0;
int i_ffn_up = 0;
int n_k_quantized = 0;
int n_fallback = 0;
bool has_imatrix = false;
@@ -493,6 +492,7 @@ static ggml_type llama_tensor_get_type(
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
++qs.n_fallback;
}
}
}
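(Note: the fallback path above only triggers when a tensor's row length ne[0] is not evenly divisible by the block size of the chosen quant type; the tensor is then stored as F16 and the counter is bumped. A standalone sketch of that check follows, using invented stand-in names and a hard-coded 256-value block size rather than the real ggml type tables.)

    #include <cstdint>
    #include <cstdio>

    // Stand-ins for the real type/block-size machinery (illustrative only).
    enum class qtype { q4_k, q6_k, f16 };

    static int64_t block_size(qtype t) {
        // k-quants pack 256 values per block; f16 has no blocking constraint
        return t == qtype::f16 ? 1 : 256;
    }

    struct quant_stats { int n_fallback = 0; };

    // Pick F16 when the row length cannot be split into whole blocks.
    static qtype choose_type(quant_stats & qs, qtype wanted, int64_t ne0) {
        if (ne0 % block_size(wanted) != 0) {
            ++qs.n_fallback; // counted so a warning can be printed at the end
            return qtype::f16;
        }
        return wanted;
    }

    int main() {
        quant_stats qs;
        choose_type(qs, qtype::q6_k, 4096); // 4096 % 256 == 0 -> keeps q6_k
        choose_type(qs, qtype::q6_k, 1000); // 1000 % 256 != 0 -> falls back to f16
        std::printf("%d tensor(s) required fallback quantization\n", qs.n_fallback);
        return 0;
    }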
@@ -505,20 +505,6 @@ static ggml_type llama_tensor_get_type(
return new_type;
}
// update internal quantization state statistics based on the tensor name
static void llama_tensor_update_stats(quantize_state_impl & qs, const std::string & name) {
if (name.find("attn_v.weight") != std::string::npos ||
name.find("attn_kv_b.weight") != std::string::npos) {
++qs.i_attention_wv;
} else if (name.find("ffn_down") != std::string::npos) {
++qs.i_ffn_down;
} else if (name.find("ffn_gate") != std::string::npos) {
++qs.i_ffn_gate;
} else if (name.find("ffn_up") != std::string::npos) {
++qs.i_ffn_up;
}
}
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
if (nthread < 2) {
// single-thread
@@ -867,7 +853,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
tensor_requires_imatrix(tensor, target_type)
) {
if (params->dry_run) {
will_require_imatrix = true; // set flag for warning later, but continue with dry run
// set flag for warning later, but continue with dry run
will_require_imatrix = true;
} else {
LLAMA_LOG_ERROR("\n============================================================================\n"
" ERROR: this quantization requires an importance matrix!\n"
@@ -880,6 +867,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
}
qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// Set split info if needed
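(Note: the reset added above matters because, per the in-line comment, the preliminary loop over weights runs the same type-selection path and so may have already incremented n_fallback for tensors that never actually get quantized in that pass. A toy sketch of the double count and the fix, with invented names:)

    #include <cstdio>
    #include <vector>

    // Toy model of the two passes: the pre-pass reuses the same type-selection
    // routine as the real pass, so its ++n_fallback side effects must be discarded.
    struct state { int n_fallback = 0; };

    static void select_type(state & s, int ne0, int blck) {
        if (ne0 % blck != 0) {
            ++s.n_fallback; // side effect: fallback counter
        }
    }

    int main() {
        const std::vector<int> row_lengths = { 4096, 1000, 512 }; // only 1000 needs fallback
        state s;

        // preliminary pass (e.g. checking imatrix requirements) -- increments once
        for (int ne0 : row_lengths) select_type(s, ne0, 256);

        s.n_fallback = 0; // the fix: drop counts accumulated by the pre-pass

        // real quantization pass -- the only counts that should be reported
        for (int ne0 : row_lengths) select_type(s, ne0, 256);

        // prints 1; without the reset it would wrongly print 2
        std::printf("%d tensor(s) required fallback quantization\n", s.n_fallback);
        return 0;
    }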
@@ -964,15 +952,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
ggml_type new_type = default_type;
// if so, what will be the target type?
// if so, what will be the new type?
if (do_quantize) {
new_type = llama_tensor_get_type(qs, params, tensor, default_type);
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
do_quantize = tensor->type != new_type;
}
llama_tensor_update_stats(qs, name);
// count stats for this tensor based on its name
if (name.find("attn_v.weight") != std::string::npos ||
name.find("attn_kv_b.weight") != std::string::npos) {
++qs.i_attention_wv;
} else if (name.find("ffn_down") != std::string::npos) {
++qs.i_ffn_down;
} else if (name.find("ffn_gate") != std::string::npos) {
++qs.i_ffn_gate;
} else if (name.find("ffn_up") != std::string::npos) {
++qs.i_ffn_up;
}
}
void * new_data;
size_t new_size;
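(Note: the counters reinstated inline above are plain substring matches on the tensor name. A self-contained, runnable version of the same dispatch; the tensor names in main() are illustrative examples, not taken from the diff.)

    #include <cstdio>
    #include <string>

    struct stats {
        int i_attention_wv = 0, i_ffn_down = 0, i_ffn_gate = 0, i_ffn_up = 0;
    };

    // Bump the counter matching the tensor name, mirroring the inline code above.
    static void update_stats(stats & qs, const std::string & name) {
        if (name.find("attn_v.weight")    != std::string::npos ||
            name.find("attn_kv_b.weight") != std::string::npos) { ++qs.i_attention_wv; }
        else if (name.find("ffn_down") != std::string::npos)    { ++qs.i_ffn_down; }
        else if (name.find("ffn_gate") != std::string::npos)    { ++qs.i_ffn_gate; }
        else if (name.find("ffn_up")   != std::string::npos)    { ++qs.i_ffn_up; }
    }

    int main() {
        stats qs;
        for (const char * n : { "blk.0.attn_v.weight", "blk.0.ffn_gate.weight",
                                "blk.0.ffn_up.weight", "blk.0.ffn_down.weight" }) {
            update_stats(qs, n);
        }
        std::printf("wv=%d down=%d gate=%d up=%d\n",
                    qs.i_attention_wv, qs.i_ffn_down, qs.i_ffn_gate, qs.i_ffn_up);
        return 0;
    }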
@@ -1131,8 +1129,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
__func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
__func__, qs.n_fallback);
}
}
@@ -1143,7 +1141,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
llama_model_quantize_params llama_model_quantize_default_params() {
llama_model_quantize_params result = {
/*.nthread =*/ 0,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q8_0,
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
/*.allow_requantize =*/ false,