From 3c1f94a49d331d96ce7f2469fb901eebc10803bf Mon Sep 17 00:00:00 2001
From: ddh0
Date: Mon, 16 Feb 2026 13:13:44 -0600
Subject: [PATCH] correct fallback logic

Count fallbacks inside llama_tensor_get_type(), where the
incompatible-tensor fallback is actually applied, and reset
qs.n_fallback before the main quantization loop, since it may have been
falsely incremented by the preliminary loop over the weights. Fold the
per-tensor stat counting into the do_quantize branch so stats are only
updated for tensors that are actually being quantized, drop the
n_k_quantized counter and simplify the fallback warning accordingly,
and change the default ftype from Q5_1 to Q8_0.
---
 src/llama-quant.cpp | 42 ++++++++++++++++++++----------------------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5bce2bf221..afec667dc1 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -89,7 +89,6 @@ struct quantize_state_impl {
     int i_ffn_gate = 0;
     int i_ffn_up = 0;
 
-    int n_k_quantized = 0;
     int n_fallback = 0;
 
     bool has_imatrix = false;
@@ -493,6 +492,7 @@ static ggml_type llama_tensor_get_type(
                 if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
                     new_type = GGML_TYPE_F16;
                 }
+                ++qs.n_fallback;
             }
         }
     }
@@ -505,20 +505,6 @@ static ggml_type llama_tensor_get_type(
     return new_type;
 }
 
-// update internal quantization state statistics based on the tensor name
-static void llama_tensor_update_stats(quantize_state_impl & qs, const std::string & name) {
-    if (name.find("attn_v.weight") != std::string::npos ||
-        name.find("attn_kv_b.weight") != std::string::npos) {
-        ++qs.i_attention_wv;
-    } else if (name.find("ffn_down") != std::string::npos) {
-        ++qs.i_ffn_down;
-    } else if (name.find("ffn_gate") != std::string::npos) {
-        ++qs.i_ffn_gate;
-    } else if (name.find("ffn_up") != std::string::npos) {
-        ++qs.i_ffn_up;
-    }
-}
-
 static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     if (nthread < 2) {
         // single-thread
@@ -867,7 +853,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             tensor_requires_imatrix(tensor, target_type)
         ) {
             if (params->dry_run) {
-                will_require_imatrix = true; // set flag for warning later, but continue with dry run
+                // set flag for warning later, but continue with dry run
+                will_require_imatrix = true;
             } else {
                 LLAMA_LOG_ERROR("\n============================================================================\n"
                                 " ERROR: this quantization requires an importance matrix!\n"
@@ -880,6 +867,7 @@
         }
     }
 
+    qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // Set split info if needed
@@ -964,15 +952,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
         ggml_type new_type = default_type;
 
-        // if so, what will be the target type?
+        // if so, what will be the new type?
         if (do_quantize) {
            new_type = llama_tensor_get_type(qs, params, tensor, default_type);
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             do_quantize = tensor->type != new_type;
-        }
-        llama_tensor_update_stats(qs, name);
+            // count stats for this tensor based on its name
+            if (name.find("attn_v.weight") != std::string::npos ||
+                name.find("attn_kv_b.weight") != std::string::npos) {
+                ++qs.i_attention_wv;
+            } else if (name.find("ffn_down") != std::string::npos) {
+                ++qs.i_ffn_down;
+            } else if (name.find("ffn_gate") != std::string::npos) {
+                ++qs.i_ffn_gate;
+            } else if (name.find("ffn_up") != std::string::npos) {
+                ++qs.i_ffn_up;
+            }
+        }
 
         void * new_data;
         size_t new_size;
@@ -1131,8 +1129,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     if (qs.n_fallback > 0) {
-        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
-                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
+        LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
+                __func__, qs.n_fallback);
     }
 }
 
@@ -1143,7 +1141,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 llama_model_quantize_params llama_model_quantize_default_params() {
     llama_model_quantize_params result = {
         /*.nthread =*/ 0,
-        /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q8_0,
         /*.output_tensor_type =*/ GGML_TYPE_COUNT,
         /*.token_embedding_type =*/ GGML_TYPE_COUNT,
         /*.allow_requantize =*/ false,
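
Note (not part of the patch): a minimal standalone sketch of why the
qs.n_fallback reset is needed. All names below (quantize_state,
get_type, and the hard-coded tensor list) are illustrative stand-ins,
not llama.cpp code. The patch makes llama_tensor_get_type() increment
n_fallback as a side effect, and its own comment notes the counter "may
have been falsely incremented by the preliminary loop over weights", so
without the reset each fallback tensor could be counted twice:

    // illustrative sketch only -- simplified stand-ins for llama.cpp types
    #include <cstdio>
    #include <vector>

    struct quantize_state { int n_fallback = 0; };

    // stand-in for llama_tensor_get_type(): choosing a type may record a fallback
    static int get_type(quantize_state & qs, bool incompatible) {
        if (incompatible) {
            ++qs.n_fallback; // side effect: counts toward the fallback total
            return 1;        // e.g. GGML_TYPE_F16
        }
        return 0;            // the requested quantized type
    }

    int main() {
        quantize_state qs;
        std::vector<bool> tensors = { false, true, false, true }; // two need fallback

        // preliminary pass (e.g. the imatrix requirement check) also picks types
        for (bool t : tensors) { (void) get_type(qs, t); }

        qs.n_fallback = 0; // the patch's fix: discard counts from the preliminary pass

        // main quantization pass
        for (bool t : tensors) { (void) get_type(qs, t); }

        // without the reset this would print 4 instead of 2
        std::printf("%d tensor(s) required fallback quantization\n", qs.n_fallback);
        return 0;
    }

An alternative would be to increment the counter only in the main loop,
but since the type decision runs in both passes, keeping the increment
inside llama_tensor_get_type() and resetting once is the smaller change.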