From 844ad3e3268259b85456ebfd4d3417f9b3825c29 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 12:47:13 -0600 Subject: [PATCH 01/35] clean slate for branch --- include/llama.h | 1 + src/llama-quant.cpp | 3 ++- tools/quantize/quantize.cpp | 8 ++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/llama.h b/include/llama.h index 46c3672e98..8bcefda896 100644 --- a/include/llama.h +++ b/include/llama.h @@ -393,6 +393,7 @@ extern "C" { void * kv_overrides; // pointer to vector containing overrides void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune + bool dry_run; // calculate and show the final quantization size without performing quantization } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a7891647c3..730f13e29e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1048,7 +1048,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, - /*.prune_layers =*/ nullptr + /*.prune_layers =*/ nullptr, + /*.dry_run =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index c0f49279ee..3f99d9e6a7 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -120,7 +120,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable); printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n"); - printf(" [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--prune-layers] [--keep-split] [--override-kv] [--dry-run]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize\n"); printf(" allow requantizing tensors that have already been quantized\n"); @@ -156,7 +156,9 @@ static void usage(const char * executable) { printf(" generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" override model metadata by key in the quantized model. 
may be specified multiple times.\n"); - printf(" WARNING: this is an advanced option, use with care.\n\n"); + printf(" WARNING: this is an advanced option, use with care.\n"); + printf(" --dry-run\n"); + printf(" calculate and show the final quantization size without performing quantization\n\n"); printf("note: --include-weights and --exclude-weights cannot be used together\n\n"); printf("-----------------------------------------------------------------------------\n"); printf(" allowed quantization types\n"); @@ -532,6 +534,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--dry-run") == 0) { + params.dry_run = true; } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { params.allow_requantize = true; } else if (strcmp(argv[arg_idx], "--pure") == 0) { From 0d22288f001163d5312a33a99ebf9db26c37e344 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 14:08:01 -0600 Subject: [PATCH 02/35] use 6 characters for tensor dims --- src/llama-impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 8e3e7b223a..60c7fcd050 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -111,7 +111,7 @@ std::string llama_format_tensor_shape(const struct ggml_tensor * t) { char buf[256]; snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]); for (int i = 1; i < GGML_MAX_DIMS; i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]); } return buf; } From 56c27b13ad0ea970111b68c90056ed8c830d2dc2 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 14:08:17 -0600 Subject: [PATCH 03/35] add --dry-run to llama-quantize --- src/llama-quant.cpp | 259 ++++++++++++++++++++---------------- tools/quantize/quantize.cpp | 39 ++++-- 2 files changed, 169 insertions(+), 129 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 730f13e29e..2836caaf3a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -735,24 +735,31 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: }; const auto tn = LLM_TN(model.arch); - new_ofstream(0); + + // no output file for --dry-run + if (!params->dry_run) { + new_ofstream(0); + } + for (const auto * it : tensors) { const auto & weight = *it; ggml_tensor * tensor = weight.tensor; - if (weight.idx != cur_split && params->keep_split) { + if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) { close_ofstream(); new_ofstream(weight.idx); } const std::string name = ggml_get_name(tensor); - if (!ml.use_mmap) { - if (read_data.size() < ggml_nbytes(tensor)) { - read_data.resize(ggml_nbytes(tensor)); + if (!params->dry_run) { + if (!ml.use_mmap) { + if (read_data.size() < ggml_nbytes(tensor)) { + read_data.resize(ggml_nbytes(tensor)); + } + tensor->data = read_data.data(); } - tensor->data = read_data.data(); + ml.load_data_for(tensor); } - ml.load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", ++idx, ml.n_tensors, @@ -900,126 +907,148 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize = tensor->type != new_type; } - if (!quantize) { - new_type = tensor->type; - new_data = tensor->data; - new_size = ggml_nbytes(tensor); - LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0); - } else { - const int64_t nelements = 
ggml_nelements(tensor); - - const float * imatrix = nullptr; - if (imatrix_data) { - auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped)); - if (it == imatrix_data->end()) { - LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); - } else { - if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { - imatrix = it->second.data(); - } else { - LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, - int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name); - - // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix - // this is a significant error and it may be good idea to abort the process if this happens, - // since many people will miss the error and not realize that most of the model is being quantized without an imatrix - // tok_embd should be ignored in this case, since it always causes this warning - if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { - throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s", - int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name)); - } - } - } - } - if ((new_type == GGML_TYPE_IQ2_XXS || - new_type == GGML_TYPE_IQ2_XS || - new_type == GGML_TYPE_IQ2_S || - new_type == GGML_TYPE_IQ1_S || - (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || - (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { - LLAMA_LOG_ERROR("\n\n============================================================\n"); - LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); - LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); - LLAMA_LOG_ERROR("============================================================\n\n"); - throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name)); - } - - float * f32_data; - - if (tensor->type == GGML_TYPE_F32) { - f32_data = (float *) tensor->data; - } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { - throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); + // we have now decided on the target type for this tensor + // the --dry-run option calculates the final quantization size without quantizting + if (params->dry_run) { + if (quantize) { + new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n", + ggml_nbytes(tensor)/1024.0/1024.0, + new_size/1024.0/1024.0, + ggml_type_name(new_type)); } else { - llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread); - f32_data = (float *) f32_conv_buf.data(); + new_size = ggml_nbytes(tensor); + LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0); } + total_size_org += ggml_nbytes(tensor); + total_size_new += new_size; + continue; + } else { + // no --dry-run, perform quantization + if (!quantize) { + new_type = tensor->type; + new_data = tensor->data; + new_size = ggml_nbytes(tensor); + LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0); + } else { + const int64_t nelements = ggml_nelements(tensor); - LLAMA_LOG_INFO("converting to %s .. 
", ggml_type_name(new_type)); - fflush(stdout); + const float * imatrix = nullptr; + if (imatrix_data) { + auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it == imatrix_data->end()) { + LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); + } else { + if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { + imatrix = it->second.data(); + } else { + LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, + int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name); - if (work.size() < (size_t)nelements * 4) { - work.resize(nelements * 4); // upper bound on size - } - new_data = work.data(); - - const int64_t n_per_row = tensor->ne[0]; - const int64_t nrows = tensor->ne[1]; - - static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)); - - const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; - const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; - const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; - - // quantize each expert separately since they have different importance matrices - new_size = 0; - for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { - const float * f32_data_03 = f32_data + i03 * nelements_matrix; - void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; - const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr; - - new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); - - // TODO: temporary sanity check that the F16 -> MXFP4 is lossless -#if 0 - if (new_type == GGML_TYPE_MXFP4) { - auto * x = f32_data_03; - - //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row); - std::vector deq(nrows*n_per_row); - const ggml_type_traits * qtype = ggml_get_type_traits(new_type); - qtype->to_float(new_data_03, deq.data(), deq.size()); - - double err = 0.0f; - for (int i = 0; i < (int) deq.size(); ++i) { - err += fabsf(deq[i] - x[i]); - //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) { - if (deq[i] != x[i]) { - LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]); + // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix + // this is a significant error and it may be good idea to abort the process if this happens, + // since many people will miss the error and not realize that most of the model is being quantized without an imatrix + // tok_embd should be ignored in this case, since it always causes this warning + if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { + throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s", + int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name)); + } } } - //LLAMA_LOG_INFO("err = %f\n", err); - GGML_ASSERT(err == 0.00000); } + if ((new_type == GGML_TYPE_IQ2_XXS || + new_type == GGML_TYPE_IQ2_XS || + new_type == GGML_TYPE_IQ2_S || + new_type == GGML_TYPE_IQ1_S || + (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || + (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { + 
LLAMA_LOG_ERROR("\n\n============================================================\n"); + LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); + LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); + LLAMA_LOG_ERROR("============================================================\n\n"); + throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name)); + } + + float * f32_data; + + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { + throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); + } else { + llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread); + f32_data = (float *) f32_conv_buf.data(); + } + + LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); + fflush(stdout); + + if (work.size() < (size_t)nelements * 4) { + work.resize(nelements * 4); // upper bound on size + } + new_data = work.data(); + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + static const int64_t min_chunk_size = 32 * 512; + const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)); + + const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; + const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; + const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; + + // quantize each expert separately since they have different importance matrices + new_size = 0; + for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { + const float * f32_data_03 = f32_data + i03 * nelements_matrix; + void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; + const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; + + new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); + + // TODO: temporary sanity check that the F16 -> MXFP4 is lossless +#if 0 + if (new_type == GGML_TYPE_MXFP4) { + auto * x = f32_data_03; + + //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row); + std::vector deq(nrows*n_per_row); + const ggml_type_traits * qtype = ggml_get_type_traits(new_type); + qtype->to_float(new_data_03, deq.data(), deq.size()); + + double err = 0.0f; + for (int i = 0; i < (int) deq.size(); ++i) { + err += fabsf(deq[i] - x[i]); + //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) { + if (deq[i] != x[i]) { + LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]); + } + } + //LLAMA_LOG_INFO("err = %f\n", err); + GGML_ASSERT(err == 0.00000); + } #endif + } + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); } - LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); - } - total_size_org += ggml_nbytes(tensor); - total_size_new += new_size; + total_size_org += ggml_nbytes(tensor); + total_size_new += new_size; - // update the gguf meta data as we go - gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type); - GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size); - gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data); + // update the gguf meta data as we go + gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type); + GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size); + gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data); - // write tensor data + padding - fout.write((const char *) new_data, new_size); - zeros(fout, GGML_PAD(new_size, align) - new_size); + // write tensor data + padding + fout.write((const char *) new_data, new_size); + zeros(fout, GGML_PAD(new_size, align) - new_size); + } // no --dry-run + } // iterate over tensors + + if (!params->dry_run) { + close_ofstream(); } - close_ofstream(); LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0); LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0); diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 3f99d9e6a7..91b0367742 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -626,7 +626,7 @@ int main(int argc, char ** argv) { llama_backend_init(); - // parse command line arguments +// parse command line arguments const std::string fname_inp = argv[arg_idx]; arg_idx++; std::string fname_out; @@ -634,22 +634,26 @@ int main(int argc, char ** argv) { std::string ftype_str; std::string suffix = ".gguf"; if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { - std::string fpath; - const size_t pos = fname_inp.find_last_of("/\\"); - if (pos != std::string::npos) { - fpath = fname_inp.substr(0, pos + 1); - } + // argv[arg_idx] is the ftype directly: + if (!params.dry_run) { + std::string fpath; + const size_t pos = fname_inp.find_last_of("/\\"); + if (pos != std::string::npos) { + fpath = fname_inp.substr(0, pos + 1); + } - // export as [inp path]/ggml-model-[ftype]. 
Only add extension if there is no splitting - fname_out = fpath + "ggml-model-" + ftype_str; - if (!params.keep_split) { - fname_out += suffix; + // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting + fname_out = fpath + "ggml-model-" + ftype_str; + if (!params.keep_split) { + fname_out += suffix; + } } arg_idx++; if (ftype_str == "COPY") { params.only_copy = true; } } else { + // argv[arg_idx] is not a valid ftype, so treat it as output path: fname_out = argv[arg_idx]; if (params.keep_split && fname_out.find(suffix) != std::string::npos) { fname_out = fname_out.substr(0, fname_out.length() - suffix.length()); @@ -692,14 +696,21 @@ int main(int argc, char ** argv) { return 1; } - if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) { - fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str()); - return 1; + if (!params.dry_run) { + if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) { + fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str()); + return 1; + } } print_build_info(); - fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); + if (params.dry_run) { + fprintf(stderr, "%s: calculating quantization size for '%s' as %s", __func__, fname_inp.c_str(), ftype_str.c_str()); + } else { + fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); + } + if (params.nthread > 0) { fprintf(stderr, " using %d threads", params.nthread); } From c3f42dedd1f446b2e7733ef12c6d93e61a0e5509 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 14:29:22 -0600 Subject: [PATCH 04/35] use 6 characters for tensor dims (cont.) 
--- src/llama-impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 60c7fcd050..710a5a1e08 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -109,7 +109,7 @@ std::string llama_format_tensor_shape(const std::vector & ne) { std::string llama_format_tensor_shape(const struct ggml_tensor * t) { char buf[256]; - snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]); + snprintf(buf, sizeof(buf), "%6" PRId64, t->ne[0]); for (int i = 1; i < GGML_MAX_DIMS; i++) { snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]); } From b9b32f0d2d7a8f041d97e6d6ce00f636cdd6f42b Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 14:45:44 -0600 Subject: [PATCH 05/35] no need to re-calculate ggml_nbytes for tensor --- src/llama-quant.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2836caaf3a..e65c28723f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -750,11 +750,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } const std::string name = ggml_get_name(tensor); + const size_t tensor_size = ggml_nbytes(tensor); if (!params->dry_run) { if (!ml.use_mmap) { - if (read_data.size() < ggml_nbytes(tensor)) { - read_data.resize(ggml_nbytes(tensor)); + if (read_data.size() < tensor_size) { + read_data.resize(tensor_size); } tensor->data = read_data.data(); } @@ -908,19 +909,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } // we have now decided on the target type for this tensor - // the --dry-run option calculates the final quantization size without quantizting if (params->dry_run) { + // the --dry-run option calculates the final quantization size without quantizting if (quantize) { new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n", - ggml_nbytes(tensor)/1024.0/1024.0, + tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); } else { - new_size = ggml_nbytes(tensor); + new_size = tensor_size; LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0); } - total_size_org += ggml_nbytes(tensor); + total_size_org += tensor_size; total_size_new += new_size; continue; } else { @@ -928,8 +929,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (!quantize) { new_type = tensor->type; new_data = tensor->data; - new_size = ggml_nbytes(tensor); - LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0); + new_size = tensor_size; + LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0); } else { const int64_t nelements = ggml_nelements(tensor); @@ -1030,9 +1031,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } #endif } - LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0); } - total_size_org += ggml_nbytes(tensor); + total_size_org += tensor_size; total_size_new += new_size; // update the gguf meta data as we go From 150e1db21d32db1eb2b19c24cd82cd23aaf52398 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 14:49:56 -0600 Subject: [PATCH 06/35] fix indent --- tools/quantize/quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp 
b/tools/quantize/quantize.cpp index 91b0367742..8497cb8039 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -626,7 +626,7 @@ int main(int argc, char ** argv) { llama_backend_init(); -// parse command line arguments + // parse command line arguments const std::string fname_inp = argv[arg_idx]; arg_idx++; std::string fname_out; From 966b21a981d2279358d6de76a03dc8de6b8617d4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 15:30:12 -0600 Subject: [PATCH 07/35] show model and quant BPW when quant completes --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e65c28723f..d7b90db01f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1051,8 +1051,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: close_ofstream(); } - LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0); - LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0); + LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements); + LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements); if (qs.n_fallback > 0) { LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", From 07f882bbbb8380ad5ef1b5da845322d8dcd11b7d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 15:36:42 -0600 Subject: [PATCH 08/35] add example to --help --- tools/quantize/quantize.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 8497cb8039..7c9a7f29cc 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -158,7 +158,8 @@ static void usage(const char * executable) { printf(" override model metadata by key in the quantized model. 
may be specified multiple times.\n"); printf(" WARNING: this is an advanced option, use with care.\n"); printf(" --dry-run\n"); - printf(" calculate and show the final quantization size without performing quantization\n\n"); + printf(" calculate and show the final quantization size without performing quantization\n"); + printf(" example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n"); printf("note: --include-weights and --exclude-weights cannot be used together\n\n"); printf("-----------------------------------------------------------------------------\n"); printf(" allowed quantization types\n"); From 2769f352077c3692e3f4cf1ad1e1fa5f56a2af7b Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 20:49:05 -0600 Subject: [PATCH 09/35] new function `tensor_requires_imatrix`, add courtesy warning about imatrix --- src/llama-quant.cpp | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d7b90db01f..8a668e6b23 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -479,6 +479,22 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } +static bool tensor_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { + if (!params->imatrix) { + if ( + dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || + dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_S || ( + dst_type == GGML_TYPE_IQ1_M && strcmp(t->name, "token_embd.weight") && + strcmp(t->name, "output.weight") + ) || ( + dst_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && + strcmp(t->name, "token_embd.weight") != 0 + ) + ) return true; + } + return false; +} + static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type default_type; llama_ftype ftype = params->ftype; @@ -741,6 +757,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_ofstream(0); } + // flag for `--dry-run`, to let the user know if imatrix will be required for a real + // quantization, as a courtesy + bool will_require_imatrix = false; + for (const auto * it : tensors) { const auto & weight = *it; ggml_tensor * tensor = weight.tensor; @@ -921,6 +941,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_size = tensor_size; LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0); } + if (!will_require_imatrix && tensor_requires_imatrix(params, tensor, new_type)) { + will_require_imatrix = true; + } total_size_org += tensor_size; total_size_new += new_size; continue; @@ -957,12 +980,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if ((new_type == GGML_TYPE_IQ2_XXS || - new_type == GGML_TYPE_IQ2_XS || - new_type == GGML_TYPE_IQ2_S || - new_type == GGML_TYPE_IQ1_S || - (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || - (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { + if (tensor_requires_imatrix(params, tensor, new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be 
garbage, so bailing out\n"); @@ -1053,6 +1071,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements); LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements); + if (!params->imatrix && params->dry_run && will_require_imatrix) { + LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n"); + } if (qs.n_fallback > 0) { LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", From ea8da0503c48077b0468c15345aaf49ebf8e1a37 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 20:57:16 -0600 Subject: [PATCH 10/35] missing __func__, move imatrix flag set --- src/llama-quant.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8a668e6b23..76581f8b4b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -937,13 +937,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); + if (!will_require_imatrix && tensor_requires_imatrix(params, tensor, new_type)) { + will_require_imatrix = true; + } } else { new_size = tensor_size; LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0); } - if (!will_require_imatrix && tensor_requires_imatrix(params, tensor, new_type)) { - will_require_imatrix = true; - } total_size_org += tensor_size; total_size_new += new_size; continue; @@ -1072,7 +1072,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements); LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements); if (!params->imatrix && params->dry_run && will_require_imatrix) { - LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n"); + LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n", + __func__ + ); } if (qs.n_fallback > 0) { From 3211a847ef3c153fe499aeb259e2a6f996c6e75d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 20:58:52 -0600 Subject: [PATCH 11/35] logic error --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 76581f8b4b..c411d41153 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -980,7 +980,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (tensor_requires_imatrix(params, tensor, new_type)) { + if (tensor_requires_imatrix(params, tensor, new_type) && !imatrix) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); From 55dbee2bbe1059dac78eb139869c0aa189558df2 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:03:34 -0600 Subject: [PATCH 12/35] fixup tensor_requires_imatrix --- src/llama-quant.cpp | 23 
+++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c411d41153..252fbe2085 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -480,19 +480,18 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * } static bool tensor_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { - if (!params->imatrix) { - if ( - dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || - dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_S || ( - dst_type == GGML_TYPE_IQ1_M && strcmp(t->name, "token_embd.weight") && - strcmp(t->name, "output.weight") - ) || ( - dst_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && - strcmp(t->name, "token_embd.weight") != 0 - ) - ) return true; + if (dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || + dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_S || ( + dst_type == GGML_TYPE_IQ1_M && strcmp(t->name, "token_embd.weight") && + strcmp(t->name, "output.weight") + ) || ( + dst_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && + strcmp(t->name, "token_embd.weight") != 0 + )) { + return true; + } else { + return false; } - return false; } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { From 22db76409b7495835c2fac8f491423887445ad1a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:14:19 -0600 Subject: [PATCH 13/35] add missing `GGML_TYPE`s --- src/llama-quant.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 252fbe2085..3cad6bc6e7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -480,14 +480,19 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * } static bool tensor_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { - if (dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || - dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_S || ( + if ( + dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || + dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || + dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || + dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || + ( dst_type == GGML_TYPE_IQ1_M && strcmp(t->name, "token_embd.weight") && strcmp(t->name, "output.weight") ) || ( dst_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0 - )) { + ) + ) { return true; } else { return false; @@ -979,7 +984,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (tensor_requires_imatrix(params, tensor, new_type) && !imatrix) { + if (!imatrix && tensor_requires_imatrix(params, tensor, new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); From ae786b862da889a9345a7360e1c7b57c6056510f Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:21:40 -0600 Subject: [PATCH 14/35] simplify and rename `tensor_type_requires_imatrix` --- src/llama-quant.cpp | 19 +++++-------------- 1 file 
changed, 5 insertions(+), 14 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3cad6bc6e7..5b3fec3dc5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -479,20 +479,11 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -static bool tensor_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { - if ( - dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || +static bool tensor_type_requires_imatrix(const ggml_type dst_type) { + if (dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || - ( - dst_type == GGML_TYPE_IQ1_M && strcmp(t->name, "token_embd.weight") && - strcmp(t->name, "output.weight") - ) || ( - dst_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && - strcmp(t->name, "token_embd.weight") != 0 - ) - ) { + dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0) { return true; } else { return false; @@ -941,7 +932,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); - if (!will_require_imatrix && tensor_requires_imatrix(params, tensor, new_type)) { + if (!will_require_imatrix && tensor_type_requires_imatrix(new_type)) { will_require_imatrix = true; } } else { @@ -984,7 +975,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (!imatrix && tensor_requires_imatrix(params, tensor, new_type)) { + if (!imatrix && tensor_type_requires_imatrix(new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); From 1ccd7a49baeb5f4643bccc75008de47ba85d843c Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:41:37 -0600 Subject: [PATCH 15/35] simplify for style --- src/llama-quant.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5b3fec3dc5..31694e2834 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -480,14 +480,12 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * } static bool tensor_type_requires_imatrix(const ggml_type dst_type) { - if (dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || + return ( + dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0) { - return true; - } else { - return false; - } + dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 + ); } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { @@ -1066,6 +1064,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements); LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, 
total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements); + if (!params->imatrix && params->dry_run && will_require_imatrix) { LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n", __func__ From 1658228d6acc770c884965ff0582a7633b75f96a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:53:07 -0600 Subject: [PATCH 16/35] add back Q2_K edge case for imatrix --- src/llama-quant.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 31694e2834..543b658e56 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -479,12 +479,15 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -static bool tensor_type_requires_imatrix(const ggml_type dst_type) { +static bool tensor_type_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { return ( dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 + dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || + ( // Q2_K is the worst k-quant type - only allow it without imatrix for token embeddings + dst_type == GGML_TYPE_Q2_K && strcmp(t->name, "token_embd.weight") != 0 + ) ); } @@ -930,7 +933,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); - if (!will_require_imatrix && tensor_type_requires_imatrix(new_type)) { + if (!will_require_imatrix && tensor_type_requires_imatrix(params, tensor, new_type)) { will_require_imatrix = true; } } else { @@ -973,7 +976,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (!imatrix && tensor_type_requires_imatrix(new_type)) { + if (!imatrix && tensor_type_requires_imatrix(params, tensor, new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); From b15bb3404cf49d4be1a4d1e5cafbdb544d086d0d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:57:55 -0600 Subject: [PATCH 17/35] guard ftype imatrix warning --- tools/quantize/quantize.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 7c9a7f29cc..59bf9bd3fd 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -686,11 +686,12 @@ int main(int argc, char ** argv) { } } - if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || - params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) { + if (!params.dry_run && + ( + params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M + ) && imatrix_data.empty()) { fprintf(stderr, 
"\n==========================================================================================================\n"); fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); fprintf(stderr, "==========================================================================================================\n\n\n"); From 40528248fcbc212bcde26f8d25b4b411a023d5f3 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 22:18:56 -0600 Subject: [PATCH 18/35] comment ref #12557 --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 543b658e56..49a9696503 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -484,7 +484,7 @@ static bool tensor_type_requires_imatrix(const llama_model_quantize_params * par dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || + // dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || // uncomment if #12557 is merged ( // Q2_K is the worst k-quant type - only allow it without imatrix for token embeddings dst_type == GGML_TYPE_Q2_K && strcmp(t->name, "token_embd.weight") != 0 ) From 44f9fee2488858307798bae9b576541e9e887599 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 22:23:10 -0600 Subject: [PATCH 19/35] remove per @compilade --- src/llama-quant.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 49a9696503..652d93dbc9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -484,7 +484,6 @@ static bool tensor_type_requires_imatrix(const llama_model_quantize_params * par dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - // dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || // uncomment if #12557 is merged ( // Q2_K is the worst k-quant type - only allow it without imatrix for token embeddings dst_type == GGML_TYPE_Q2_K && strcmp(t->name, "token_embd.weight") != 0 ) From f58de63ec30f96b1e88eecd5ca659d9248b9eda8 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 22:30:06 -0600 Subject: [PATCH 20/35] remove unused `params` parameter --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 652d93dbc9..9781202f90 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -479,7 +479,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -static bool tensor_type_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { +static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type) { return ( dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || @@ -932,7 +932,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); - if (!will_require_imatrix && tensor_type_requires_imatrix(params, tensor, new_type)) { + if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type)) { 
will_require_imatrix = true; } } else { @@ -975,7 +975,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (!imatrix && tensor_type_requires_imatrix(params, tensor, new_type)) { + if (!imatrix && tensor_type_requires_imatrix(tensor, new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); From 5d6c92440cc773e8362f23f8afb1d6561a26a243 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 17:52:59 -0600 Subject: [PATCH 21/35] initial commit for branch --- src/llama-quant.cpp | 400 ++++++++++++++++++++---------------- tools/quantize/quantize.cpp | 12 -- 2 files changed, 226 insertions(+), 186 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9781202f90..b805641416 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -479,7 +479,8 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type) { +// based on this tensor and the destination tensor type, do we require an importance matrix? +static bool tensor_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type) { return ( dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || @@ -490,6 +491,151 @@ static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type ); } +// do we allow this tensor to be quantized? +static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) { + const std::string name = tensor->name; + + // This used to be a regex, but has an extreme cost to compile times. + bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + + // quantize only 2D and 3D tensors (experts) + quantize &= (ggml_n_dims(tensor) >= 2); + + // do not quantize norm tensors + quantize &= name.find("_norm.weight") == std::string::npos; + + quantize &= params->quantize_output_tensor || name != "output.weight"; + quantize &= !params->only_copy; + + // do not quantize expert gating tensors + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + + // these are very small (e.g. 
4x4) + quantize &= name.find("altup") == std::string::npos; + quantize &= name.find("laurel") == std::string::npos; + + // these are not too big so keep them as it is + quantize &= name.find("per_layer_model_proj") == std::string::npos; + + // do not quantize positional embeddings and token types (BERT) + quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight"); + quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); + + // do not quantize Mamba /Kimi's small conv1d weights + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ssm_conv1d") == std::string::npos; + quantize &= name.find("shortconv.conv.weight") == std::string::npos; + + // do not quantize RWKV's small yet 2D weights + quantize &= name.find("time_mix_first.weight") == std::string::npos; + quantize &= name.find("time_mix_w0.weight") == std::string::npos; + quantize &= name.find("time_mix_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_v0.weight") == std::string::npos; + quantize &= name.find("time_mix_v1.weight") == std::string::npos; + quantize &= name.find("time_mix_v2.weight") == std::string::npos; + quantize &= name.find("time_mix_a0.weight") == std::string::npos; + quantize &= name.find("time_mix_a1.weight") == std::string::npos; + quantize &= name.find("time_mix_a2.weight") == std::string::npos; + quantize &= name.find("time_mix_g1.weight") == std::string::npos; + quantize &= name.find("time_mix_g2.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + + // do not quantize relative position bias (T5) + quantize &= name.find("attn_rel_b.weight") == std::string::npos; + + // do not quantize specific multimodal tensors + quantize &= name.find(".position_embd.") == std::string::npos; + + return quantize; +} + +static ggml_type get_tensor_target_type( + quantize_state_impl & qs, + const llama_model_quantize_params * params, + const ggml_tensor * tensor, + ggml_type default_type +) { + ggml_type new_type; + // get more optimal quantization type based on the tensor shape, layer, etc. 
+ if (!params->pure && ggml_is_quantized(default_type)) { + + // if the user provided tensor types - use those + bool manual = false; + if (params->tensor_types) { + const std::vector & tensor_types = *static_cast *>(params->tensor_types); + const std::string tensor_name(tensor->name); + for (const auto & [tname, qtype] : tensor_types) { + if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { + if (qtype != new_type) { + LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); + new_type = qtype; // if two or more types are specified for the same tensor, the last match wins + manual = true; + break; + } + } + } + } + + // if not manual - use the standard logic for choosing the quantization type based on the selected mixture + if (!manual) { + new_type = llama_tensor_get_type(qs, new_type, tensor, params->ftype); + } + + // incompatible tensor shapes are handled here - fallback to a compatible type + { + bool convert_incompatible_tensor = false; + + const int64_t nx = tensor->ne[0]; + const int64_t ny = tensor->ne[1]; + const int64_t qk_k = ggml_blck_size(new_type); + + if (nx % qk_k != 0) { + LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); + convert_incompatible_tensor = true; + } else { + ++qs.n_k_quantized; + } + + if (convert_incompatible_tensor) { + switch (new_type) { + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; + case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; + case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; + case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; + default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); + } + if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { + new_type = GGML_TYPE_F16; + } + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); + ++qs.n_fallback; + } + } + } + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } + return new_type; +} + static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type default_type; llama_ftype ftype = params->ftype; @@ -628,8 +774,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: int blk_id = 0; // make a list of weights - std::vector tensors; - tensors.reserve(ml.weights_map.size()); + std::vector weights; + weights.reserve(ml.weights_map.size()); for (const auto & it : ml.weights_map) { const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id)); if (remapped_name.empty()) { @@ -641,8 +787,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ggml_set_name(it.second.tensor, remapped_name.c_str()); LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor)); } 
- tensors.push_back(&it.second); + weights.push_back(&it.second); } + + // make a list of tensors (same pointers as from weights) + std::vector tensors; + tensors.reserve(weights.size()); + for (size_t i = 0; i < weights.size(); ++i) { + tensors.push_back(weights[i]->tensor); + } + if (!prune_list.empty()) { gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id); } @@ -657,26 +811,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: }); } - for (const auto * it : tensors) { - const struct ggml_tensor * tensor = it->tensor; - - const std::string name = ggml_get_name(tensor); - - // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos || - name.find("attn_qkv.weight") != std::string::npos || - name.find("attn_kv_b.weight")!= std::string::npos) { - ++qs.n_attention_wv; - } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { - qs.has_output = true; - } - } - - qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; - - size_t total_size_org = 0; - size_t total_size_new = 0; - std::vector workers; workers.reserve(nthread); @@ -690,23 +824,61 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // Assume split index is continuous if (params->keep_split) { - for (const auto * it : tensors) { + for (const auto * it : weights) { n_split = std::max(uint16_t(it->idx + 1), n_split); } } std::vector ctx_outs(n_split); ctx_outs[0] = std::move(ctx_out); - // populate the original tensors so we get an initial meta data - for (const auto * it : tensors) { + // flag for `--dry-run`, to let the user know if imatrix will be required for a real + // quantization, as a courtesy + bool will_require_imatrix = false; + + // this is the preliminary iteration over all weights (not the main loop) + for (const auto * it : weights) { + const ggml_tensor * tensor = it->tensor; + const std::string name = tensor->name; + + // TODO: avoid hardcoded tensor names - use the TN_* constants + if (name.find("attn_v.weight") != std::string::npos || + name.find("attn_qkv.weight") != std::string::npos || + name.find("attn_kv_b.weight")!= std::string::npos) { + ++qs.n_attention_wv; + } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { + qs.has_output = true; + } + + // populate the original tensors so we get an initial meta data uint16_t i_split = params->keep_split ? 
it->idx : 0; - ggml_tensor * tensor = it->tensor; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty()); } gguf_add_tensor(ctx_outs[i_split].get(), tensor); + + // TODO: we could save this per-tensor and correlate it with the vector of tensors so we + // don't have to call this function again later (currently twice per tensor) + ggml_type target_type = get_tensor_target_type(qs, params, tensor, default_type); + + if (!params->imatrix && + tensor_allows_quantization(params, model.arch, tensor) && + tensor_requires_imatrix(tensor, target_type) + ) { + if (params->dry_run) { + will_require_imatrix = true; // set flag for warning later, but continue with dry run + } else { + LLAMA_LOG_ERROR("\n\n============================================================================\n" + " ERROR: this quantization requires an importance matrix!\n" + " offending tensor: %s (target type: %s)\n" + "============================================================================\n\n", + name, ggml_type_name(target_type)); + throw new std::runtime_error("this quantization requires an imatrix!"); + } + } } + qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; + // Set split info if needed if (n_split > 1) { for (size_t i = 0; i < ctx_outs.size(); ++i) { @@ -752,13 +924,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_ofstream(0); } - // flag for `--dry-run`, to let the user know if imatrix will be required for a real - // quantization, as a courtesy - bool will_require_imatrix = false; + size_t total_size_org = 0; + size_t total_size_new = 0; - for (const auto * it : tensors) { + // iterate over all weights (main loop) + for (const auto * it : weights) { const auto & weight = *it; ggml_tensor * tensor = weight.tensor; + if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) { close_ofstream(); new_ofstream(weight.idx); @@ -778,161 +951,40 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", - ++idx, ml.n_tensors, - ggml_get_name(tensor), - llama_format_tensor_shape(tensor).c_str(), - ggml_type_name(tensor->type)); + ++idx, ml.n_tensors, + ggml_get_name(tensor), + llama_format_tensor_shape(tensor).c_str(), + ggml_type_name(tensor->type)); - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + // will we quantize this tensor? + bool do_quantize = tensor_allows_quantization(params, model.arch, tensor); - // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); + ggml_type new_type = default_type; - // do not quantize norm tensors - quantize &= name.find("_norm.weight") == std::string::npos; + // if so, what will be the target type? + if (do_quantize) { + new_type = get_tensor_target_type(qs, params, tensor, default_type); + // If we've decided to quantize to the same type the tensor is already + // in then there's nothing to do. + do_quantize = tensor->type != new_type; + } - quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= !params->only_copy; - - // do not quantize expert gating tensors - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; - - // these are very small (e.g. 
4x4) - quantize &= name.find("altup") == std::string::npos; - quantize &= name.find("laurel") == std::string::npos; - - // these are not too big so keep them as it is - quantize &= name.find("per_layer_model_proj") == std::string::npos; - - // do not quantize positional embeddings and token types (BERT) - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - - // do not quantize Mamba /Kimi's small conv1d weights - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d") == std::string::npos; - quantize &= name.find("shortconv.conv.weight") == std::string::npos; - - // do not quantize RWKV's small yet 2D weights - quantize &= name.find("time_mix_first.weight") == std::string::npos; - quantize &= name.find("time_mix_w0.weight") == std::string::npos; - quantize &= name.find("time_mix_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_v0.weight") == std::string::npos; - quantize &= name.find("time_mix_v1.weight") == std::string::npos; - quantize &= name.find("time_mix_v2.weight") == std::string::npos; - quantize &= name.find("time_mix_a0.weight") == std::string::npos; - quantize &= name.find("time_mix_a1.weight") == std::string::npos; - quantize &= name.find("time_mix_a2.weight") == std::string::npos; - quantize &= name.find("time_mix_g1.weight") == std::string::npos; - quantize &= name.find("time_mix_g2.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - - // do not quantize relative position bias (T5) - quantize &= name.find("attn_rel_b.weight") == std::string::npos; - - // do not quantize specific multimodal tensors - quantize &= name.find(".position_embd.") == std::string::npos; - - ggml_type new_type; void * new_data; size_t new_size; - if (quantize) { - new_type = default_type; + // + // perform quantization (or dry run) + // - // get more optimal quantization type based on the tensor shape, layer, etc. 
- if (!params->pure && ggml_is_quantized(default_type)) { - // if the user provided tensor types - use those - bool manual = false; - if (params->tensor_types) { - const std::vector & tensor_types = *static_cast *>(params->tensor_types); - const std::string tensor_name(tensor->name); - for (const auto & [tname, qtype] : tensor_types) { - if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { - if (qtype != new_type) { - LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); - new_type = qtype; // if two or more types are specified for the same tensor, the last match wins - manual = true; - break; - } - } - } - } - - // if not manual - use the standard logic for choosing the quantization type based on the selected mixture - if (!manual) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - } - - // incompatible tensor shapes are handled here - fallback to a compatible type - { - bool convert_incompatible_tensor = false; - - const int64_t nx = tensor->ne[0]; - const int64_t ny = tensor->ne[1]; - const int64_t qk_k = ggml_blck_size(new_type); - - if (nx % qk_k != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); - convert_incompatible_tensor = true; - } else { - ++qs.n_k_quantized; - } - - if (convert_incompatible_tensor) { - switch (new_type) { - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; - case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; - case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; - case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; - default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); - } - if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { - new_type = GGML_TYPE_F16; - } - LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); - ++qs.n_fallback; - } - } - } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } - if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; - } - - // If we've decided to quantize to the same type the tensor is already - // in then there's nothing to do. 
- quantize = tensor->type != new_type; - } - - // we have now decided on the target type for this tensor if (params->dry_run) { // the --dry-run option calculates the final quantization size without quantizting - if (quantize) { + if (do_quantize) { new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); - if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type)) { + if (!will_require_imatrix && tensor_requires_imatrix(tensor, new_type)) { will_require_imatrix = true; } } else { @@ -944,7 +996,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: continue; } else { // no --dry-run, perform quantization - if (!quantize) { + if (!do_quantize) { new_type = tensor->type; new_data = tensor->data; new_size = tensor_size; @@ -975,7 +1027,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (!imatrix && tensor_type_requires_imatrix(tensor, new_type)) { + if (!imatrix && tensor_requires_imatrix(tensor, new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 59bf9bd3fd..e9448028da 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -686,18 +686,6 @@ int main(int argc, char ** argv) { } } - if (!params.dry_run && - ( - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M - ) && imatrix_data.empty()) { - fprintf(stderr, "\n==========================================================================================================\n"); - fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); - fprintf(stderr, "==========================================================================================================\n\n\n"); - return 1; - } - if (!params.dry_run) { if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) { fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str()); From 67e25bbae1fd675ebeacd1f2720b4bfacbe9cc23 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:02:40 -0600 Subject: [PATCH 22/35] fix compile errors --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b805641416..467d847196 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -559,7 +559,7 @@ static ggml_type get_tensor_target_type( const ggml_tensor * tensor, ggml_type default_type ) { - ggml_type new_type; + ggml_type new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. 
if (!params->pure && ggml_is_quantized(default_type)) { @@ -803,7 +803,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // keep_split requires that the weights are sorted by split index if (params->keep_split) { - std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { + std::sort(weights.begin(), weights.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { if (a->idx == b->idx) { return a->offs < b->offs; } @@ -871,7 +871,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: " ERROR: this quantization requires an importance matrix!\n" " offending tensor: %s (target type: %s)\n" "============================================================================\n\n", - name, ggml_type_name(target_type)); + name.c_str(), ggml_type_name(target_type)); throw new std::runtime_error("this quantization requires an imatrix!"); } } From 1f25c130de615417bc2312ef6266acc32acf70dd Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:11:44 -0600 Subject: [PATCH 23/35] pretty error msg --- src/llama-quant.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 467d847196..dd81b310d8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -867,10 +867,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->dry_run) { will_require_imatrix = true; // set flag for warning later, but continue with dry run } else { - LLAMA_LOG_ERROR("\n\n============================================================================\n" - " ERROR: this quantization requires an importance matrix!\n" - " offending tensor: %s (target type: %s)\n" - "============================================================================\n\n", + LLAMA_LOG_ERROR("\n============================================================================\n" + " ERROR: this quantization requires an importance matrix!\n" + " - offending tensor: %s\n" + " - target type: %s\n" + "============================================================================\n\n", name.c_str(), ggml_type_name(target_type)); throw new std::runtime_error("this quantization requires an imatrix!"); } From 6734e77662dac4ab24f4b5cb5cc39609ea3d2b13 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:22:14 -0600 Subject: [PATCH 24/35] don't throw by pointer; unify MiB formatting --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index dd81b310d8..c9b7e5c7ce 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -873,7 +873,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: " - target type: %s\n" "============================================================================\n\n", name.c_str(), ggml_type_name(target_type)); - throw new std::runtime_error("this quantization requires an imatrix!"); + throw std::runtime_error("this quantization requires an imatrix!"); } } } @@ -981,7 +981,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // the --dry-run option calculates the final quantization size without quantizting if (do_quantize) { new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); - LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n", + LLAMA_LOG_INFO("size 
= %8.3f MiB -> %8.3f MiB (%s)\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); @@ -1097,7 +1097,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } #endif } - LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.3f MiB -> %8.3f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0); } total_size_org += tensor_size; total_size_new += new_size; From d648629f560d7759524e6a76368b825ebf9aa652 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:24:16 -0600 Subject: [PATCH 25/35] remove unused `std::vector tensors;` --- src/llama-quant.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c9b7e5c7ce..763c6e1baf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -790,13 +790,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: weights.push_back(&it.second); } - // make a list of tensors (same pointers as from weights) - std::vector tensors; - tensors.reserve(weights.size()); - for (size_t i = 0; i < weights.size(); ++i) { - tensors.push_back(weights[i]->tensor); - } - if (!prune_list.empty()) { gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id); } @@ -885,7 +878,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: for (size_t i = 0; i < ctx_outs.size(); ++i) { gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i); gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split); - gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size()); + gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)weights.size()); } } From fd3787ee05a78d67407d30f2fcf979b147b5bb4c Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:24:47 -0600 Subject: [PATCH 26/35] typo --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 763c6e1baf..dd6c978b94 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -971,7 +971,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // if (params->dry_run) { - // the --dry-run option calculates the final quantization size without quantizting + // the --dry-run option calculates the final quantization size without quantizing if (do_quantize) { new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); LLAMA_LOG_INFO("size = %8.3f MiB -> %8.3f MiB (%s)\n", From 053a28980b5185fdc57a66a2c9714114c39e95f1 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:31:59 -0600 Subject: [PATCH 27/35] don't double-count `qs` --- src/llama-quant.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index dd6c978b94..e35b4573f3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -553,11 +553,12 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param return quantize; } -static ggml_type get_tensor_target_type( +static ggml_type tensor_get_target_type( quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, - ggml_type default_type + ggml_type default_type, + bool update_stats // should we update qs or no? 
) { ggml_type new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. @@ -597,7 +598,9 @@ static ggml_type get_tensor_target_type( LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); convert_incompatible_tensor = true; } else { - ++qs.n_k_quantized; + if (update_stats) { + ++qs.n_k_quantized; + } } if (convert_incompatible_tensor) { @@ -623,7 +626,9 @@ static ggml_type get_tensor_target_type( new_type = GGML_TYPE_F16; } LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); - ++qs.n_fallback; + if (update_stats) { + ++qs.n_fallback; + } } } } @@ -851,7 +856,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // TODO: we could save this per-tensor and correlate it with the vector of tensors so we // don't have to call this function again later (currently twice per tensor) - ggml_type target_type = get_tensor_target_type(qs, params, tensor, default_type); + ggml_type target_type = tensor_get_target_type(qs, params, tensor, default_type, false); if (!params->imatrix && tensor_allows_quantization(params, model.arch, tensor) && @@ -957,7 +962,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // if so, what will be the target type? if (do_quantize) { - new_type = get_tensor_target_type(qs, params, tensor, default_type); + new_type = tensor_get_target_type(qs, params, tensor, default_type, true); // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. do_quantize = tensor->type != new_type; From 97aefac773779c8c803cea30321c95827f7b0cc4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 20:00:23 -0600 Subject: [PATCH 28/35] update_stats guard --- src/llama-quant.cpp | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e35b4573f3..461d7689a8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -175,7 +175,7 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { +static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, bool update_stats) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -257,7 +257,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; - ++qs.i_attention_wv; + if (update_stats) { + ++qs.i_attention_wv; + } } else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; @@ -266,7 +268,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (qs.i_ffn_down < qs.n_ffn_down/8) { new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } - ++qs.i_ffn_down; + if (update_stats) { + ++qs.i_ffn_down; + } } else if (name.find("attn_output.weight") != std::string::npos) { if (qs.model.hparams.n_expert == 8) { @@ -313,7 +317,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } - ++qs.i_attention_wv; + if (update_stats) { + ++qs.i_attention_wv; + } } else if (name.find("attn_k.weight") != std::string::npos) { if (qs.model.hparams.n_expert == 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB @@ -377,7 +383,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; } - ++qs.i_ffn_down; + if (update_stats) { + ++qs.i_ffn_down; + } } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { @@ -411,7 +419,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } - ++qs.i_ffn_gate; + if (update_stats) { + ++qs.i_ffn_gate; + } } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); @@ -419,7 +429,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } - ++qs.i_ffn_up; + if (update_stats) { + ++qs.i_ffn_up; + } } return new_type; @@ -583,7 +595,7 @@ static ggml_type tensor_get_target_type( // if not manual - use the standard logic for choosing the quantization type based on the selected mixture if (!manual) { - new_type = llama_tensor_get_type(qs, new_type, tensor, params->ftype); + new_type = llama_tensor_get_type(qs, new_type, tensor, params->ftype, update_stats); } // incompatible tensor shapes are handled here - fallback to a compatible type @@ -625,8 +637,8 @@ static ggml_type tensor_get_target_type( if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { new_type = GGML_TYPE_F16; } - LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); if (update_stats) { + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); ++qs.n_fallback; } } From bddc67547fdf2fe13fcb6c2db855e5466d9f3c64 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 13 Feb 2026 21:13:53 -0600 Subject: [PATCH 29/35] correct function names --- src/llama-quant.cpp | 245 ++++++++++++++++++++++---------------------- 1 file changed, 122 insertions(+), 123 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 461d7689a8..47ece2d666 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -175,7 +175,7 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, bool update_stats) { +static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, bool update_stats) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -437,6 +437,95 @@ static 
ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t return new_type; } +// determine the ggml_type that this tensor should be quantized to +static ggml_type llama_tensor_get_type( + quantize_state_impl & qs, + const llama_model_quantize_params * params, + const ggml_tensor * tensor, + ggml_type default_type, + bool update_stats // we only update qs if this flag is true +) { + ggml_type new_type = default_type; + // get more optimal quantization type based on the tensor shape, layer, etc. + if (!params->pure && ggml_is_quantized(default_type)) { + + // if the user provided tensor types - use those + bool manual = false; + if (params->tensor_types) { + const std::vector & tensor_types = *static_cast *>(params->tensor_types); + const std::string tensor_name(tensor->name); + for (const auto & [tname, qtype] : tensor_types) { + if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { + if (qtype != new_type) { + LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); + new_type = qtype; // if two or more types are specified for the same tensor, the last match wins + manual = true; + break; + } + } + } + } + + // if not manual - use the standard logic for choosing the quantization type based on the selected mixture + if (!manual) { + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, update_stats); + } + + // incompatible tensor shapes are handled here - fallback to a compatible type + { + bool convert_incompatible_tensor = false; + + const int64_t nx = tensor->ne[0]; + const int64_t ny = tensor->ne[1]; + const int64_t qk_k = ggml_blck_size(new_type); + + if (nx % qk_k != 0) { + LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); + convert_incompatible_tensor = true; + } else { + if (update_stats) { + ++qs.n_k_quantized; + } + } + + if (convert_incompatible_tensor) { + switch (new_type) { + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; + case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; + case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; + case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; + default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); + } + if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { + new_type = GGML_TYPE_F16; + } + if (update_stats) { + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); + ++qs.n_fallback; + } + } + } + } + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } + return new_type; +} + static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread @@ -508,149 +597,61 @@ static bool 
tensor_allows_quantization(const llama_model_quantize_params * param const std::string name = tensor->name; // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + bool allowed = name.rfind("weight") == name.size() - 6; // ends with 'weight'? // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); + allowed &= (ggml_n_dims(tensor) >= 2); // do not quantize norm tensors - quantize &= name.find("_norm.weight") == std::string::npos; + allowed &= name.find("_norm.weight") == std::string::npos; - quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= !params->only_copy; + allowed &= params->quantize_output_tensor || name != "output.weight"; + allowed &= !params->only_copy; // do not quantize expert gating tensors // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + allowed &= name.find("ffn_gate_inp.weight") == std::string::npos; // these are very small (e.g. 4x4) - quantize &= name.find("altup") == std::string::npos; - quantize &= name.find("laurel") == std::string::npos; + allowed &= name.find("altup") == std::string::npos; + allowed &= name.find("laurel") == std::string::npos; // these are not too big so keep them as it is - quantize &= name.find("per_layer_model_proj") == std::string::npos; + allowed &= name.find("per_layer_model_proj") == std::string::npos; // do not quantize positional embeddings and token types (BERT) - quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight"); - quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); + allowed &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight"); + allowed &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); // do not quantize Mamba /Kimi's small conv1d weights // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d") == std::string::npos; - quantize &= name.find("shortconv.conv.weight") == std::string::npos; + allowed &= name.find("ssm_conv1d") == std::string::npos; + allowed &= name.find("shortconv.conv.weight") == std::string::npos; // do not quantize RWKV's small yet 2D weights - quantize &= name.find("time_mix_first.weight") == std::string::npos; - quantize &= name.find("time_mix_w0.weight") == std::string::npos; - quantize &= name.find("time_mix_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_v0.weight") == std::string::npos; - quantize &= name.find("time_mix_v1.weight") == std::string::npos; - quantize &= name.find("time_mix_v2.weight") == std::string::npos; - quantize &= name.find("time_mix_a0.weight") == std::string::npos; - quantize &= name.find("time_mix_a1.weight") == std::string::npos; - quantize &= name.find("time_mix_a2.weight") == std::string::npos; - quantize &= name.find("time_mix_g1.weight") == std::string::npos; - quantize &= name.find("time_mix_g2.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + allowed &= name.find("time_mix_first.weight") == std::string::npos; + allowed &= name.find("time_mix_w0.weight") == std::string::npos; + allowed &= name.find("time_mix_w1.weight") == std::string::npos; + allowed &= 
name.find("time_mix_w2.weight") == std::string::npos; + allowed &= name.find("time_mix_v0.weight") == std::string::npos; + allowed &= name.find("time_mix_v1.weight") == std::string::npos; + allowed &= name.find("time_mix_v2.weight") == std::string::npos; + allowed &= name.find("time_mix_a0.weight") == std::string::npos; + allowed &= name.find("time_mix_a1.weight") == std::string::npos; + allowed &= name.find("time_mix_a2.weight") == std::string::npos; + allowed &= name.find("time_mix_g1.weight") == std::string::npos; + allowed &= name.find("time_mix_g2.weight") == std::string::npos; + allowed &= name.find("time_mix_decay_w1.weight") == std::string::npos; + allowed &= name.find("time_mix_decay_w2.weight") == std::string::npos; + allowed &= name.find("time_mix_lerp_fused.weight") == std::string::npos; // do not quantize relative position bias (T5) - quantize &= name.find("attn_rel_b.weight") == std::string::npos; + allowed &= name.find("attn_rel_b.weight") == std::string::npos; // do not quantize specific multimodal tensors - quantize &= name.find(".position_embd.") == std::string::npos; + allowed &= name.find(".position_embd.") == std::string::npos; - return quantize; -} - -static ggml_type tensor_get_target_type( - quantize_state_impl & qs, - const llama_model_quantize_params * params, - const ggml_tensor * tensor, - ggml_type default_type, - bool update_stats // should we update qs or no? -) { - ggml_type new_type = default_type; - // get more optimal quantization type based on the tensor shape, layer, etc. - if (!params->pure && ggml_is_quantized(default_type)) { - - // if the user provided tensor types - use those - bool manual = false; - if (params->tensor_types) { - const std::vector & tensor_types = *static_cast *>(params->tensor_types); - const std::string tensor_name(tensor->name); - for (const auto & [tname, qtype] : tensor_types) { - if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { - if (qtype != new_type) { - LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); - new_type = qtype; // if two or more types are specified for the same tensor, the last match wins - manual = true; - break; - } - } - } - } - - // if not manual - use the standard logic for choosing the quantization type based on the selected mixture - if (!manual) { - new_type = llama_tensor_get_type(qs, new_type, tensor, params->ftype, update_stats); - } - - // incompatible tensor shapes are handled here - fallback to a compatible type - { - bool convert_incompatible_tensor = false; - - const int64_t nx = tensor->ne[0]; - const int64_t ny = tensor->ne[1]; - const int64_t qk_k = ggml_blck_size(new_type); - - if (nx % qk_k != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); - convert_incompatible_tensor = true; - } else { - if (update_stats) { - ++qs.n_k_quantized; - } - } - - if (convert_incompatible_tensor) { - switch (new_type) { - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; - case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; - case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; 
break; - case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; - default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); - } - if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { - new_type = GGML_TYPE_F16; - } - if (update_stats) { - LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); - ++qs.n_fallback; - } - } - } - } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } - if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; - } - return new_type; + return allowed; } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { @@ -866,9 +867,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } gguf_add_tensor(ctx_outs[i_split].get(), tensor); - // TODO: we could save this per-tensor and correlate it with the vector of tensors so we - // don't have to call this function again later (currently twice per tensor) - ggml_type target_type = tensor_get_target_type(qs, params, tensor, default_type, false); + ggml_type target_type = llama_tensor_get_type(qs, params, tensor, default_type, false); if (!params->imatrix && tensor_allows_quantization(params, model.arch, tensor) && From 7b127e126aa6a51340695849dc9343fc053fc9a4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 13 Feb 2026 21:17:53 -0600 Subject: [PATCH 30/35] correct function names --- src/llama-quant.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 47ece2d666..4530cb1079 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -437,13 +437,15 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type return new_type; } -// determine the ggml_type that this tensor should be quantized to +// determine the ggml_type that this tensor should be quantized to. +// +// `qs` statistics will only be updated if the `update_stats` parameter is true. static ggml_type llama_tensor_get_type( quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, - bool update_stats // we only update qs if this flag is true + bool update_stats ) { ggml_type new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. @@ -973,7 +975,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // if so, what will be the target type? if (do_quantize) { - new_type = tensor_get_target_type(qs, params, tensor, default_type, true); + new_type = llama_tensor_get_type(qs, params, tensor, default_type, true); // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. 
do_quantize = tensor->type != new_type; From aaf010edeb207fa5eed03b46f89a562faa90521e Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 16 Feb 2026 12:20:16 -0600 Subject: [PATCH 31/35] new function `llama_tensor_update_stats` --- src/llama-quant.cpp | 64 ++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4530cb1079..9e7c58b167 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -175,7 +175,14 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, bool update_stats) { +// internal logic for selecting the target tensor type for a given quantization +// and model arch +static ggml_type llama_tensor_get_type_impl( + quantize_state_impl & qs, + ggml_type new_type, + const ggml_tensor * tensor, + const llama_ftype ftype +) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -257,9 +264,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; - if (update_stats) { - ++qs.i_attention_wv; - } } else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; @@ -268,9 +272,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type if (qs.i_ffn_down < qs.n_ffn_down/8) { new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } - if (update_stats) { - ++qs.i_ffn_down; - } } else if (name.find("attn_output.weight") != std::string::npos) { if (qs.model.hparams.n_expert == 8) { @@ -317,9 +318,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } - if (update_stats) { - ++qs.i_attention_wv; - } } else if (name.find("attn_k.weight") != std::string::npos) { if (qs.model.hparams.n_expert == 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB @@ -383,9 +381,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? 
GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; } - if (update_stats) { - ++qs.i_ffn_down; - } } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { @@ -419,9 +414,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } - if (update_stats) { - ++qs.i_ffn_gate; - } } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); @@ -429,23 +421,17 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } - if (update_stats) { - ++qs.i_ffn_up; - } } return new_type; } -// determine the ggml_type that this tensor should be quantized to. -// -// `qs` statistics will only be updated if the `update_stats` parameter is true. +// determine the ggml_type that this tensor should be quantized to static ggml_type llama_tensor_get_type( quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, - ggml_type default_type, - bool update_stats + const ggml_type default_type ) { ggml_type new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. @@ -470,7 +456,7 @@ static ggml_type llama_tensor_get_type( // if not manual - use the standard logic for choosing the quantization type based on the selected mixture if (!manual) { - new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, update_stats); + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype); } // incompatible tensor shapes are handled here - fallback to a compatible type @@ -484,10 +470,6 @@ static ggml_type llama_tensor_get_type( if (nx % qk_k != 0) { LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); convert_incompatible_tensor = true; - } else { - if (update_stats) { - ++qs.n_k_quantized; - } } if (convert_incompatible_tensor) { @@ -512,10 +494,6 @@ static ggml_type llama_tensor_get_type( if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { new_type = GGML_TYPE_F16; } - if (update_stats) { - LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); - ++qs.n_fallback; - } } } } @@ -528,6 +506,20 @@ static ggml_type llama_tensor_get_type( return new_type; } +// update internal quantization state statistics based on the tensor name +static void llama_tensor_update_stats(quantize_state_impl & qs, const std::string & name) { + if (name.find("attn_v.weight") != std::string::npos || + name.find("attn_kv_b.weight") != std::string::npos) { + ++qs.i_attention_wv; + } else if (name.find("ffn_down") != std::string::npos) { + ++qs.i_ffn_down; + } else if (name.find("ffn_gate") != std::string::npos) { + ++qs.i_ffn_gate; + } else if (name.find("ffn_up") != std::string::npos) { + ++qs.i_ffn_up; + } +} + static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread @@ -869,7 +861,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } 
gguf_add_tensor(ctx_outs[i_split].get(), tensor); - ggml_type target_type = llama_tensor_get_type(qs, params, tensor, default_type, false); + ggml_type target_type = llama_tensor_get_type(qs, params, tensor, default_type); if (!params->imatrix && tensor_allows_quantization(params, model.arch, tensor) && @@ -975,12 +967,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // if so, what will be the target type? if (do_quantize) { - new_type = llama_tensor_get_type(qs, params, tensor, default_type, true); + new_type = llama_tensor_get_type(qs, params, tensor, default_type); // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. do_quantize = tensor->type != new_type; } + llama_tensor_update_stats(qs, name); + void * new_data; size_t new_size; From 521a13e6c68b41120e985ad7645f03650ad9bf40 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 16 Feb 2026 12:34:51 -0600 Subject: [PATCH 32/35] correct fallback logic --- src/llama-quant.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9e7c58b167..5bce2bf221 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -175,8 +175,8 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -// internal logic for selecting the target tensor type for a given quantization -// and model arch +// internal standard logic for selecting the target tensor type for a specific +// quantization mixture & model architecture static ggml_type llama_tensor_get_type_impl( quantize_state_impl & qs, ggml_type new_type, @@ -422,7 +422,6 @@ static ggml_type llama_tensor_get_type_impl( new_type = GGML_TYPE_IQ3_XXS; } } - return new_type; } @@ -468,7 +467,7 @@ static ggml_type llama_tensor_get_type( const int64_t qk_k = ggml_blck_size(new_type); if (nx % qk_k != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); + LLAMA_LOG_WARN("\n%s: tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); convert_incompatible_tensor = true; } @@ -489,7 +488,7 @@ static ggml_type llama_tensor_get_type( case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; - default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); + default: throw std::runtime_error("unsupported tensor size"); } if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { new_type = GGML_TYPE_F16; From 3c1f94a49d331d96ce7f2469fb901eebc10803bf Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 16 Feb 2026 13:13:44 -0600 Subject: [PATCH 33/35] correct fallback logic --- src/llama-quant.cpp | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5bce2bf221..afec667dc1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -89,7 +89,6 @@ struct quantize_state_impl { int i_ffn_gate = 0; int i_ffn_up = 0; - int n_k_quantized = 0; int n_fallback = 0; bool has_imatrix = false; @@ -493,6 +492,7 @@ static ggml_type llama_tensor_get_type( if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { new_type = GGML_TYPE_F16; } + ++qs.n_fallback; } } } @@ -505,20 +505,6 @@ static ggml_type llama_tensor_get_type( return new_type; } -// update internal 
quantization state statistics based on the tensor name -static void llama_tensor_update_stats(quantize_state_impl & qs, const std::string & name) { - if (name.find("attn_v.weight") != std::string::npos || - name.find("attn_kv_b.weight") != std::string::npos) { - ++qs.i_attention_wv; - } else if (name.find("ffn_down") != std::string::npos) { - ++qs.i_ffn_down; - } else if (name.find("ffn_gate") != std::string::npos) { - ++qs.i_ffn_gate; - } else if (name.find("ffn_up") != std::string::npos) { - ++qs.i_ffn_up; - } -} - static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread @@ -867,7 +853,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: tensor_requires_imatrix(tensor, target_type) ) { if (params->dry_run) { - will_require_imatrix = true; // set flag for warning later, but continue with dry run + // set flag for warning later, but continue with dry run + will_require_imatrix = true; } else { LLAMA_LOG_ERROR("\n============================================================================\n" " ERROR: this quantization requires an importance matrix!\n" @@ -880,6 +867,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } + qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; // Set split info if needed @@ -964,15 +952,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ggml_type new_type = default_type; - // if so, what will be the target type? + // if so, what will be the new type? if (do_quantize) { new_type = llama_tensor_get_type(qs, params, tensor, default_type); // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. 
do_quantize = tensor->type != new_type; - } - llama_tensor_update_stats(qs, name); + // count stats for this tensor based on its name + if (name.find("attn_v.weight") != std::string::npos || + name.find("attn_kv_b.weight") != std::string::npos) { + ++qs.i_attention_wv; + } else if (name.find("ffn_down") != std::string::npos) { + ++qs.i_ffn_down; + } else if (name.find("ffn_gate") != std::string::npos) { + ++qs.i_ffn_gate; + } else if (name.find("ffn_up") != std::string::npos) { + ++qs.i_ffn_up; + } + } void * new_data; size_t new_size; @@ -1131,8 +1129,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } if (qs.n_fallback > 0) { - LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", - __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); + LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n", + __func__, qs.n_fallback); } } @@ -1143,7 +1141,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: llama_model_quantize_params llama_model_quantize_default_params() { llama_model_quantize_params result = { /*.nthread =*/ 0, - /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, + /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q8_0, /*.output_tensor_type =*/ GGML_TYPE_COUNT, /*.token_embedding_type =*/ GGML_TYPE_COUNT, /*.allow_requantize =*/ false, From a26db356c9ce52ca8b770de47613cfaa3344bf99 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 16 Feb 2026 13:43:22 -0600 Subject: [PATCH 34/35] refactor --- src/llama-quant.cpp | 164 +++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 79 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index afec667dc1..5cc538caf0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include // Quantization types. 
Changes to this struct must be replicated in quantize.cpp @@ -19,6 +20,35 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; +struct quantize_state_impl { + const llama_model & model; + const llama_model_quantize_params * params; + + int n_attention_wv = 0; + int n_ffn_down = 0; + int n_ffn_gate = 0; + int n_ffn_up = 0; + int i_attention_wv = 0; + int i_ffn_down = 0; + int i_ffn_gate = 0; + int i_ffn_up = 0; + + int n_fallback = 0; + + bool has_imatrix = false; + + // used to figure out if a model shares tok_embd with the output weight + bool has_output = false; + + // if this flag is false, the code will skip updating this struct + bool do_count = false; + + quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params) + : model(model) + , params(params) + {} +}; + static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -76,32 +106,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map< return orig_name; } -struct quantize_state_impl { - const llama_model & model; - const llama_model_quantize_params * params; - - int n_attention_wv = 0; - int n_ffn_down = 0; - int n_ffn_gate = 0; - int n_ffn_up = 0; - int i_attention_wv = 0; - int i_ffn_down = 0; - int i_ffn_gate = 0; - int i_ffn_up = 0; - - int n_fallback = 0; - - bool has_imatrix = false; - - // used to figure out if a model shares tok_embd with the output weight - bool has_output = false; - - quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params) - : model(model) - , params(params) - {} -}; - static void llama_tensor_dequantize_impl( ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -177,7 +181,7 @@ static void llama_tensor_dequantize_impl( // internal standard logic for selecting the target tensor type for a specific // quantization mixture & model architecture static ggml_type llama_tensor_get_type_impl( - quantize_state_impl & qs, + quantize_state_impl * qs, ggml_type new_type, const ggml_tensor * tensor, const llama_ftype ftype @@ -185,13 +189,13 @@ static ggml_type llama_tensor_get_type_impl( const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants - const llm_arch arch = qs.model.arch; + const llm_arch arch = qs->model.arch; const auto tn = LLM_TN(arch); auto use_more_bits = [](int i_layer, int n_layers) -> bool { return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; }; - const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); + const int n_expert = std::max(1, (int)qs->model.hparams.n_expert); auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { if (n_expert > 1) { // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly @@ -210,9 +214,9 @@ static ggml_type llama_tensor_get_type_impl( // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings // with the quantization of the output tensor - if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { - if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { - new_type = qs.params->output_tensor_type; + if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs->has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { + if (qs->params->output_tensor_type < GGML_TYPE_COUNT) { + 
new_type = qs->params->output_tensor_type; } else { const int64_t nx = tensor->ne[0]; const int64_t qk_k = ggml_blck_size(new_type); @@ -241,8 +245,8 @@ static ggml_type llama_tensor_get_type_impl( new_type = GGML_TYPE_Q8_0; } } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") { - if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { - new_type = qs.params->token_embedding_type; + if (qs->params->token_embedding_type < GGML_TYPE_COUNT) { + new_type = qs->params->token_embedding_type; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { @@ -261,19 +265,19 @@ static ggml_type llama_tensor_get_type_impl( } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (name.find("attn_v.weight") != std::string::npos) { - if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; + if (qs->model.hparams.n_gqa() >= 4 || qs->model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } - else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { + else if (qs->model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; } else if (name.find("ffn_down") != std::string::npos) { - if (qs.i_ffn_down < qs.n_ffn_down/8) { + if (qs->i_ffn_down < qs->n_ffn_down/8) { new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } } else if (name.find("attn_output.weight") != std::string::npos) { - if (qs.model.hparams.n_expert == 8) { + if (qs->model.hparams.n_expert == 8) { new_type = GGML_TYPE_Q5_K; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; @@ -282,43 +286,43 @@ static ggml_type llama_tensor_get_type_impl( } } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { - new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs->model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs->has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) { + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs->model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + new_type = qs->i_attention_wv < 2 ? 
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs->model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        if (qs.model.type == LLM_TYPE_70B) {
+                use_more_bits(qs->i_attention_wv, qs->n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs->i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        if (qs->model.type == LLM_TYPE_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs->model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
    } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs->model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
@@ -337,13 +341,13 @@ static ggml_type llama_tensor_get_type_impl(
             new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+        auto info = layer_info(qs->i_ffn_down, qs->n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs->has_imatrix) {
             new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -352,7 +356,7 @@ static ggml_type llama_tensor_get_type_impl(
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
-                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+                    (qs->model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -366,7 +370,7 @@ static ggml_type llama_tensor_get_type_impl(
                 if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs->has_imatrix) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
@@ -374,7 +378,7 @@ static ggml_type llama_tensor_get_type_impl(
             new_type = GGML_TYPE_Q5_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
-                && qs.has_imatrix && i_layer < n_layer/8) {
+                && qs->has_imatrix && i_layer < n_layer/8) {
             // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
             // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
@@ -382,7 +386,7 @@ static ggml_type llama_tensor_get_type_impl(
         }
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs->model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
@@ -408,14 +412,14 @@ static ggml_type llama_tensor_get_type_impl(
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     } else if (name.find("ffn_gate") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
+        auto info = layer_info(qs->i_ffn_gate, qs->n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
     } else if (name.find("ffn_up") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
+        auto info = layer_info(qs->i_ffn_up, qs->n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
@@ -426,7 +430,7 @@ static ggml_type llama_tensor_get_type_impl(
 
 // determine the ggml_type that this tensor should be quantized to
 static ggml_type llama_tensor_get_type(
-    quantize_state_impl & qs,
+    quantize_state_impl * qs,
     const llama_model_quantize_params * params,
     const ggml_tensor * tensor,
     const ggml_type default_type
@@ -492,7 +496,9 @@ static ggml_type llama_tensor_get_type(
             if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
                 new_type = GGML_TYPE_F16;
             }
-            ++qs.n_fallback;
+            if (qs->do_count) {
+                ++qs->n_fallback;
+            }
         }
     }
 }
@@ -708,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);
 
-    quantize_state_impl qs(model, params);
+    auto qs = std::make_unique<quantize_state_impl>(model, params);
 
     if (params->only_copy) {
         ftype = ml.ftype;
@@ -718,7 +724,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
-            qs.has_imatrix = true;
+            qs->has_imatrix = true;
             // check imatrix for nans or infs
             for (const auto & kv : *imatrix_data) {
                 for (float f : kv.second) {
@@ -830,15 +836,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         const ggml_tensor * tensor = it->tensor;
         const std::string name = tensor->name;
 
-        // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos ||
-            name.find("attn_qkv.weight") != std::string::npos ||
-            name.find("attn_kv_b.weight")!= std::string::npos) {
-            ++qs.n_attention_wv;
-        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
-            qs.has_output = true;
-        }
-
         // populate the original tensors so we get an initial meta data
         uint16_t i_split = params->keep_split ? it->idx : 0;
         if (!ctx_outs[i_split]) {
@@ -867,9 +864,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
     }
 
-    qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
-    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
-
     // Set split info if needed
     if (n_split > 1) {
         for (size_t i = 0; i < ctx_outs.size(); ++i) {
@@ -918,6 +912,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
+    qs->n_ffn_down = qs->n_ffn_gate = qs->n_ffn_up = (int)model.hparams.n_layer;
+    qs->do_count = true; // we start counting stats for the main loop
+
     // iterate over all weights (main loop)
     for (const auto * it : weights) {
         const auto & weight = *it;
@@ -931,6 +928,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         const std::string name = ggml_get_name(tensor);
         const size_t tensor_size = ggml_nbytes(tensor);
 
+        // TODO: avoid hardcoded tensor names - use the TN_* constants
+        if (name.find("attn_v.weight") != std::string::npos ||
+            name.find("attn_qkv.weight") != std::string::npos ||
+            name.find("attn_kv_b.weight")!= std::string::npos) {
+            ++qs->n_attention_wv;
+        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+            qs->has_output = true;
+        }
+
         if (!params->dry_run) {
             if (!ml.use_mmap) {
                 if (read_data.size() < tensor_size) {
@@ -962,13 +968,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             // count stats for this tensor based on its name
             if (name.find("attn_v.weight") != std::string::npos || name.find("attn_kv_b.weight") != std::string::npos) {
-                ++qs.i_attention_wv;
+                ++qs->i_attention_wv;
             } else if (name.find("ffn_down") != std::string::npos) {
-                ++qs.i_ffn_down;
+                ++qs->i_ffn_down;
             } else if (name.find("ffn_gate") != std::string::npos) {
-                ++qs.i_ffn_gate;
+                ++qs->i_ffn_gate;
             } else if (name.find("ffn_up") != std::string::npos) {
-                ++qs.i_ffn_up;
+                ++qs->i_ffn_up;
             }
         }
 
@@ -1128,9 +1134,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         );
     }
 
-    if (qs.n_fallback > 0) {
+    if (qs->n_fallback > 0) {
         LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
-                __func__, qs.n_fallback);
+                __func__, qs->n_fallback);
     }
 }

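Note (illustrative aside, not part of any patch in this series): the quantize_state_impl::do_count flag introduced above, and used more heavily in the refactor that follows, lets the type-selection logic run more than once per tensor, once while only sizing the output (as --dry-run does) and once in the main quantization loop, without double-counting the per-tensor indices that the layer-dependent rules read. A minimal, self-contained C++ sketch of that pattern, with simplified stand-in names rather than the real llama.cpp API:

    // build: g++ -std=c++17 two_pass_sketch.cpp
    #include <cstdio>
    #include <string>
    #include <vector>

    struct counters {
        int  i_ffn_down = 0;     // stand-in for qs->i_ffn_down
        bool do_count   = false; // when false, select_type() must leave the counters untouched
    };

    // stand-in for llama_tensor_get_type_impl(): picks a type label and, only when
    // counting is enabled, advances the index used by the layer-dependent rule
    static std::string select_type(counters * qs, const std::string & name) {
        std::string type = "Q4_K";
        if (name.find("ffn_down") != std::string::npos) {
            if (qs->i_ffn_down < 2) {
                type = "Q6_K"; // hypothetical "give the first layers more bits" rule
            }
            if (qs->do_count) {
                ++qs->i_ffn_down;
            }
        }
        return type;
    }

    int main() {
        const std::vector<std::string> names = {
            "blk.0.ffn_down.weight", "blk.1.ffn_down.weight", "blk.2.ffn_down.weight",
        };
        counters qs;

        // pass 1: sizing / dry-run style pass; the counters must not advance here
        for (const auto & name : names) {
            select_type(&qs, name);
        }

        // pass 2: the "real" pass; counters advance so every tensor sees the right index
        qs.do_count = true;
        for (const auto & name : names) {
            std::printf("%s -> %s\n", name.c_str(), select_type(&qs, name).c_str());
        }
        return 0;
    }

If pass 1 were allowed to count, pass 2 would start at i_ffn_down = 3 and the layer-dependent rule would never fire; gating the increments on do_count is what keeps the two passes consistent.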

From ce0ad2986b4e044b24240059cc64437387989e65 Mon Sep 17 00:00:00 2001
From: ddh0
Date: Mon, 16 Feb 2026 13:59:13 -0600
Subject: [PATCH 35/35] refactor

---
 src/llama-quant.cpp | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5cc538caf0..93fef63bd2 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -40,7 +40,7 @@ struct quantize_state_impl {
     // used to figure out if a model shares tok_embd with the output weight
     bool has_output = false;
 
-    // if this flag is false, the code will skip updating this struct
+    // if this flag is false, the code will skip updating the per-tensor counters
     bool do_count = false;
 
     quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
@@ -181,10 +181,10 @@ static void llama_tensor_dequantize_impl(
 // internal standard logic for selecting the target tensor type for a specific
 // quantization mixture & model architecture
 static ggml_type llama_tensor_get_type_impl(
-    quantize_state_impl * qs,
-    ggml_type new_type,
-    const ggml_tensor * tensor,
-    const llama_ftype ftype
+    quantize_state_impl * qs,
+    ggml_type new_type,
+    const ggml_tensor * tensor,
+    llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
 
@@ -267,6 +267,9 @@ static ggml_type llama_tensor_get_type_impl(
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs->model.hparams.n_gqa() >= 4 || qs->model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            if (qs->do_count) {
+                ++qs->i_attention_wv;
+            }
         }
         else if (qs->model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
@@ -275,6 +278,9 @@ static ggml_type llama_tensor_get_type_impl(
             if (qs->i_ffn_down < qs->n_ffn_down/8) {
                 new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
+            if (qs->do_count) {
+                ++qs->i_ffn_down;
+            }
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
             if (qs->model.hparams.n_expert == 8) {
@@ -321,6 +327,9 @@ static ggml_type llama_tensor_get_type_impl(
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
+        if (qs->do_count) {
+            ++qs->i_attention_wv;
+        }
     } else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs->model.hparams.n_expert == 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -384,6 +393,9 @@ static ggml_type llama_tensor_get_type_impl(
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
             new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
+        if (qs->do_count) {
+            ++qs->i_ffn_down;
+        }
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
             if (qs->model.hparams.n_expert == 8) {
@@ -417,6 +429,9 @@ static ggml_type llama_tensor_get_type_impl(
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
+        if (qs->do_count) {
+            ++qs->i_ffn_gate;
+        }
     } else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs->i_ffn_up, qs->n_ffn_up, name.c_str());
@@ -424,6 +439,9 @@ static ggml_type llama_tensor_get_type_impl(
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
+        if (qs->do_count) {
+            ++qs->i_ffn_up;
+        }
     }
     return new_type;
 }
@@ -714,6 +732,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);
 
+    // quantize_state_impl qs(model, params);
     auto qs = std::make_unique<quantize_state_impl>(model, params);
 
     if (params->only_copy) {
@@ -843,7 +862,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
 
         gguf_add_tensor(ctx_outs[i_split].get(), tensor);
-        ggml_type target_type = llama_tensor_get_type(qs, params, tensor, default_type);
+        ggml_type target_type = llama_tensor_get_type(qs.get(), params, tensor, default_type);
 
         if (!params->imatrix &&
             tensor_allows_quantization(params, model.arch, tensor) &&
@@ -960,7 +979,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // if so, what will be the new type?
         if (do_quantize) {
-            new_type = llama_tensor_get_type(qs, params, tensor, default_type);
+            new_type = llama_tensor_get_type(qs.get(), params, tensor, default_type);
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             do_quantize = tensor->type != new_type;