From ba7335efb363515052a5f8aa755e4a5cd1250150 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 09:54:29 +0100 Subject: [PATCH 001/155] Refactor variable name --- include/llama.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/llama.h b/include/llama.h index 545e957e5f..b17e8f3353 100644 --- a/include/llama.h +++ b/include/llama.h @@ -354,6 +354,7 @@ extern "C" { bool pure; // quantize all tensors to the default type bool keep_split; // quantize to the same number of shards void * imatrix; // pointer to importance matrix data + void * activations; // pointer to activations data void * kv_overrides; // pointer to vector containing overrides void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune From 4d9491141b591d31f7fb91940ef4b1cf41bf94f6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:43:21 +0100 Subject: [PATCH 002/155] Add target_bpw parameter --- include/llama.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/llama.h b/include/llama.h index b17e8f3353..f44e2383d0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -358,6 +358,7 @@ extern "C" { void * kv_overrides; // pointer to vector containing overrides void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune + float target_bpw; // target bits per weight (bpw) } llama_model_quantize_params; typedef struct llama_logit_bias { From cfec4048abc478cd2769d1908e3ecc53ad2f28bd Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:43:51 +0100 Subject: [PATCH 003/155] Update usage --- tools/quantize/quantize.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 470dc3d916..b2d62f1490 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,6 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0 \n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. 
May be specified multiple times.\n"); From 5e85fb3ff34c5253c3dfa51eb5b9b9bfd6aaaaea Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:46:36 +0100 Subject: [PATCH 004/155] Add parse_target_bpw() --- tools/quantize/quantize.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index b2d62f1490..afd2edb156 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -441,6 +441,27 @@ static bool parse_layer_prune(const char * data, std::vector & prune_layers return true; } +static bool parse_target_bpw(const char * data, float & target_bpw) { + if (!data) { + printf("\n%s: no target bits per weight (bpw) provided\n\n", __func__); + return false; + } + + try { + target_bpw = std::stof(data); + if (target_bpw < 0.0f || target_bpw > 8.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); + return false; + } + } + catch (const std::exception & e) { + printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); + return false; + } + + return true; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); From e6d55dc47b42054dcef4a72145cfffb3cb26bd0f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:49:01 +0100 Subject: [PATCH 005/155] Load activations --- tools/quantize/quantize.cpp | 46 ++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index afd2edb156..3d07abd2d0 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -247,56 +247,69 @@ static int load_imatrix(const std::string & imatrix_file, std::vector> sums_counts_for; + std::map> sums_counts_for; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { std::string name = cur->name; if (name.empty()) { continue; } - if (string_remove_suffix(name, sums_suffix)) { + if (string_remove_suffix(name, sums2_suffix)) { // in_sum2 - sums_counts_for[std::move(name)].first = cur; + std::get<0>(sums_counts_for[std::move(name)]) = cur; } else if (string_remove_suffix(name, counts_suffix)) { // counts - sums_counts_for[std::move(name)].second = cur; - } else { + std::get<1>(sums_counts_for[std::move(name)]) = cur; + } else if (string_remove_suffix(name, sums_suffix)) { + // in_sum + std::get<2>(sums_counts_for[std::move(name)]) = cur; + } + else { // ignore other tensors } } for (const auto & sc : sums_counts_for) { const std::string & name = sc.first; - const struct ggml_tensor * sums = sc.second.first; - const struct ggml_tensor * counts = sc.second.second; + const struct ggml_tensor * sums = std::get<2>(sc.second); + const struct ggml_tensor * sums2 = std::get<0>(sc.second); + const struct ggml_tensor * counts = std::get<1>(sc.second); - if (!sums || !counts) { + // check that sums, sums2 and counts have the same shape + if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) { fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str()); gguf_free(ctx_gguf); ggml_free(ctx); exit(1); } - const int64_t ne0 = sums->ne[0]; - const int64_t ne1 = sums->ne[1]; + const int64_t ne0 = sums2->ne[0]; + const int64_t ne1 = sums2->ne[1]; - auto & e = imatrix_data[name]; - e.resize(ggml_nelements(sums)); + auto & activations = activations_data[name]; + auto & 
values = values_data[name]; + if (sums) { + activations.resize(ggml_nelements(sums)); + } + values.resize(ggml_nelements(sums2)); float max_count = 0.0f; for (int64_t j = 0; j < ne1; ++j) { const float count = ((const float *) counts->data)[j]; if (count > 0.0f) { for (int64_t i = 0; i < ne0; ++i) { - e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; + values[j*ne0 + i] = ((const float *) sums2->data)[j*ne0 + i] / count; + if (sums) { activations[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; } } } else { // Partial imatrix data, this tensor never got any input during calibration for (int64_t i = 0; i < ne0; ++i) { - e[j*ne0 + i] = 1; + values[j*ne0 + i] = 1; + if (sums) { activations[j*ne0 + i] = 0; } } } if (count > max_count) { @@ -304,7 +317,8 @@ static int load_imatrix(const std::string & imatrix_file, std::vector Date: Tue, 19 Aug 2025 10:50:37 +0100 Subject: [PATCH 006/155] Populate activations_data with imatrix activations if present --- tools/quantize/quantize.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 3d07abd2d0..c2a4767fc9 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -561,10 +561,11 @@ int main(int argc, char ** argv) { } std::vector imatrix_datasets; - std::unordered_map> imatrix_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data); - if (!imatrix_data.empty()) { - params.imatrix = &imatrix_data; + std::unordered_map> values_data; + std::unordered_map> activations_data; + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data); + if (!values_data.empty()) { + params.imatrix = &values_data; { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE); From 0edbf0c176236b795d8707504388052839556b67 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:51:58 +0100 Subject: [PATCH 007/155] Process activations --- tools/quantize/quantize.cpp | 51 +++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index c2a4767fc9..2c45adab75 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -215,7 +215,10 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & imatrix_data) { +static int load_imatrix(const std::string & imatrix_file, + std::vector & imatrix_datasets, + std::unordered_map> & values_data, + std::unordered_map> & activations_data) { struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -225,7 +228,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector & imatrix_dataset, const std::vector & included_weights, const std::vector & excluded_weights, - std::unordered_map> & imatrix_data) { + std::unordered_map> & values_data, + std::unordered_map> & activations_data) { int m_last_call = -1; if (!imatrix_file.empty()) { - m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data); } - if (imatrix_data.empty()) { + if (values_data.empty()) { return m_last_call; } if (!excluded_weights.empty()) { for (const auto & name : excluded_weights) { - for (auto it = imatrix_data.begin(); it != imatrix_data.end();) 
{ + for (auto it = values_data.begin(); it != values_data.end();) { auto pos = it->first.find(name); if (pos != std::string::npos) { - it = imatrix_data.erase(it); + it = values_data.erase(it); } else { ++it; } } + for (auto at = activations_data.begin(); at != activations_data.end();) { + auto pos = at->first.find(name); + if (pos != std::string::npos) { + at = activations_data.erase(at); + } else { + ++at; + } + } } } if (!included_weights.empty()) { - std::unordered_map> tmp; + std::unordered_map> tmp_values; + std::unordered_map> tmp_activations; for (const auto & name : included_weights) { - for (auto & e : imatrix_data) { + for (auto & e : values_data) { auto pos = e.first.find(name); if (pos != std::string::npos) { - tmp.emplace(std::move(e)); + tmp_values.emplace(std::move(e)); + } + } + for (auto & a : activations_data) { + auto pos = a.first.find(name); + if (pos != std::string::npos) { + tmp_activations.emplace(std::move(a)); } } } - imatrix_data = std::move(tmp); + values_data = std::move(tmp_values); + activations_data = std::move(tmp_activations); } - if (!imatrix_data.empty()) { - printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size())); + if (!values_data.empty()) { + printf("%s: have %d importance matrix value entries\n", __func__, int(values_data.size())); + } + if (!activations_data.empty()) { + printf("%s: have %d importance matrix activation entries\n", __func__, int(activations_data.size())); } return m_last_call; } From e8774744584689db682866b71121597fe4d35c84 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:54:02 +0100 Subject: [PATCH 008/155] Process target_bpw parameter --- tools/quantize/quantize.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 2c45adab75..5331dec80c 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -512,6 +512,7 @@ int main(int argc, char ** argv) { std::vector kv_overrides; std::vector tensor_types; std::vector prune_layers; + float target_bpw = -1.0f; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -538,6 +539,10 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--target-bpw") == 0) { + if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From 1b3d5b574414ffc03c5d575ef470c74f4e509a80 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:56:02 +0100 Subject: [PATCH 009/155] Populate params --- tools/quantize/quantize.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 5331dec80c..86a96cdfcc 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -616,7 +616,7 @@ int main(int argc, char ** argv) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.val_i64 = imatrix_data.size(); + kvo.val_i64 = values_data.size(); kv_overrides.emplace_back(std::move(kvo)); } @@ -628,6 +628,9 @@ int main(int argc, char ** argv) { kv_overrides.emplace_back(std::move(kvo)); } } + if (!activations_data.empty()) { + 
params.activations = &activations_data; + } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; @@ -639,6 +642,9 @@ int main(int argc, char ** argv) { if (!prune_layers.empty()) { params.prune_layers = &prune_layers; } + if (target_bpw != -1.0f) { + params.target_bpw = target_bpw; + } llama_backend_init(); @@ -701,7 +707,7 @@ int main(int argc, char ** argv) { params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) { + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && values_data.empty()) { fprintf(stderr, "\n==========================================================================================================\n"); fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); fprintf(stderr, "==========================================================================================================\n\n\n"); From a22a9deeeeb51e6f647bb185301b9874538d0324 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:57:44 +0100 Subject: [PATCH 010/155] Refactor variable and add target_bpw --- src/llama-quant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1d0361cc16..2e1ca7216e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1062,9 +1062,11 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.pure =*/ false, /*.keep_split =*/ false, /*.imatrix =*/ nullptr, + /*.activations =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, - /*.prune_layers =*/ nullptr + /*.prune_layers =*/ nullptr, + /*.target_bpw =*/ -1.0f }; return result; From c96b8eef949b479d505b63788d2c214e4221abcb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 11:00:05 +0100 Subject: [PATCH 011/155] Add fallback_type enum --- src/llama-quant.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2e1ca7216e..b2879bc847 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -19,6 +19,32 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; +static enum ggml_type fallback_type(const enum ggml_type new_type) { + switch (new_type) { + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + return GGML_TYPE_Q4_0; // symmetric-ish fallback + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_IQ4_XS: + return GGML_TYPE_IQ4_NL; + case GGML_TYPE_Q4_K: + return GGML_TYPE_Q5_0; + case GGML_TYPE_Q5_K: + return GGML_TYPE_Q5_1; + case GGML_TYPE_Q6_K: + return GGML_TYPE_Q8_0; + default: + return new_type; + } +} static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { From 9adae08789aefeb945b55858afbdf047e818147f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 11:00:50 +0100 Subject: [PATCH 012/155] Add is_iq() --- src/llama-quant.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b2879bc847..1e837a7d41 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -19,6 +19,22 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; +static bool 
is_iq(const enum ggml_type t) { + switch (t) { + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + return true; + default: + return false; + } +} static enum ggml_type fallback_type(const enum ggml_type new_type) { switch (new_type) { case GGML_TYPE_TQ1_0: From 017945a3b20726dc000da1245ecdbf539a7ba0cf Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 11:03:52 +0100 Subject: [PATCH 013/155] Validate if imatrix contains activations --- src/llama-quant.cpp | 48 ++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1e837a7d41..fdda5d35a1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -35,6 +35,7 @@ static bool is_iq(const enum ggml_type t) { return false; } } + static enum ggml_type fallback_type(const enum ggml_type new_type) { switch (new_type) { case GGML_TYPE_TQ1_0: @@ -61,6 +62,7 @@ static enum ggml_type fallback_type(const enum ggml_type new_type) { return new_type; } } + static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -131,10 +133,11 @@ struct quantize_state_impl { int i_ffn_gate = 0; int i_ffn_up = 0; - int n_k_quantized = 0; - int n_fallback = 0; + int n_k_quantized = 0; + int n_fallback = 0; - bool has_imatrix = false; + bool has_imatrix = false; + bool has_activations = false; // used to figure out if a model shares tok_embd with the output weight bool has_output = false; @@ -652,14 +655,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->only_copy) { ftype = ml.ftype; } - const std::unordered_map> * imatrix_data = nullptr; + const std::unordered_map> * values_data = nullptr; + const std::unordered_map> * activations_data = nullptr; if (params->imatrix) { - imatrix_data = static_cast>*>(params->imatrix); - if (imatrix_data) { - LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size())); + values_data = static_cast>*>(params->imatrix); + if (values_data) { + LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(values_data->size())); qs.has_imatrix = true; // check imatrix for nans or infs - for (const auto & kv : *imatrix_data) { + for (const auto & kv : *values_data) { for (float f : kv.second) { if (!std::isfinite(f)) { throw std::runtime_error(format("imatrix contains non-finite value %f\n", f)); @@ -668,8 +672,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } + if (params->activations) { + activations_data = static_cast>*>(params->activations); + if (activations_data) { + LLAMA_LOG_INFO("================================ Have activations data with %d entries\n",int(activations_data->size())); + qs.has_activations = true; + // check activations for nans or infs + for (const auto & kv : *activations_data) { + for (float f : kv.second) { + if (!std::isfinite(f)) { + throw std::runtime_error(format("activations contain non-finite value %f\n", f)); + } + } + } + } + } - const size_t align = GGUF_DEFAULT_ALIGNMENT; gguf_context_ptr ctx_out { gguf_init_empty() }; std::vector prune_list = {}; @@ -846,6 +864,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const auto tn = LLM_TN(model.arch); new_ofstream(0); for (const 
auto * it : tensors) { + const size_t align = GGUF_DEFAULT_ALIGNMENT; const auto & weight = *it; ggml_tensor * tensor = weight.tensor; if (weight.idx != cur_split && params->keep_split) { @@ -864,10 +883,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ml.load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", - ++idx, ml.n_tensors, - ggml_get_name(tensor), - llama_format_tensor_shape(tensor).c_str(), - ggml_type_name(tensor->type)); + ++idx, ml.n_tensors, ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type)); // This used to be a regex, but has an extreme cost to compile times. bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? @@ -967,9 +983,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const int64_t nelements = ggml_nelements(tensor); const float * imatrix = nullptr; - if (imatrix_data) { - auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped)); - if (it == imatrix_data->end()) { + if (values_data) { + auto it = values_data->find(remap_imatrix(tensor->name, mapped)); + if (it == values_data->end()) { LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); } else { if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { From 92f49ab39949221ff84b4f70d4528e4f5f43db93 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 11:05:01 +0100 Subject: [PATCH 014/155] Add target_bpw_type() logic --- src/llama-quant.cpp | 482 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 482 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fdda5d35a1..1e24303c52 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -575,6 +575,488 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } +// Returns per-tensor overrides of quantization types to meet target BPW with best expected quality. 
+// imatrix_data: map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a^2] by expert +// activations_data: optional map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a] by expert +// bias_lambda: relative weight on bias term (|sum e_j * E[a_j]|) vs MSE term (sum e_j^2 * E[a_j^2]) +static std::unordered_map target_bpw_type( + llama_model_loader & ml, + std::vector> & read_data, + const llama_model & model, + const std::vector & tensors, + const std::map & mapped, + const std::unordered_map> * values_data, + const std::unordered_map> * activations_data, + float target_bpw, + int nthread, + int sample_rows_per_expert = 128, + float bias_lambda = 1.0 +) { + struct candidate_types { + ggml_type type; + float bpw; + size_t bytes; + float error; // lower is better + }; + + struct tensor_info { + const llama_model_loader::llama_tensor_weight * w; + std::vector candidate; // sorted by bpw ascending + int choice = -1; // index into cand + float min_bpw = 0.0; + float max_bpw = 0.0; + size_t n_elements = 0; + }; + + auto name_tn = LLM_TN(model.arch); + + // The candidate types we consider; adjust as needed + const ggml_type base_candidates[] = { + // Model's + GGML_TYPE_IQ1_S, + GGML_TYPE_IQ1_M, + GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, + GGML_TYPE_IQ2_S, + GGML_TYPE_IQ3_XXS, + GGML_TYPE_IQ3_S, + GGML_TYPE_IQ4_XS, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q2_K, + GGML_TYPE_Q3_K, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, + GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 + }; + + auto can_quantize = [&](const ggml_tensor * t) -> bool { + const std::string name = ggml_get_name(t); + bool q = name.rfind("weight") == name.size() - 6; + q &= (ggml_n_dims(t) >= 2); + q &= name.find("_norm.weight") == std::string::npos; + //q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); + //q &= name != name_tn(LLM_TENSOR_OUTPUT, "weight"); + q &= name.find("ffn_gate_inp.weight") == std::string::npos; + q &= name.find("altup") == std::string::npos; + q &= name.find("laurel") == std::string::npos; + q &= name.find("per_layer_model_proj") == std::string::npos; + q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight"); + q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight"); + q &= name.find("ssm_conv1d.weight") == std::string::npos; + q &= name.find("shortconv.conv.weight") == std::string::npos; + q &= name.find("time_mix_first.weight") == std::string::npos; + q &= name.find("time_mix_w0.weight") == std::string::npos; + q &= name.find("time_mix_w1.weight") == std::string::npos; + q &= name.find("time_mix_w2.weight") == std::string::npos; + q &= name.find("time_mix_v0.weight") == std::string::npos; + q &= name.find("time_mix_v1.weight") == std::string::npos; + q &= name.find("time_mix_v2.weight") == std::string::npos; + q &= name.find("time_mix_a0.weight") == std::string::npos; + q &= name.find("time_mix_a1.weight") == std::string::npos; + q &= name.find("time_mix_a2.weight") == std::string::npos; + q &= name.find("time_mix_g1.weight") == std::string::npos; + q &= name.find("time_mix_g2.weight") == std::string::npos; + q &= name.find("time_mix_decay_w1.weight") == std::string::npos; + q &= name.find("time_mix_decay_w2.weight") == std::string::npos; + q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + q &= name.find("attn_rel_b.weight") == std::string::npos; + return q; + }; + + auto get_values = [&](const std::string & tensor_name) -> const float * { + if (!values_data) { return nullptr; } + const auto it = 
values_data->find(remap_imatrix(tensor_name, mapped)); + if (it == values_data->end()) { return nullptr; } + return it->second.data(); + }; + + auto get_activations = [&](const std::string & tensor_name) -> const float * { + if (!activations_data) { return nullptr; } + const auto it = activations_data->find(remap_imatrix(tensor_name, mapped)); + if (it == activations_data->end()) { return nullptr; } + return it->second.data(); + }; + + auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { + const int64_t n_per_row = t->ne[0]; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const size_t row_sz = ggml_row_size(typ, n_per_row); + return (size_t)ne2 * (size_t)nrows * row_sz; + }; + + auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { + const int64_t nelem = ggml_nelements(t); + const size_t bytes = total_bytes(t, typ); + return bytes * 8.0 / nelem; + }; + + auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { + const int64_t n_per_row = t->ne[0]; + const int64_t blck = ggml_blck_size(typ); + if (blck <= 1) { return true; } // FP16/BF16/Q8_0 etc + return n_per_row % blck == 0; + }; + + auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { + if (is_compatible(t, typ)) { return typ; } + ggml_type fb = fallback_type(typ); + if (is_compatible(t, fb)) { return fb; } + return GGML_TYPE_F16; // final guard + }; + + // Estimate error for a given type using a sampled subset of rows. + // Uses both imatrix (E[a^2]) and activations (E[a]) if available. + auto estimate_error = [&](const ggml_tensor * t, const float * f32_data, const ggml_type typ, const float * values_all, const float * activations_all) -> double { + const int64_t n_per_row = t->ne[0]; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + + const ggml_type_traits * traits = ggml_get_type_traits(typ); + if (!traits || !traits->to_float) { + // cannot dequantize candidate -> assign very high error + return 1e35f; + } + + // Sampling plan: for each expert slice, take up to sample_rows rows spread uniformly + const int64_t rows_per_expert = nrows; + const int64_t sample_rows = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); + const int64_t stride = std::max(1, rows_per_expert / sample_rows); + + const size_t row_sz = ggml_row_size(typ, n_per_row); + std::vector qbuf(row_sz * sample_rows); + std::vector f32_sample(sample_rows * n_per_row); + std::vector deq(sample_rows * n_per_row); + + float total_err = 0.0; + + for (int64_t i03 = 0; i03 < ne2; ++i03) { + const float * value = values_all ? (values_all + i03 * n_per_row) : nullptr; + const float * activation = activations_all ? 
(activations_all + i03 * n_per_row) : nullptr; + + // Assemble sampled rows into contiguous f32_sample + int64_t rs = 0; + for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) { + const float * src = f32_data + i03 * (n_per_row * rows_per_expert) + r * n_per_row; + std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row); + ++rs; + } + if (rs == 0) { continue; } + + // Quantize sampled rows in one chunk; pass the imatrix for this expert slice + const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); + (void)got; // not strictly needed here + + // Dequantize + traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); + + // Compute error proxy per sampled row + for (int64_t s = 0; s < rs; ++s) { + const float * xs = f32_sample.data() + s * n_per_row; + const float * ys = deq.data() + s * n_per_row; + + float mse_w = 0.0; + float bias = 0.0; + float bias_sum = 0.0; + + if (value) { + for (int64_t j = 0; j < n_per_row; ++j) { + const float e = ys[j] - xs[j]; + mse_w += e * e * value[j]; + if (activation) { + bias_sum += e * activation[j]; + } + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const float e = ys[j] - xs[j]; + mse_w += e*e; + if (activation) { + bias_sum += e * activation[j]; + } + } + } + + if (activation) { + bias = std::abs(bias_sum); + } + + // Normalize by n_per_row to get a per-row average scale + float row_err = mse_w / std::max(1, n_per_row); + if (bias_lambda != 0.0) { + row_err += bias_lambda * (bias / std::max(1, n_per_row)); + } + + total_err += row_err; + } + + // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor + const float scale_rows = rows_per_expert / std::max(1, rs); + total_err *= scale_rows; + } + + return total_err; + }; + + // Produce per-tensor candidate lists + std::vector all; + all.reserve(tensors.size()); + + for (const auto * tw : tensors) { + // Temporary workers for dequantization + std::vector workers; + workers.reserve(std::max(1, nthread)); + + ggml_tensor * t = tw->tensor; + const std::string name = ggml_get_name(t); + + if (!can_quantize(t)) { + continue; + } + + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); + if (!ml.use_mmap) { + if (read_data.size() < ggml_nbytes(t)) { + read_data.resize(ggml_nbytes(t)); + } + t->data = read_data.data(); + } + ml.load_data_for(t); + + // Prepare f32 weights for error estimates + const int64_t nelem = ggml_nelements(t); + std::vector> f32_conv_buf; + float * f32_data = nullptr; + + if (t->type == GGML_TYPE_F32) { + f32_data = (float *)t->data; + } else { + llama_tensor_dequantize_impl(t, f32_conv_buf, workers, nelem, nthread); + f32_data = (float *)f32_conv_buf.data(); + } + + const float * values = get_values(name); + const float * activations = get_activations(name); + + tensor_info info; + info.w = tw; + info.n_elements = nelem; + + // Candidate build with compatibility handling and availability checks + for (ggml_type ts_type : base_candidates) { + // Skip IQ* without imatrix + if (is_iq(ts_type) && !values) { continue; } + ggml_type tt = make_compatible(t, ts_type); + // After fallback, if still incompatible, skip + if (!is_compatible(t, tt)) { continue; } + + // Compute bpw and bytes + auto bpw = (float)tensor_bpw(t, tt); + size_t bytes = total_bytes(t, tt); + + // Estimate error + auto err = (float)estimate_error(t, f32_data, tt, values, activations); + + 
info.candidate.push_back(candidate_types{tt, bpw, bytes, err}); + } + + if (info.candidate.empty()) { + // as a last resort, keep original type + float bpw = ggml_nbytes(t) * 8.0f / nelem; + info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0}); + } + + // Sort by bpw ascending + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + if (a.bpw != b.bpw) { return a.bpw < b.bpw; } + if (a.error != b.error) { return a.error < b.error; } + return a.bytes < b.bytes; + }); + + // collapse candidates with identical storage size (bytes) + { + std::vector uniq; + uniq.reserve(info.candidate.size()); + + for (size_t i = 0; i < info.candidate.size(); ) { + size_t j = i + 1; + candidate_types best = info.candidate[i]; + // group same-byte entries, keep the one with the lowest error + while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) { + if (info.candidate[j].error < best.error) { best = info.candidate[j]; } + ++j; + } + uniq.push_back(best); + i = j; + } + info.candidate.swap(uniq); + } + + // Initialize choice at the smallest bpw candidate + info.choice = 0; + info.min_bpw = info.candidate.front().bpw; + info.max_bpw = info.candidate.back().bpw; + + all.push_back(std::move(info)); + } + + if (all.empty()) { return {}; } + + // Greedy allocation from minimum bpw upward to reach target_bpw + // Start with minimal bpw assignment + auto current_total_bytes = [&]() -> size_t { + size_t b = 0; + for (const auto & ti : all) { + b += ti.candidate[ti.choice].bytes; + } + return b; + }; + + auto total_weights = [&]() -> size_t { + size_t w = 0; + for (const auto & ti : all) { + w += ti.n_elements; + } + return w; + }; + + const size_t tw = total_weights(); + auto current_bpw = [&]() -> double { + return (double)current_total_bytes() * 8.0f / (double)tw; + }; + + // Precompute current bpw + double bpw_now = current_bpw(); + + // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw) + if (bpw_now >= target_bpw) { + std::unordered_map overrides; + for (const auto & ti : all) { + overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; + } + return overrides; + } + + struct upgrade { + int idx; // tensor index + int next; // next candidate index (strictly larger bytes) + double err; // error reduction + size_t delta_bytes; // increase in bytes + double ratio; // err per added bit + }; + + // Find next strictly-larger candidate index for a tensor + auto next_distinct_idx = [&](const tensor_info &ti) -> int { + const auto &cand = ti.candidate; + const auto &cur = cand[ti.choice]; + int j = ti.choice + 1; + while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j; + return j < (int)cand.size() ? 
j : -1; + }; + + auto recompute_best_upgrade = [&]() -> upgrade { + const double eps = 1e-12; + upgrade best{-1, -1, 0.0, 0, -1.0}; + for (int i = 0; i < (int)all.size(); ++i) { + const auto &ti = all[i]; + if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } + + int j = next_distinct_idx(ti); + if (j < 0) { continue; } // no larger-size candidate remains + + const auto &cur = ti.candidate[ti.choice]; + const auto &nxt = ti.candidate[j]; + + size_t delta_bytes = nxt.bytes - cur.bytes; + if (delta_bytes == 0) { continue; } // should not happen after dedup, but be safe + + double err = (double)cur.error - (double)nxt.error; + err = std::max(err, 0.0); // do not penalize due to sampling noise + + double ratio = err / (double)(delta_bytes * 8ull); + if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { + best = upgrade{i, j, err, delta_bytes, ratio}; + } + } + return best; + }; + + while (true) { + upgrade up = recompute_best_upgrade(); + if (up.idx < 0) { break; } + + size_t now_bytes = current_total_bytes(); + size_t next_bytes = now_bytes + up.delta_bytes; + double bpw_next = (double)next_bytes * 8.0 / (double)tw; + + if (bpw_next <= (double)target_bpw + 1e-12) { + all[up.idx].choice = up.next; + bpw_now = bpw_next; + } else { + break; + } + } + + // We might still be below target but taking any single upgrade overshoots. + { + double under_gap = (double)target_bpw - bpw_now; + + upgrade best_over{-1, -1, 0.0, 0, -1.0}; + double best_over_gap = 1e300; + + size_t now_bytes = current_total_bytes(); + + for (int i = 0; i < (int)all.size(); ++i) { + const auto &ti = all[i]; + if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } + + int j = next_distinct_idx(ti); + if (j < 0) { continue; } + + const auto &cur = ti.candidate[ti.choice]; + const auto &nxt = ti.candidate[j]; + + size_t delta_bytes = nxt.bytes - cur.bytes; + if (delta_bytes == 0) { continue; } + + size_t over_bytes = now_bytes + delta_bytes; + double bpw_over = (double)over_bytes * 8.0 / (double)tw; + + double over_gap = std::abs(bpw_over - (double)target_bpw); + + double err = (double)cur.error - (double)nxt.error; + if (err < 0.0) { err = 0.0; } + double ratio = err / (double)(delta_bytes * 8ull); + + if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { + best_over_gap = over_gap; + best_over = upgrade{i, j, err, delta_bytes, ratio}; + } + } + + if (best_over.idx >= 0) { + if (best_over_gap < under_gap) { + all[best_over.idx].choice = best_over.next; + } + } + } + + // Build the override map + std::unordered_map overrides; + LLAMA_LOG_INFO("%s: - estimated tensor quantization mix to achieve %.4f bpw at lowest ppl\n", __func__, target_bpw); + for (const auto & ti : all) { + LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", + __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); + overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; + } + return overrides; +} + static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type default_type; llama_ftype ftype = params->ftype; From 1187f6aa9eb4cf7a3bf3945d0ecd292a49c03efa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 11:07:03 +0100 Subject: [PATCH 015/155] Implement bpw_overrides call --- src/llama-quant.cpp | 9 +++++++++ 1 
file changed, 9 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1e24303c52..b0b3be76ca 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1314,6 +1314,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } + std::unordered_map bpw_overrides = {}; + if (params->target_bpw != -1.0f) { + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this opearation may take some time\n", __func__, params->target_bpw); + bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params->target_bpw, nthread); + } + int cur_split = -1; std::ofstream fout; auto close_ofstream = [&]() { @@ -1430,6 +1436,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (!params->pure && ggml_is_quantized(default_type)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + // get bpw override + const auto override = bpw_overrides.find(name); + if (override != bpw_overrides.end()) { new_type = override->second; } // unless the user specifies a type, and the tensor geometry will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); From 5aceb9e3ae016ed057a0963934c53203b74ad3c5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 22:29:27 +0100 Subject: [PATCH 016/155] Refactor variable names --- src/llama-quant.cpp | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b0b3be76ca..5af70c1c9b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -575,13 +575,13 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor overrides of quantization types to meet target BPW with best expected quality. -// imatrix_data: map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a^2] by expert -// activations_data: optional map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a] by expert -// bias_lambda: relative weight on bias term (|sum e_j * E[a_j]|) vs MSE term (sum e_j^2 * E[a_j^2]) +// Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl +// sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute +// bias_lambda: Affects the weight of the bias term in the MSE error function. 0.0 means no bias, 1.0 means equal weight +// for bias and error, 2.0 means twice as much weight for bias static std::unordered_map target_bpw_type( llama_model_loader & ml, - std::vector> & read_data, + std::vector> & buffer, const llama_model & model, const std::vector & tensors, const std::map & mapped, @@ -735,24 +735,21 @@ static std::unordered_map target_bpw_type( float total_err = 0.0; - for (int64_t i03 = 0; i03 < ne2; ++i03) { - const float * value = values_all ? (values_all + i03 * n_per_row) : nullptr; - const float * activation = activations_all ? (activations_all + i03 * n_per_row) : nullptr; + for (int64_t slice = 0; slice < ne2; ++slice) { + const float * value = values_all ? (values_all + slice * n_per_row) : nullptr; + const float * activation = activations_all ? 
(activations_all + slice * n_per_row) : nullptr; - // Assemble sampled rows into contiguous f32_sample int64_t rs = 0; for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) { - const float * src = f32_data + i03 * (n_per_row * rows_per_expert) + r * n_per_row; + const float * src = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row); ++rs; } if (rs == 0) { continue; } - // Quantize sampled rows in one chunk; pass the imatrix for this expert slice const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); - (void)got; // not strictly needed here + (void)got; - // Dequantize traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); // Compute error proxy per sampled row @@ -821,10 +818,8 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); if (!ml.use_mmap) { - if (read_data.size() < ggml_nbytes(t)) { - read_data.resize(ggml_nbytes(t)); - } - t->data = read_data.data(); + if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); } + t->data = buffer.data(); } ml.load_data_for(t); From ee05d6bc0b250a7c19b9dedf504163509ef736f8 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 22:32:53 +0100 Subject: [PATCH 017/155] Update comments --- src/llama-quant.cpp | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5af70c1c9b..546f6b438c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,13 +596,13 @@ static std::unordered_map target_bpw_type( ggml_type type; float bpw; size_t bytes; - float error; // lower is better + float error; }; struct tensor_info { const llama_model_loader::llama_tensor_weight * w; - std::vector candidate; // sorted by bpw ascending - int choice = -1; // index into cand + std::vector candidate; + int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; size_t n_elements = 0; @@ -610,7 +610,6 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); - // The candidate types we consider; adjust as needed const ggml_type base_candidates[] = { // Model's GGML_TYPE_IQ1_S, @@ -639,8 +638,6 @@ static std::unordered_map target_bpw_type( bool q = name.rfind("weight") == name.size() - 6; q &= (ggml_n_dims(t) >= 2); q &= name.find("_norm.weight") == std::string::npos; - //q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); - //q &= name != name_tn(LLM_TENSOR_OUTPUT, "weight"); q &= name.find("ffn_gate_inp.weight") == std::string::npos; q &= name.find("altup") == std::string::npos; q &= name.find("laurel") == std::string::npos; @@ -719,7 +716,7 @@ static std::unordered_map target_bpw_type( const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { - // cannot dequantize candidate -> assign very high error + // Cannot dequantize candidate -> assign very high error return 1e35f; } @@ -842,12 +839,10 @@ static std::unordered_map target_bpw_type( info.w = tw; info.n_elements = nelem; - // Candidate build with compatibility handling and availability checks + // Build per-tensor candidate list for (ggml_type ts_type : base_candidates) { - // Skip IQ* without imatrix if (is_iq(ts_type) && !values) { continue; } ggml_type tt = make_compatible(t, ts_type); - // After fallback, if still incompatible, skip if (!is_compatible(t, tt)) { 
continue; } // Compute bpw and bytes @@ -861,19 +856,18 @@ static std::unordered_map target_bpw_type( } if (info.candidate.empty()) { - // as a last resort, keep original type + // As a last resort, keep original type float bpw = ggml_nbytes(t) * 8.0f / nelem; info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0}); } - // Sort by bpw ascending std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } return a.bytes < b.bytes; }); - // collapse candidates with identical storage size (bytes) + // Collapse candidates with identical storage size (bytes) { std::vector uniq; uniq.reserve(info.candidate.size()); @@ -903,7 +897,6 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } // Greedy allocation from minimum bpw upward to reach target_bpw - // Start with minimal bpw assignment auto current_total_bytes = [&]() -> size_t { size_t b = 0; for (const auto & ti : all) { @@ -938,11 +931,11 @@ static std::unordered_map target_bpw_type( } struct upgrade { - int idx; // tensor index - int next; // next candidate index (strictly larger bytes) - double err; // error reduction - size_t delta_bytes; // increase in bytes - double ratio; // err per added bit + int idx; + int next; + double err; + size_t delta_bytes; + double ratio; }; // Find next strictly-larger candidate index for a tensor @@ -998,6 +991,7 @@ static std::unordered_map target_bpw_type( } // We might still be below target but taking any single upgrade overshoots. + // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. { double under_gap = (double)target_bpw - bpw_now; From f22b3097eb144a913d02fbb445cbdb9b97e91859 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 22:34:01 +0100 Subject: [PATCH 018/155] Avoid division by zero if truncation occurs --- src/llama-quant.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 546f6b438c..3911eba43b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -790,28 +790,24 @@ static std::unordered_map target_bpw_type( } // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor - const float scale_rows = rows_per_expert / std::max(1, rs); + const float scale_rows = (float)rows_per_expert / std::max(1.0f, (float)rs); total_err *= scale_rows; } return total_err; }; - // Produce per-tensor candidate lists std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { - // Temporary workers for dequantization std::vector workers; workers.reserve(std::max(1, nthread)); ggml_tensor * t = tw->tensor; const std::string name = ggml_get_name(t); - if (!can_quantize(t)) { - continue; - } + if (!can_quantize(t)) { continue; } LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); if (!ml.use_mmap) { @@ -820,7 +816,6 @@ static std::unordered_map target_bpw_type( } ml.load_data_for(t); - // Prepare f32 weights for error estimates const int64_t nelem = ggml_nelements(t); std::vector> f32_conv_buf; float * f32_data = nullptr; @@ -955,13 +950,13 @@ static std::unordered_map target_bpw_type( if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } int j = next_distinct_idx(ti); - if (j < 0) { continue; } // no larger-size candidate remains + if (j < 0) { 
continue; } const auto &cur = ti.candidate[ti.choice]; const auto &nxt = ti.candidate[j]; size_t delta_bytes = nxt.bytes - cur.bytes; - if (delta_bytes == 0) { continue; } // should not happen after dedup, but be safe + if (delta_bytes == 0) { continue; } double err = (double)cur.error - (double)nxt.error; err = std::max(err, 0.0); // do not penalize due to sampling noise From 936294f6afb10aea69ac5ae85fcc29313b49cd9e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 23:31:22 +0100 Subject: [PATCH 019/155] Increase precision for error calculation --- src/llama-quant.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3911eba43b..a4a10da062 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -730,7 +730,7 @@ static std::unordered_map target_bpw_type( std::vector f32_sample(sample_rows * n_per_row); std::vector deq(sample_rows * n_per_row); - float total_err = 0.0; + double total_err = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { const float * value = values_all ? (values_all + slice * n_per_row) : nullptr; @@ -754,9 +754,9 @@ static std::unordered_map target_bpw_type( const float * xs = f32_sample.data() + s * n_per_row; const float * ys = deq.data() + s * n_per_row; - float mse_w = 0.0; - float bias = 0.0; - float bias_sum = 0.0; + double mse_w = 0.0; + double bias = 0.0; + double bias_sum = 0.0; if (value) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -769,19 +769,17 @@ static std::unordered_map target_bpw_type( } else { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; - mse_w += e*e; + mse_w += e * e; if (activation) { bias_sum += e * activation[j]; } } } - if (activation) { - bias = std::abs(bias_sum); - } + if (activation) { bias = std::abs(bias_sum); } // Normalize by n_per_row to get a per-row average scale - float row_err = mse_w / std::max(1, n_per_row); + double row_err = mse_w / std::max(1, n_per_row); if (bias_lambda != 0.0) { row_err += bias_lambda * (bias / std::max(1, n_per_row)); } @@ -790,11 +788,11 @@ static std::unordered_map target_bpw_type( } // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor - const float scale_rows = (float)rows_per_expert / std::max(1.0f, (float)rs); + const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); total_err *= scale_rows; } - return total_err; + return std::isfinite(total_err) ? 
total_err : 1e35; }; std::vector all; From 5cd69a6809c56922e1b973ce900f3680c28a5117 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 09:41:39 +0100 Subject: [PATCH 020/155] Add F16/BF16 type --- src/llama-quant.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a4a10da062..5522fe39d2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -630,7 +630,13 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q8_0, +// TODO: find better way to handle F16/BF16 +#ifdef GGML_USE_METAL + GGML_TYPE_F16 +#else + GGML_TYPE_BF16 +#endif }; auto can_quantize = [&](const ggml_tensor * t) -> bool { From 69586e212e76849fcdff17e68e8023b91025b415 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 13:23:11 +0100 Subject: [PATCH 021/155] Add F16/BF16 type --- tools/quantize/quantize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 86a96cdfcc..b907008cb4 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0 \n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0 \n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -486,13 +486,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { try { target_bpw = std::stof(data); - if (target_bpw < 0.0f || target_bpw > 8.0f) { - printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); + if (target_bpw < 0.0f || target_bpw > 16.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__); return false; } } catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); + printf("\n%s: '%s' is not valid. 
Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); return false; } From 29b2dc3ec0ddefde21394007649df6c268ebca3d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 13:27:01 +0100 Subject: [PATCH 022/155] Do not mix K and IQ quants --- src/llama-quant.cpp | 62 +++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5522fe39d2..9dc903874f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -36,6 +36,26 @@ static bool is_iq(const enum ggml_type t) { } } +static bool is_iq(const enum llama_ftype t) { + switch (t) { + case LLAMA_FTYPE_MOSTLY_IQ1_S: + case LLAMA_FTYPE_MOSTLY_IQ1_M: + case LLAMA_FTYPE_MOSTLY_IQ2_XXS: + case LLAMA_FTYPE_MOSTLY_IQ2_XS: + case LLAMA_FTYPE_MOSTLY_IQ2_S: + case LLAMA_FTYPE_MOSTLY_IQ2_M: + case LLAMA_FTYPE_MOSTLY_IQ3_XXS: + case LLAMA_FTYPE_MOSTLY_IQ3_XS: + case LLAMA_FTYPE_MOSTLY_IQ3_S: + case LLAMA_FTYPE_MOSTLY_IQ3_M: + case LLAMA_FTYPE_MOSTLY_IQ4_XS: + case LLAMA_FTYPE_MOSTLY_IQ4_NL: + return true; + default: + return false; + } +} + static enum ggml_type fallback_type(const enum ggml_type new_type) { switch (new_type) { case GGML_TYPE_TQ1_0: @@ -587,7 +607,7 @@ static std::unordered_map target_bpw_type( const std::map & mapped, const std::unordered_map> * values_data, const std::unordered_map> * activations_data, - float target_bpw, + const llama_model_quantize_params * params, int nthread, int sample_rows_per_expert = 128, float bias_lambda = 1.0 @@ -608,19 +628,7 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; - auto name_tn = LLM_TN(model.arch); - - const ggml_type base_candidates[] = { - // Model's - GGML_TYPE_IQ1_S, - GGML_TYPE_IQ1_M, - GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, - GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, - GGML_TYPE_IQ3_S, - GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, + const ggml_type k_candidates[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, @@ -639,6 +647,21 @@ static std::unordered_map target_bpw_type( #endif }; + const ggml_type iq_candidates[] = { + GGML_TYPE_IQ1_S, + GGML_TYPE_IQ1_M, + GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, + GGML_TYPE_IQ2_S, + GGML_TYPE_IQ3_XXS, + GGML_TYPE_IQ3_S, + GGML_TYPE_IQ4_XS, + GGML_TYPE_IQ4_NL, + }; + + auto name_tn = LLM_TN(model.arch); + float target_bpw = params->target_bpw; + auto can_quantize = [&](const ggml_tensor * t) -> bool { const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; @@ -838,8 +861,15 @@ static std::unordered_map target_bpw_type( info.w = tw; info.n_elements = nelem; + std::vector quant_candidates; + if (is_iq(params->ftype)) { + quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates)); + } else { + quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates)); + } + // Build per-tensor candidate list - for (ggml_type ts_type : base_candidates) { + for (ggml_type ts_type : quant_candidates) { if (is_iq(ts_type) && !values) { continue; } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } @@ -1305,7 +1335,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f) { LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this opearation may take some time\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, 
values_data, activations_data, params->target_bpw, nthread); + bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } int cur_split = -1; From 43caadf783a4bae41011e3b9aca5bbe79185a7a6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:24:48 +0100 Subject: [PATCH 023/155] Add better fallbacks for IQ mixes --- src/llama-quant.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9dc903874f..c412191c8f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -657,6 +657,12 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, + // Add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, + GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 }; auto name_tn = LLM_TN(model.arch); From 52da4a4f8c28d063378d54dd806da03614251e76 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:26:05 +0100 Subject: [PATCH 024/155] Skip if output.weight or type is COPY --- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c412191c8f..786adfe547 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -697,6 +697,9 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_decay_w2.weight") == std::string::npos; q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; + q &= params->quantize_output_tensor || name != "output.weight"; + q &= !params->only_copy; + return q; }; From 3f0118d6029450955c43cd84109bdfc36a8cecd3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:26:37 +0100 Subject: [PATCH 025/155] Fix bias lambda bug --- src/llama-quant.cpp | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 786adfe547..44cf9e30e3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -782,52 +782,47 @@ static std::unordered_map target_bpw_type( } if (rs == 0) { continue; } - const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); - (void)got; - + // Quantize sample rows and dequantize back + (void)ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); - // Compute error proxy per sampled row + // Compute error proxy per sampled slice + double slice_err = 0.0; for (int64_t s = 0; s < rs; ++s) { const float * xs = f32_sample.data() + s * n_per_row; const float * ys = deq.data() + s * n_per_row; double mse_w = 0.0; - double bias = 0.0; double bias_sum = 0.0; if (value) { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e * value[j]; - if (activation) { - bias_sum += e * activation[j]; - } + if (activation) { bias_sum += e * activation[j]; } } } else { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e; - if (activation) { - bias_sum += e * activation[j]; - } + if (activation) { bias_sum += e * activation[j]; } } } - if (activation) { bias = std::abs(bias_sum); } - // Normalize by n_per_row to get a per-row average scale double row_err = mse_w / std::max(1, n_per_row); - if (bias_lambda != 0.0) { - row_err += bias_lambda * (bias / std::max(1, n_per_row)); + if (activation && bias_lambda != 
0.0) { + // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) + const double bias = std::abs(bias_sum) / std::max(1, n_per_row); + row_err += bias_lambda * bias; } - total_err += row_err; + slice_err += row_err; } - // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor - const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); - total_err *= scale_rows; + // Scale the slice contribution by the sampling factor + const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; @@ -1002,7 +997,7 @@ static std::unordered_map target_bpw_type( if (delta_bytes == 0) { continue; } double err = (double)cur.error - (double)nxt.error; - err = std::max(err, 0.0); // do not penalize due to sampling noise + err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { From b0b33b7ccbc5880e6ac5206ea309ee328e685c08 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 20:58:26 +0100 Subject: [PATCH 026/155] Optimise tensor sampling --- src/llama-quant.cpp | 197 ++++++++++++++++++++++++++------------------ 1 file changed, 119 insertions(+), 78 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 44cf9e30e3..830bf915cf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -609,7 +609,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 128, + int sample_rows_per_expert = 256, float bias_lambda = 1.0 ) { struct candidate_types { @@ -671,7 +671,7 @@ static std::unordered_map target_bpw_type( auto can_quantize = [&](const ggml_tensor * t) -> bool { const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; - q &= (ggml_n_dims(t) >= 2); + q &= ggml_n_dims(t) >= 2; q &= name.find("_norm.weight") == std::string::npos; q &= name.find("ffn_gate_inp.weight") == std::string::npos; q &= name.find("altup") == std::string::npos; @@ -719,9 +719,9 @@ static std::unordered_map target_bpw_type( auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t row_sz = ggml_row_size(typ, n_per_row); + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const size_t row_sz = ggml_row_size(typ, n_per_row); return (size_t)ne2 * (size_t)nrows * row_sz; }; @@ -734,7 +734,7 @@ static std::unordered_map target_bpw_type( auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { const int64_t n_per_row = t->ne[0]; const int64_t blck = ggml_blck_size(typ); - if (blck <= 1) { return true; } // FP16/BF16/Q8_0 etc + if (blck <= 1) { return true; } return n_per_row % blck == 0; }; @@ -742,15 +742,20 @@ static std::unordered_map target_bpw_type( if (is_compatible(t, typ)) { return typ; } ggml_type fb = fallback_type(typ); if (is_compatible(t, fb)) { return fb; } - return GGML_TYPE_F16; // final guard + return GGML_TYPE_F16; }; - // Estimate error for a given type using a sampled subset of rows. - // Uses both imatrix (E[a^2]) and activations (E[a]) if available. 
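As a reading aid, the per-row error proxy that estimate_error computes at this stage of the series (after the bias-lambda fix above) can be sketched in isolation as follows. This is a minimal sketch, not the patch's API: x is the original row, y the dequantized row, w the optional imatrix weights (E[a^2]) and a the optional mean activations (E[a]), all of length n. Later patches in the series refine this into a normalised weighted MSE.

#include <cmath>
#include <cstddef>
#include <vector>

// Weighted MSE averaged over the row, plus bias_lambda times the magnitude of the
// signed error projected onto the mean activations, also averaged over the row.
static double row_error_proxy(const std::vector<float> & x, const std::vector<float> & y,
                              const std::vector<float> & w, const std::vector<float> & a,
                              double bias_lambda = 1.0) {
    const std::size_t n = x.size();
    double mse_w    = 0.0;
    double bias_sum = 0.0;
    for (std::size_t j = 0; j < n; ++j) {
        const double e = y[j] - x[j];
        mse_w += (w.empty() ? 1.0 : w[j]) * e * e;   // imatrix-weighted squared error
        if (!a.empty()) { bias_sum += e * a[j]; }    // signed error projected onto E[a]
    }
    double err = mse_w / (double) n;
    if (!a.empty() && bias_lambda != 0.0) {
        err += bias_lambda * std::fabs(bias_sum) / (double) n;
    }
    return err;
}
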
- auto estimate_error = [&](const ggml_tensor * t, const float * f32_data, const ggml_type typ, const float * values_all, const float * activations_all) -> double { + // Estimate error for a given type using a sampled subset of rows + auto estimate_error = [&](const ggml_tensor * t, + const ggml_type typ, + const std::vector & f32_sample, + const std::vector & sample_rows_per_slice, + const std::vector & values_sample, + const std::vector & activations_sample) -> double + { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { @@ -758,70 +763,73 @@ static std::unordered_map target_bpw_type( return 1e35f; } - // Sampling plan: for each expert slice, take up to sample_rows rows spread uniformly - const int64_t rows_per_expert = nrows; - const int64_t sample_rows = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); - const int64_t stride = std::max(1, rows_per_expert / sample_rows); + const size_t total_sampled_rows = f32_sample.size() / n_per_row; + if (total_sampled_rows == 0) { return 0.0; } - const size_t row_sz = ggml_row_size(typ, n_per_row); - std::vector qbuf(row_sz * sample_rows); - std::vector f32_sample(sample_rows * n_per_row); - std::vector deq(sample_rows * n_per_row); - - double total_err = 0.0; + const size_t qbuf_size = ggml_row_size(typ, n_per_row) * total_sampled_rows; + std::vector qbuf(qbuf_size); + std::vector deq(f32_sample.size()); + // Quantize all sampled rows at once and dequantize back + size_t qbuf_offset = 0; + size_t f32_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * value = values_all ? (values_all + slice * n_per_row) : nullptr; - const float * activation = activations_all ? (activations_all + slice * n_per_row) : nullptr; - - int64_t rs = 0; - for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) { - const float * src = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; - std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row); - ++rs; - } + const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - // Quantize sample rows and dequantize back - (void)ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); - traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); + const float * value = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + (void)ggml_quantize_chunk(typ, f32_sample.data() + f32_offset, qbuf.data() + qbuf_offset, 0, rs, n_per_row, value); + qbuf_offset += ggml_row_size(typ, n_per_row) * rs; + f32_offset += rs * n_per_row; + } + + traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + + double total_err = 0.0; + size_t sample_offset = 0; + + for (int64_t slice = 0; slice < ne2; ++slice) { + const float * value_slice = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + const float * activation_slice = activations_sample.empty() ? 
nullptr : activations_sample.data() + slice * n_per_row; + const int64_t rs = sample_rows_per_slice[slice]; - // Compute error proxy per sampled slice double slice_err = 0.0; for (int64_t s = 0; s < rs; ++s) { - const float * xs = f32_sample.data() + s * n_per_row; - const float * ys = deq.data() + s * n_per_row; + const float * xs = f32_sample.data() + sample_offset; + const float * ys = deq.data() + sample_offset; - double mse_w = 0.0; + double mse_w = 0.0; double bias_sum = 0.0; - if (value) { + if (value_slice) { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; - mse_w += e * e * value[j]; - if (activation) { bias_sum += e * activation[j]; } + mse_w += e * e * value_slice[j]; + if (activation_slice) { bias_sum += e * activation_slice[j]; } } } else { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e; - if (activation) { bias_sum += e * activation[j]; } + if (activation_slice) { bias_sum += e * activation_slice[j]; } } } // Normalize by n_per_row to get a per-row average scale double row_err = mse_w / std::max(1, n_per_row); - if (activation && bias_lambda != 0.0) { + if (activation_slice && bias_lambda != 0.0) { // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) const double bias = std::abs(bias_sum) / std::max(1, n_per_row); row_err += bias_lambda * bias; } slice_err += row_err; + sample_offset += n_per_row; } // Scale the slice contribution by the sampling factor - const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + const double rows_per_expert = (double) nrows; + const auto scale_rows = rows_per_expert / std::max(1.0, (double) rs); total_err += slice_err * scale_rows; } @@ -858,8 +866,40 @@ static std::unordered_map target_bpw_type( f32_data = (float *)f32_conv_buf.data(); } - const float * values = get_values(name); - const float * activations = get_activations(name); + const float * values_all = get_values(name); + const float * activations_all = get_activations(name); + + // Sample the tensor rows once, before looping through quantization candidates. + const int64_t n_per_row = t->ne[0]; + const int64_t nrows_total = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; + const int64_t rows_per_expert = nrows_total; + const int64_t sample_rows_max = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); + const int64_t stride = std::max(1, rows_per_expert / sample_rows_max); + + std::vector f32_sample; + std::vector values_sample; + std::vector activations_sample; + std::vector sample_rows_per_slice(ne2); + + for (int64_t slice = 0; slice < ne2; ++slice) { + int64_t current_sampled_rows = 0; + for (int64_t r = 0; r < rows_per_expert && current_sampled_rows < sample_rows_max; r += stride) { + const float * src_row = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; + f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); + current_sampled_rows++; + } + sample_rows_per_slice[slice] = current_sampled_rows; + } + + if (values_all) { + values_sample.resize(ne2 * n_per_row); + std::memcpy(values_sample.data(), values_all, ne2 * n_per_row * sizeof(float)); + } + if (activations_all) { + activations_sample.resize(ne2 * n_per_row); + std::memcpy(activations_sample.data(), activations_all, ne2 * n_per_row * sizeof(float)); + } tensor_info info; info.w = tw; @@ -874,7 +914,7 @@ static std::unordered_map target_bpw_type( // Build per-tensor candidate list for (ggml_type ts_type : quant_candidates) { - if (is_iq(ts_type) && !values) { continue; } + if (is_iq(ts_type) && !values_all) { continue; } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } @@ -882,19 +922,18 @@ static std::unordered_map target_bpw_type( auto bpw = (float)tensor_bpw(t, tt); size_t bytes = total_bytes(t, tt); - // Estimate error - auto err = (float)estimate_error(t, f32_data, tt, values, activations); - - info.candidate.push_back(candidate_types{tt, bpw, bytes, err}); + // Estimate error using the pre-sampled data + auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values_sample, activations_sample); + info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); } if (info.candidate.empty()) { // As a last resort, keep original type float bpw = ggml_nbytes(t) * 8.0f / nelem; - info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0}); + info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } return a.bytes < b.bytes; @@ -905,7 +944,7 @@ static std::unordered_map target_bpw_type( std::vector uniq; uniq.reserve(info.candidate.size()); - for (size_t i = 0; i < info.candidate.size(); ) { + for (size_t i = 0; i < info.candidate.size();) { size_t j = i + 1; candidate_types best = info.candidate[i]; // group same-byte entries, keep the one with the lowest error @@ -972,36 +1011,39 @@ static std::unordered_map target_bpw_type( }; // Find next strictly-larger candidate index for a tensor - auto next_distinct_idx = [&](const tensor_info &ti) -> int { - const auto &cand = ti.candidate; - const auto &cur = cand[ti.choice]; + auto next_distinct_idx = [&](const tensor_info & ti) -> int { + const auto & cand = ti.candidate; + const auto & cur = cand[ti.choice]; int j = ti.choice + 1; - while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j; + while (j < (int)cand.size() && cand[j].bytes == cur.bytes) { + ++j; + } + 
return j < (int)cand.size() ? j : -1; }; auto recompute_best_upgrade = [&]() -> upgrade { const double eps = 1e-12; - upgrade best{-1, -1, 0.0, 0, -1.0}; - for (int i = 0; i < (int)all.size(); ++i) { - const auto &ti = all[i]; + upgrade best{ -1, -1, 0.0, 0, -1.0 }; + for (int i = 0; i < (int) all.size(); ++i) { + const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - int j = next_distinct_idx(ti); + const int j = next_distinct_idx(ti); if (j < 0) { continue; } - const auto &cur = ti.candidate[ti.choice]; - const auto &nxt = ti.candidate[j]; + const auto & cur = ti.candidate[ti.choice]; + const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; + const size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } - double err = (double)cur.error - (double)nxt.error; + double err = cur.error - nxt.error; err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { - best = upgrade{i, j, err, delta_bytes, ratio}; + best = upgrade{ i, j, err, delta_bytes, ratio }; } } return best; @@ -1014,8 +1056,7 @@ static std::unordered_map target_bpw_type( size_t now_bytes = current_total_bytes(); size_t next_bytes = now_bytes + up.delta_bytes; double bpw_next = (double)next_bytes * 8.0 / (double)tw; - - if (bpw_next <= (double)target_bpw + 1e-12) { + if (bpw_next <= target_bpw + 1e-12) { all[up.idx].choice = up.next; bpw_now = bpw_next; } else { @@ -1026,22 +1067,22 @@ static std::unordered_map target_bpw_type( // We might still be below target but taking any single upgrade overshoots. // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. 
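The upgrade search above is, in effect, a greedy knapsack over the per-tensor candidate lists. A compact restatement of the loop, using illustrative types and names rather than the patch's own, might look like this:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct candidate   { std::size_t bytes; double error; };
struct tensor_plan { std::vector<candidate> cand; int choice = 0; }; // cand sorted by bytes ascending

// Repeatedly take the single upgrade with the best error reduction per extra bit,
// as long as the model-wide bits-per-weight stays at or below the target.
static void allocate_budget(std::vector<tensor_plan> & plan, uint64_t total_weights, double target_bpw) {
    auto total_bytes = [&]() {
        std::size_t b = 0;
        for (const auto & t : plan) { b += t.cand[t.choice].bytes; }
        return b;
    };
    for (;;) {
        int best_i = -1, best_j = -1;
        double best_ratio = -1.0;
        for (int i = 0; i < (int) plan.size(); ++i) {
            const auto & t = plan[i];
            const int j = t.choice + 1;                                  // next larger candidate
            if (j >= (int) t.cand.size()) { continue; }
            const std::size_t delta = t.cand[j].bytes - t.cand[t.choice].bytes;
            if (delta == 0) { continue; }
            const double gain  = std::max(0.0, t.cand[t.choice].error - t.cand[j].error);
            const double ratio = gain / (double) (delta * 8);            // error removed per extra bit
            if (ratio > best_ratio) { best_ratio = ratio; best_i = i; best_j = j; }
        }
        if (best_i < 0) { break; }                                       // nothing left to upgrade
        const tensor_plan & t = plan[best_i];
        const std::size_t next_bytes = total_bytes() + (t.cand[best_j].bytes - t.cand[t.choice].bytes);
        if ((double) next_bytes * 8.0 / (double) total_weights > target_bpw) { break; } // budget spent
        plan[best_i].choice = best_j;
    }
}

The actual implementation additionally skips over same-size candidates, breaks ties toward the smaller size increase, and, once no in-budget upgrade remains, considers the single upgrade that overshoots the target by the least, as in the block that follows.
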
{ - double under_gap = (double)target_bpw - bpw_now; + double under_gap = target_bpw - bpw_now; - upgrade best_over{-1, -1, 0.0, 0, -1.0}; - double best_over_gap = 1e300; + upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; + double best_over_gap = 1e300; size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int)all.size(); ++i) { - const auto &ti = all[i]; + for (int i = 0; i < (int) all.size(); ++i) { + const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } int j = next_distinct_idx(ti); if (j < 0) { continue; } - const auto &cur = ti.candidate[ti.choice]; - const auto &nxt = ti.candidate[j]; + const auto & cur = ti.candidate[ti.choice]; + const auto & nxt = ti.candidate[j]; size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } @@ -1051,13 +1092,13 @@ static std::unordered_map target_bpw_type( double over_gap = std::abs(bpw_over - (double)target_bpw); - double err = (double)cur.error - (double)nxt.error; + double err = cur.error - nxt.error; if (err < 0.0) { err = 0.0; } double ratio = err / (double)(delta_bytes * 8ull); if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { best_over_gap = over_gap; - best_over = upgrade{i, j, err, delta_bytes, ratio}; + best_over = upgrade{ i, j, err, delta_bytes, ratio }; } } From 35ad0fc4addf92e9dc0700a88004962731f3c9e0 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 23:27:20 +0100 Subject: [PATCH 027/155] Improve error estimation using weighted MSE --- src/llama-quant.cpp | 60 +++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 830bf915cf..f5fa309c44 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -783,14 +783,26 @@ static std::unordered_map target_bpw_type( f32_offset += rs * n_per_row; } - traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + if (typ == GGML_TYPE_F16) { + const auto *const src = (const ggml_fp16_t *)qbuf.data(); + for (size_t r = 0; r < total_sampled_rows; ++r) { + ggml_fp16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); + } + } else if (typ == GGML_TYPE_BF16) { + const auto *const src = (const ggml_bf16_t *)qbuf.data(); + for (size_t r = 0; r < total_sampled_rows; ++r) { + ggml_bf16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); + } + } else { + traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + } double total_err = 0.0; size_t sample_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * value_slice = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - const float * activation_slice = activations_sample.empty() ? nullptr : activations_sample.data() + slice * n_per_row; + const float * wv = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + const float * act = activations_sample.empty() ? 
nullptr : activations_sample.data() + slice * n_per_row; const int64_t rs = sample_rows_per_slice[slice]; double slice_err = 0.0; @@ -799,37 +811,37 @@ static std::unordered_map target_bpw_type( const float * ys = deq.data() + sample_offset; double mse_w = 0.0; - double bias_sum = 0.0; + double x2_w = 0.0; + double bias_num = 0.0; + double bias_den = 0.0; - if (value_slice) { - for (int64_t j = 0; j < n_per_row; ++j) { - const float e = ys[j] - xs[j]; - mse_w += e * e * value_slice[j]; - if (activation_slice) { bias_sum += e * activation_slice[j]; } - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const float e = ys[j] - xs[j]; - mse_w += e * e; - if (activation_slice) { bias_sum += e * activation_slice[j]; } + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = ys[j] - xs[j]; + const double w = wv ? wv[j] : 1.0; + mse_w += w * e * e; + x2_w += w * xs[j] * xs[j]; + + if (act) { + const double a = act[j]; + bias_num += e * a; + bias_den += a * a; } } - // Normalize by n_per_row to get a per-row average scale - double row_err = mse_w / std::max(1, n_per_row); - if (activation_slice && bias_lambda != 0.0) { - // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) - const double bias = std::abs(bias_sum) / std::max(1, n_per_row); - row_err += bias_lambda * bias; + const double eps = 1e-30; + double row_err = mse_w / (x2_w + eps); + + if (act && bias_lambda != 0.0) { + const double bias_norm = bias_num * bias_num / (bias_den + eps); + row_err += bias_lambda * bias_norm; } slice_err += row_err; sample_offset += n_per_row; } - // Scale the slice contribution by the sampling factor - const double rows_per_expert = (double) nrows; - const auto scale_rows = rows_per_expert / std::max(1.0, (double) rs); + const auto rows_per_expert = nrows; + const double scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } From 5ef493ea1a01385c02ef4c56d38dfe5e116c47c6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 09:48:29 +0100 Subject: [PATCH 028/155] Exclude embeddings and output tensor --- src/llama-quant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f5fa309c44..32013e47ba 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -697,8 +697,10 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_decay_w2.weight") == std::string::npos; q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= params->quantize_output_tensor || name != "output.weight"; q &= !params->only_copy; + // TODO: Exclude embeddings and output tensors? 
+ q &= params->quantize_output_tensor || name != "output.weight"; + q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; }; From 95b2ab2800e26a5bd5b60c61f9593d720a97eb7a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 10:46:37 +0100 Subject: [PATCH 029/155] Change error estimate to use normalised weighted MSE --- src/llama-quant.cpp | 204 +++++++++++++++++++++++++++++--------------- 1 file changed, 134 insertions(+), 70 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 32013e47ba..629056ee06 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -661,8 +662,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q6_K }; auto name_tn = LLM_TN(model.arch); @@ -752,103 +752,125 @@ static std::unordered_map target_bpw_type( const ggml_type typ, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, - const std::vector & values_sample, - const std::vector & activations_sample) -> double + const float * values_sample, + const float * activations_sample, + std::vector & qbuf, + std::vector & deq) -> double { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - - const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (!traits || !traits->to_float) { - // Cannot dequantize candidate -> assign very high error - return 1e35f; - } + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const size_t total_sampled_rows = f32_sample.size() / n_per_row; if (total_sampled_rows == 0) { return 0.0; } - const size_t qbuf_size = ggml_row_size(typ, n_per_row) * total_sampled_rows; - std::vector qbuf(qbuf_size); - std::vector deq(f32_sample.size()); + const size_t row_sz = ggml_row_size(typ, n_per_row); + const size_t need_q = row_sz * total_sampled_rows; + if (qbuf.size() < need_q) { qbuf.resize(need_q); } + if (deq.size() < f32_sample.size()) { deq.resize(f32_sample.size()); } - // Quantize all sampled rows at once and dequantize back - size_t qbuf_offset = 0; - size_t f32_offset = 0; + // Quantize sampled rows slice-by-slice + size_t qoff = 0; + size_t foff = 0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - const float * value = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - (void)ggml_quantize_chunk(typ, f32_sample.data() + f32_offset, qbuf.data() + qbuf_offset, 0, rs, n_per_row, value); - qbuf_offset += ggml_row_size(typ, n_per_row) * rs; - f32_offset += rs * n_per_row; + const float * value = values_sample ? 
values_sample + slice * n_per_row : nullptr; + + (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); + + qoff += row_sz * rs; + foff += (size_t)rs * n_per_row; } + // Dequantize to deq if (typ == GGML_TYPE_F16) { - const auto *const src = (const ggml_fp16_t *)qbuf.data(); - for (size_t r = 0; r < total_sampled_rows; ++r) { - ggml_fp16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); - } + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); } else if (typ == GGML_TYPE_BF16) { - const auto *const src = (const ggml_bf16_t *)qbuf.data(); - for (size_t r = 0; r < total_sampled_rows; ++r) { - ggml_bf16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); - } + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); } else { - traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + const ggml_type_traits * traits = ggml_get_type_traits(typ); + if (!traits || !traits->to_float) { + // no dequantizer available + return 1e35; + } + traits->to_float(qbuf.data(), deq.data(), (int) f32_sample.size()); } + // Compute error + size_t off = 0; double total_err = 0.0; - size_t sample_offset = 0; + const double eps = 1e-12; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * wv = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - const float * act = activations_sample.empty() ? nullptr : activations_sample.data() + slice * n_per_row; const int64_t rs = sample_rows_per_slice[slice]; + if (rs == 0) { continue; } + + const float * wv = values_sample ? values_sample + slice * n_per_row : nullptr; + const float * act = activations_sample ? activations_sample + slice * n_per_row : nullptr; double slice_err = 0.0; - for (int64_t s = 0; s < rs; ++s) { - const float * xs = f32_sample.data() + sample_offset; - const float * ys = deq.data() + sample_offset; + + for (int64_t r = 0; r < rs; ++r) { + const float * x = f32_sample.data() + off; + const float * y = deq.data() + off; double mse_w = 0.0; double x2_w = 0.0; - double bias_num = 0.0; - double bias_den = 0.0; + double bnum = 0.0; + double bden = 0.0; - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = ys[j] - xs[j]; - const double w = wv ? 
wv[j] : 1.0; - mse_w += w * e * e; - x2_w += w * xs[j] * xs[j]; - - if (act) { + if (wv && act) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double e = y[j] - x[j]; const double a = act[j]; - bias_num += e * a; - bias_den += a * a; + mse_w += w * e * e; + x2_w += w * x[j] * x[j]; + bnum += e * a; + bden += a * a; + } + } else if (wv) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double e = y[j] - x[j]; + mse_w += w * e * e; + x2_w += w * x[j] * x[j]; + } + } else if (act) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = y[j] - x[j]; + const double a = act[j]; + mse_w += e * e; + x2_w += x[j] * x[j]; + bnum += e * a; + bden += a * a; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = y[j] - x[j]; + mse_w += e * e; + x2_w += x[j] * x[j]; } } - const double eps = 1e-30; double row_err = mse_w / (x2_w + eps); - if (act && bias_lambda != 0.0) { - const double bias_norm = bias_num * bias_num / (bias_den + eps); - row_err += bias_lambda * bias_norm; + row_err += bias_lambda * (bnum * bnum) / (bden + eps); } slice_err += row_err; - sample_offset += n_per_row; + off += (size_t)n_per_row; } - const auto rows_per_expert = nrows; - const double scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + // scale back up to the full number of rows in this slice + const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; - }; +}; std::vector all; all.reserve(tensors.size()); @@ -887,38 +909,70 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const int64_t rows_per_expert = nrows_total; - const int64_t sample_rows_max = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); - const int64_t stride = std::max(1, rows_per_expert / sample_rows_max); + const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); + const int64_t stride = std::max(1, nrows_total / sample_rows_max); std::vector f32_sample; std::vector values_sample; std::vector activations_sample; std::vector sample_rows_per_slice(ne2); + std::mt19937 rng(std::random_device{}()); for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; - for (int64_t r = 0; r < rows_per_expert && current_sampled_rows < sample_rows_max; r += stride) { - const float * src_row = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; + int64_t offset = 0; + if (stride > 1) { + std::uniform_int_distribution dist(0, stride - 1); + offset = dist(rng); + } + for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { + const float * src_row = f32_data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); current_sampled_rows++; } sample_rows_per_slice[slice] = current_sampled_rows; } + auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { + const size_t want = (size_t)ne2 * (size_t)n_per_row; + dst.clear(); + if (!src || src_sz == 0) { return; } + + if (src_sz == want) { + dst.resize(want); + std::memcpy(dst.data(), src, want * sizeof(float)); + } else if (src_sz == (size_t)n_per_row) { + dst.resize(want); + for (int64_t s = 0; s < ne2; ++s) { + std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); + } + } else { + // Mismatch – safer to skip using it for this tensor + LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", + __func__, name.c_str(), src_sz, (size_t)n_per_row, want); + } + }; + if (values_all) { - values_sample.resize(ne2 * n_per_row); - std::memcpy(values_sample.data(), values_all, ne2 * n_per_row * sizeof(float)); + // get size from the map (not just the raw pointer) + auto itv = values_data->find(remap_imatrix(name, mapped)); + const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); + copy_or_broadcast(values_all, sz, values_sample); } if (activations_all) { - activations_sample.resize(ne2 * n_per_row); - std::memcpy(activations_sample.data(), activations_all, ne2 * n_per_row * sizeof(float)); + auto ita = activations_data->find(remap_imatrix(name, mapped)); + const size_t sz = ita == activations_data->end() ? 
0 : ita->second.size(); + copy_or_broadcast(activations_all, sz, activations_sample); } tensor_info info; info.w = tw; info.n_elements = nelem; + // Prepare scratch buffers sized for the largest candidate row size + size_t total_sampled_rows = f32_sample.size() / n_per_row; + + // Build list of candidate types first (compatible ones) std::vector quant_candidates; if (is_iq(params->ftype)) { quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates)); @@ -926,18 +980,28 @@ static std::unordered_map target_bpw_type( quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates)); } - // Build per-tensor candidate list + // Compute maximum row size among compatible candidates (to size qbuf once) + size_t max_row_sz = 0; + std::vector compatible_candidates; + compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { if (is_iq(ts_type) && !values_all) { continue; } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } + compatible_candidates.push_back(tt); + max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); + } - // Compute bpw and bytes + std::vector qbuf(max_row_sz * total_sampled_rows); + std::vector deq(f32_sample.size()); + + // Now evaluate candidates + for (ggml_type tt : compatible_candidates) { auto bpw = (float)tensor_bpw(t, tt); size_t bytes = total_bytes(t, tt); - - // Estimate error using the pre-sampled data - auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values_sample, activations_sample); + const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); + const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); + float err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, qbuf, deq); info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); } From e01dad886bd2314146ce768240fd0c8a2abecabb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 12:47:13 +0100 Subject: [PATCH 030/155] Parallelise candidate evaluation --- src/llama-quant.cpp | 87 ++++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 28 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 629056ee06..3cade0bf6f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -610,7 +610,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 256, + int sample_rows_per_expert = 384, float bias_lambda = 1.0 ) { struct candidate_types { @@ -758,16 +758,17 @@ static std::unordered_map target_bpw_type( std::vector & deq) -> double { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t total_sampled_rows = f32_sample.size() / n_per_row; + const size_t nels = f32_sample.size(); + const size_t total_sampled_rows = nels / (size_t)n_per_row; if (total_sampled_rows == 0) { return 0.0; } const size_t row_sz = ggml_row_size(typ, n_per_row); const size_t need_q = row_sz * total_sampled_rows; if (qbuf.size() < need_q) { qbuf.resize(need_q); } - if (deq.size() < f32_sample.size()) { deq.resize(f32_sample.size()); } + if (deq.size() < nels) { deq.resize(nels); } // Quantize sampled rows slice-by-slice size_t qoff = 0; @@ -777,31 +778,31 @@ static std::unordered_map target_bpw_type( if (rs == 0) { continue; } const float * value = values_sample ? values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); - qoff += row_sz * rs; - foff += (size_t)rs * n_per_row; + qoff += row_sz * (size_t)rs; + foff += (size_t)rs * (size_t)n_per_row; } - // Dequantize to deq + // Dequantize into deq if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); } else { const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { - // no dequantizer available + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); return 1e35; } - traits->to_float(qbuf.data(), deq.data(), (int) f32_sample.size()); + + traits->to_float(qbuf.data(), deq.data(), (int) nels); } // Compute error + const double eps = 1e-12; size_t off = 0; double total_err = 0.0; - const double eps = 1e-12; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; @@ -817,9 +818,9 @@ static std::unordered_map target_bpw_type( const float * y = deq.data() + off; double mse_w = 0.0; - double x2_w = 0.0; - double bnum = 0.0; - double bden = 0.0; + double x2_w = 0.0; + double bnum = 0.0; + double bden = 0.0; if (wv && act) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -828,8 +829,8 @@ static std::unordered_map target_bpw_type( const double a = act[j]; mse_w += w * e * e; x2_w += w * x[j] * x[j]; - bnum += e * a; - bden += a * a; + bnum += w * e * a; // weighted bias + bden += w * a * a; // weighted norm } } else if (wv) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -856,7 +857,9 @@ static std::unordered_map target_bpw_type( } double row_err = mse_w / (x2_w + eps); + if (act && bias_lambda != 0.0) { + // penalize squared projection of error onto activations row_err += bias_lambda * (bnum * bnum) / (bden + eps); } @@ -864,7 +867,7 @@ static std::unordered_map target_bpw_type( off += (size_t)n_per_row; } - // scale back up to the full number of rows in this slice + // scale to full rows in this slice (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } @@ -982,10 +985,14 @@ static std::unordered_map target_bpw_type( // Compute maximum row size among compatible candidates (to size qbuf once) size_t max_row_sz = 0; + const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; std::vector compatible_candidates; 
compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { - if (is_iq(ts_type) && !values_all) { continue; } + if (is_iq(ts_type) && !has_valid_imatrix) { + LLAMA_LOG_WARN("%s: skipping IQ quantization for %s, no or mismatched imatrix provided\n", __func__, name.c_str()); + continue; + } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } compatible_candidates.push_back(tt); @@ -996,13 +1003,37 @@ static std::unordered_map target_bpw_type( std::vector deq(f32_sample.size()); // Now evaluate candidates - for (ggml_type tt : compatible_candidates) { - auto bpw = (float)tensor_bpw(t, tt); - size_t bytes = total_bytes(t, tt); - const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); - const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); - float err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, qbuf, deq); - info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); + std::vector cand_out(compatible_candidates.size()); + const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); + const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); + + int n_eval_threads = std::max(1, nthread); + std::atomic cidx{0}; + std::vector eval_workers; + eval_workers.reserve(n_eval_threads); + + for (int ti = 0; ti < n_eval_threads; ++ti) { + eval_workers.emplace_back([&] { + // thread-local scratch + std::vector tl_qbuf(qbuf.size()); + std::vector tl_deq(deq.size()); + + for (;;) { + const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); + if (i >= compatible_candidates.size()) { break; } + + const ggml_type tt = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(t, tt); + const size_t bytes = total_bytes(t, tt); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, tl_qbuf, tl_deq); + cand_out[i] = candidate_types{ tt, bpw, bytes, err }; + } + }); + } + for (auto &th : eval_workers) { th.join(); } + + for (auto &c : cand_out) { + if (c.bytes > 0) { info.candidate.push_back(c); } } if (info.candidate.empty()) { From 887490c5ec3c679e8bc0c274b743b483e7c595e3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 15:11:49 +0100 Subject: [PATCH 031/155] Dequantise sampled rows only --- src/llama-quant.cpp | 73 ++++++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3cade0bf6f..547281bd7d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -610,7 +610,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 384, + int sample_rows_per_expert = 512, float bias_lambda = 1.0 ) { struct candidate_types { @@ -699,7 +699,7 @@ static std::unordered_map target_bpw_type( q &= name.find("attn_rel_b.weight") == std::string::npos; q &= !params->only_copy; // TODO: Exclude embeddings and output tensors? 
- q &= params->quantize_output_tensor || name != "output.weight"; + // q &= params->quantize_output_tensor || name != "output.weight"; q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; @@ -896,31 +896,35 @@ static std::unordered_map target_bpw_type( const int64_t nelem = ggml_nelements(t); std::vector> f32_conv_buf; - float * f32_data = nullptr; - - if (t->type == GGML_TYPE_F32) { - f32_data = (float *)t->data; - } else { - llama_tensor_dequantize_impl(t, f32_conv_buf, workers, nelem, nthread); - f32_data = (float *)f32_conv_buf.data(); - } - const float * values_all = get_values(name); const float * activations_all = get_activations(name); - // Sample the tensor rows once, before looping through quantization candidates. + // Dequantize only sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + + const ggml_type src_type = t->type; + const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); + const bool src_is_quant = ggml_is_quantized(src_type); + const size_t src_row_sz = ggml_row_size(src_type, n_per_row); + + std::vector f32_sample; + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); + + std::vector values_sample; + std::vector activations_sample; + std::vector sample_rows_per_slice(ne2, 0); + + // deterministic sampling seed based on tensor name + fixed constant + std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); + const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); const int64_t stride = std::max(1, nrows_total / sample_rows_max); - std::vector f32_sample; - std::vector values_sample; - std::vector activations_sample; - std::vector sample_rows_per_slice(ne2); + // Temporary buffer for one dequantized row + std::vector rowbuf((size_t)n_per_row); - std::mt19937 rng(std::random_device{}()); for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; int64_t offset = 0; @@ -928,10 +932,30 @@ static std::unordered_map target_bpw_type( std::uniform_int_distribution dist(0, stride - 1); offset = dist(rng); } + for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { - const float * src_row = f32_data + slice * (n_per_row * nrows_total) + r * n_per_row; - f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); - current_sampled_rows++; + if (src_type == GGML_TYPE_F32) { + const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; + f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); + } else if (src_type == GGML_TYPE_F16) { + const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + ggml_fp16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + } else if (src_type == GGML_TYPE_BF16) { + const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + ggml_bf16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + } else if (src_is_quant) { + const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + if (!src_traits || !src_traits->to_float) { + throw std::runtime_error(format("cannot dequantize type %s for 
sampling", ggml_type_name(src_type))); + } + src_traits->to_float(qrow, rowbuf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + } else { + throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + } + ++current_sampled_rows; } sample_rows_per_slice[slice] = current_sampled_rows; } @@ -999,15 +1023,16 @@ static std::unordered_map target_bpw_type( max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); } - std::vector qbuf(max_row_sz * total_sampled_rows); - std::vector deq(f32_sample.size()); + std::sort(compatible_candidates.begin(), compatible_candidates.end()); + compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); // Now evaluate candidates std::vector cand_out(compatible_candidates.size()); const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); - - int n_eval_threads = std::max(1, nthread); + std::vector qbuf(max_row_sz * total_sampled_rows); + std::vector deq(f32_sample.size()); + int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; std::vector eval_workers; eval_workers.reserve(n_eval_threads); From 9e11f82e8f5ad29cb62cba0bab7014db17a0b2c2 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 16:25:31 +0100 Subject: [PATCH 032/155] Precompute error denominator in estimate_erro() --- src/llama-quant.cpp | 154 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 121 insertions(+), 33 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 547281bd7d..03f8a4bd11 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -598,8 +598,8 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * // Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl // sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute -// bias_lambda: Affects the weight of the bias term in the MSE error function. 0.0 means no bias, 1.0 means equal weight -// for bias and error, 2.0 means twice as much weight for bias +// bias_lambda: Affects the weight of the bias term in the weigthed MSE error function. 0.0 means no bias (standard MSE), +// 1.0 means equal weight for bias and error, 2.0 means twice as much weight for bias static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, @@ -658,7 +658,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, - // Add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it + // TODO: add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it? 
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, @@ -770,7 +770,68 @@ static std::unordered_map target_bpw_type( if (qbuf.size() < need_q) { qbuf.resize(need_q); } if (deq.size() < nels) { deq.resize(nels); } - // Quantize sampled rows slice-by-slice + // Precompute denominators: + // - x2_per_row: sum_j w[j]*x[j]^2 if w present else sum_j x[j]^2 + // - bden_per_slice: sum_j w[j]*a[j]^2 if w & a present; sum_j a[j]^2 if only a present; 0 otherwise + std::vector x2_per_row(total_sampled_rows, 0.0); + std::vector bden_per_slice(ne2, 0.0); + + const bool has_w = (values_sample != nullptr); + const bool has_a = (activations_sample != nullptr); + + // Precompute bden per slice (depends only on w,a) + if (has_a) { + for (int64_t s = 0; s < ne2; ++s) { + const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + const float * act = activations_sample + s * n_per_row; + double bden = 0.0; + if (has_w) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = act[j]; + bden += (double) wv[j] * a * a; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = act[j]; + bden += a * a; + } + } + bden_per_slice[s] = bden; + } + } + + // Precompute x2 per sampled row + { + size_t off = 0; + size_t row_idx = 0; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = sample_rows_per_slice[s]; + if (rs == 0) { continue; } + + const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + + for (int64_t r = 0; r < rs; ++r, ++row_idx) { + const float * x = f32_sample.data() + off; + double x2 = 0.0; + if (has_w) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double xx = x[j]; + x2 += w * xx * xx; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double xx = x[j]; + x2 += xx * xx; + } + } + x2_per_row[row_idx] = x2; + off += (size_t)n_per_row; + } + } + } + + // Quantize sampled rows slice-by-slice into qbuf size_t qoff = 0; size_t foff = 0; for (int64_t slice = 0; slice < ne2; ++slice) { @@ -784,43 +845,50 @@ static std::unordered_map target_bpw_type( foff += (size_t)rs * (size_t)n_per_row; } - // Dequantize into deq - if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); - } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); - } else { + // Dequantize into deq (row-wise if needed to avoid int overflow) + { const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); - return 1e35; - } + if (typ == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); + } else if (typ == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); + } else { + if (!traits || !traits->to_float) { + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); + return 1e35; + } - traits->to_float(qbuf.data(), deq.data(), (int) nels); + size_t done = 0; + while (done < nels) { + const size_t chunk = std::min((size_t)n_per_row, nels - done); + traits->to_float(qbuf.data() + done / n_per_row * row_sz, deq.data() + done, (int)chunk); + done += chunk; + } + } } // Compute error const double eps = 1e-12; size_t off = 0; + size_t row_idx = 0; double total_err = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { 
continue; } - const float * wv = values_sample ? values_sample + slice * n_per_row : nullptr; - const float * act = activations_sample ? activations_sample + slice * n_per_row : nullptr; + const float * wv = has_w ? values_sample + slice * n_per_row : nullptr; + const float * act = has_a ? activations_sample + slice * n_per_row : nullptr; + const double bden = has_a ? bden_per_slice[slice] : 0.0; double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r) { + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + off; const float * y = deq.data() + off; double mse_w = 0.0; - double x2_w = 0.0; double bnum = 0.0; - double bden = 0.0; if (wv && act) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -828,52 +896,49 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; const double a = act[j]; mse_w += w * e * e; - x2_w += w * x[j] * x[j]; - bnum += w * e * a; // weighted bias - bden += w * a * a; // weighted norm + bnum += w * e * a; } } else if (wv) { for (int64_t j = 0; j < n_per_row; ++j) { const double w = wv[j]; const double e = y[j] - x[j]; mse_w += w * e * e; - x2_w += w * x[j] * x[j]; } } else if (act) { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; const double a = act[j]; mse_w += e * e; - x2_w += x[j] * x[j]; bnum += e * a; - bden += a * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; mse_w += e * e; - x2_w += x[j] * x[j]; } } - double row_err = mse_w / (x2_w + eps); - + // corrected normalization: divide the full numerator by x2 + double numer = mse_w; if (act && bias_lambda != 0.0) { - // penalize squared projection of error onto activations - row_err += bias_lambda * (bnum * bnum) / (bden + eps); + const double proj = bnum * bnum / (bden + eps); + numer += bias_lambda * proj; } + const double denom = x2_per_row[row_idx] + eps; + const double row_err = numer / denom; + slice_err += row_err; off += (size_t)n_per_row; } - // scale to full rows in this slice (nrows) + // scale to full rows (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; -}; + }; std::vector all; all.reserve(tensors.size()); @@ -1067,6 +1132,29 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } + // Remove dominated candidates: if A has >= bytes and >= error than B (and > in at least one), drop A. 
+ { + std::vector pruned; + pruned.reserve(info.candidate.size()); + // Sort by bytes asc, error asc + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + return a.error < b.error; + }); + + double best_err = std::numeric_limits::infinity(); + size_t last_bytes = std::numeric_limits::max(); + + for (const auto &c : info.candidate) { + if (c.error < best_err || c.bytes > last_bytes) { + pruned.push_back(c); + best_err = std::min(best_err, (double)c.error); + last_bytes = c.bytes; + } + } + info.candidate.swap(pruned); + } + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } From 5b6f1e9fde8dc6fd3456358c5b5c758b1f10b11c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 19:18:54 +0100 Subject: [PATCH 033/155] General code refactor --- src/llama-quant.cpp | 415 +++++++++++++++++++++----------------------- 1 file changed, 196 insertions(+), 219 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 03f8a4bd11..85191a66ae 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,10 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl -// sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute -// bias_lambda: Affects the weight of the bias term in the weigthed MSE error function. 0.0 means no bias (standard MSE), -// 1.0 means equal weight for bias and error, 2.0 means twice as much weight for bias +// Returns per-tensor type overrides to meet target BPW at lowest ppl static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, @@ -609,9 +606,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * values_data, const std::unordered_map> * activations_data, const llama_model_quantize_params * params, - int nthread, - int sample_rows_per_expert = 512, - float bias_lambda = 1.0 + int nthread ) { struct candidate_types { ggml_type type; @@ -621,15 +616,15 @@ static std::unordered_map target_bpw_type( }; struct tensor_info { - const llama_model_loader::llama_tensor_weight * w; - std::vector candidate; + const llama_model_loader::llama_tensor_weight * w = nullptr; + std::vector candidate = {}; int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; size_t n_elements = 0; }; - const ggml_type k_candidates[] = { + constexpr ggml_type k_quants[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, @@ -648,7 +643,7 @@ static std::unordered_map target_bpw_type( #endif }; - const ggml_type iq_candidates[] = { + constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, @@ -665,9 +660,49 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q6_K }; - auto name_tn = LLM_TN(model.arch); - float target_bpw = params->target_bpw; + auto get_values = [&](const std::string & tensor_name) -> const float * { + if (!values_data) { return nullptr; } + const auto it = values_data->find(remap_imatrix(tensor_name, mapped)); + if (it == values_data->end()) { return nullptr; } + return it->second.data(); + }; + auto get_activations = [&](const std::string & tensor_name) -> const float * { + if (!activations_data) { 
return nullptr; } + const auto it = activations_data->find(remap_imatrix(tensor_name, mapped)); + if (it == activations_data->end()) { return nullptr; } + return it->second.data(); + }; + + auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { + const int64_t n_per_row = t->ne[0]; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const size_t row_sz = ggml_row_size(typ, n_per_row); + return (size_t)ne2 * (size_t)nrows * row_sz; + }; + + auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { + const int64_t nelem = ggml_nelements(t); + const size_t bytes = tensor_bytes(t, typ); + return (double)bytes * 8.0 / (double)nelem; + }; + + auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { + const int64_t n_per_row = t->ne[0]; + const int64_t blck = ggml_blck_size(typ); + if (blck <= 1) { return true; } + return n_per_row % blck == 0; + }; + + auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { + if (is_compatible(t, typ)) { return typ; } + ggml_type fb = fallback_type(typ); + if (is_compatible(t, fb)) { return fb; } + return GGML_TYPE_F16; + }; + + auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; @@ -705,231 +740,182 @@ static std::unordered_map target_bpw_type( return q; }; - auto get_values = [&](const std::string & tensor_name) -> const float * { - if (!values_data) { return nullptr; } - const auto it = values_data->find(remap_imatrix(tensor_name, mapped)); - if (it == values_data->end()) { return nullptr; } - return it->second.data(); - }; - - auto get_activations = [&](const std::string & tensor_name) -> const float * { - if (!activations_data) { return nullptr; } - const auto it = activations_data->find(remap_imatrix(tensor_name, mapped)); - if (it == activations_data->end()) { return nullptr; } - return it->second.data(); - }; - - auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { - const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t row_sz = ggml_row_size(typ, n_per_row); - return (size_t)ne2 * (size_t)nrows * row_sz; - }; - - auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { - const int64_t nelem = ggml_nelements(t); - const size_t bytes = total_bytes(t, typ); - return bytes * 8.0 / nelem; - }; - - auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { - const int64_t n_per_row = t->ne[0]; - const int64_t blck = ggml_blck_size(typ); - if (blck <= 1) { return true; } - return n_per_row % blck == 0; - }; - - auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) { return typ; } - ggml_type fb = fallback_type(typ); - if (is_compatible(t, fb)) { return fb; } - return GGML_TYPE_F16; - }; - // Estimate error for a given type using a sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, - const ggml_type typ, + const ggml_type quant_type, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, const float * values_sample, const float * activations_sample, - std::vector & qbuf, - std::vector & deq) -> double + std::vector & quantized_buffer, + std::vector & dequantized_buffer) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t nels = f32_sample.size(); - const size_t total_sampled_rows = nels / (size_t)n_per_row; - if (total_sampled_rows == 0) { return 0.0; } + const size_t sample_element_count = f32_sample.size(); + const size_t sample_row_count = sample_element_count / (size_t)n_per_row; + if (sample_row_count == 0) { return 0.0; } - const size_t row_sz = ggml_row_size(typ, n_per_row); - const size_t need_q = row_sz * total_sampled_rows; - if (qbuf.size() < need_q) { qbuf.resize(need_q); } - if (deq.size() < nels) { deq.resize(nels); } + const size_t row_size = ggml_row_size(quant_type, n_per_row); + const size_t buffer_size = row_size * sample_row_count; + if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } + if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - // Precompute denominators: - // - x2_per_row: sum_j w[j]*x[j]^2 if w present else sum_j x[j]^2 - // - bden_per_slice: sum_j w[j]*a[j]^2 if w & a present; sum_j a[j]^2 if only a present; 0 otherwise - std::vector x2_per_row(total_sampled_rows, 0.0); - std::vector bden_per_slice(ne2, 0.0); + std::vector row_sq_norm(sample_row_count, 0.0); + std::vector bias_denominator_per_slice(ne2, 0.0); - const bool has_w = (values_sample != nullptr); - const bool has_a = (activations_sample != nullptr); - - // Precompute bden per slice (depends only on w,a) - if (has_a) { + // Precompute bias denominator per slice + const bool has_values = (values_sample != nullptr); + const bool has_activations = (activations_sample != nullptr); + if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { - const float * wv = has_w ? values_sample + s * n_per_row : nullptr; - const float * act = activations_sample + s * n_per_row; - double bden = 0.0; - if (has_w) { + const float * values = has_values ? 
values_sample + s * n_per_row : nullptr; + const float * activations = activations_sample + s * n_per_row; + double bias_denominator = 0.0; + if (has_values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double a = act[j]; - bden += (double) wv[j] * a * a; + const double a = activations[j]; + bias_denominator += values[j] * a * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double a = act[j]; - bden += a * a; + const double a = activations[j]; + bias_denominator += a * a; } } - bden_per_slice[s] = bden; + bias_denominator_per_slice[s] = bias_denominator; } } - // Precompute x2 per sampled row + // Compute squared norms of sampled rows { - size_t off = 0; + size_t offset = 0; size_t row_idx = 0; for (int64_t s = 0; s < ne2; ++s) { const int64_t rs = sample_rows_per_slice[s]; if (rs == 0) { continue; } - const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + const float * values = has_values ? values_sample + s * n_per_row : nullptr; for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + off; - double x2 = 0.0; - if (has_w) { + const float * row = f32_sample.data() + offset; + double rsn = 0.0; + if (has_values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; - const double xx = x[j]; - x2 += w * xx * xx; + const double v = values[j]; + const double x = row[j]; + rsn += v * x * x; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double xx = x[j]; - x2 += xx * xx; + const double x = row[j]; + rsn += x * x; } } - x2_per_row[row_idx] = x2; - off += (size_t)n_per_row; + row_sq_norm[row_idx] = rsn; + offset += (size_t)n_per_row; } } } - // Quantize sampled rows slice-by-slice into qbuf - size_t qoff = 0; - size_t foff = 0; + // Quantize sampled rows slice-by-slice into quantized_buffer + size_t quantised_offset = 0; + size_t floats_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } const float * value = values_sample ? 
values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + floats_offset, quantized_buffer.data() + quantised_offset, 0, rs, n_per_row, value); - qoff += row_sz * (size_t)rs; - foff += (size_t)rs * (size_t)n_per_row; + quantised_offset += row_size * (size_t)rs; + floats_offset += (size_t)rs * (size_t)n_per_row; } - // Dequantize into deq (row-wise if needed to avoid int overflow) + // Dequantize into dequantized_buffer { - const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); - } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); + const ggml_type_traits * traits = ggml_get_type_traits(quant_type); + if (quant_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); + } else if (quant_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); } else { if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); return 1e35; } size_t done = 0; - while (done < nels) { - const size_t chunk = std::min((size_t)n_per_row, nels - done); - traits->to_float(qbuf.data() + done / n_per_row * row_sz, deq.data() + done, (int)chunk); + while (done < sample_element_count) { + const size_t chunk = std::min((size_t)n_per_row, sample_element_count - done); + traits->to_float(quantized_buffer.data() + done / n_per_row * row_size, dequantized_buffer.data() + done, (int)chunk); done += chunk; } } } // Compute error - const double eps = 1e-12; - size_t off = 0; + size_t offset = 0; size_t row_idx = 0; double total_err = 0.0; - for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - const float * wv = has_w ? values_sample + slice * n_per_row : nullptr; - const float * act = has_a ? activations_sample + slice * n_per_row : nullptr; - const double bden = has_a ? bden_per_slice[slice] : 0.0; - + const float * values = has_values ? values_sample + slice * n_per_row : nullptr; + const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; + const double bias_denominator = has_activations ? 
bias_denominator_per_slice[slice] : 0.0; double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + off; - const float * y = deq.data() + off; - - double mse_w = 0.0; - double bnum = 0.0; - - if (wv && act) { + const float * x = f32_sample.data() + offset; + const float * y = dequantized_buffer.data() + offset; + double weighted_mse = 0.0; + double bias_numerator = 0.0; + if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; + const double v = values[j]; const double e = y[j] - x[j]; - const double a = act[j]; - mse_w += w * e * e; - bnum += w * e * a; + const double a = activations[j]; + weighted_mse += v * e * e; + bias_numerator += v * e * a; } - } else if (wv) { + } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; + const double v = values[j]; const double e = y[j] - x[j]; - mse_w += w * e * e; + weighted_mse += v * e * e; } - } else if (act) { + } else if (activations) { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; - const double a = act[j]; - mse_w += e * e; - bnum += e * a; + const double a = activations[j]; + weighted_mse += e * e; + bias_numerator += e * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; - mse_w += e * e; + weighted_mse += e * e; } } - // corrected normalization: divide the full numerator by x2 - double numer = mse_w; - if (act && bias_lambda != 0.0) { - const double proj = bnum * bnum / (bden + eps); - numer += bias_lambda * proj; + double err_numerator = weighted_mse; + constexpr double epsilon = 1e-12; + constexpr float bias_lambda = 1.0; + //bias_lambda defines the weight of the bias term in the weigthed MSE error function + // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, + // 2.0 means twice as much weight for bias, etc + if (activations && bias_lambda != 0.0) { + const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); + err_numerator += bias_lambda * proj; } - const double denom = x2_per_row[row_idx] + eps; - const double row_err = numer / denom; - + const double err_denominator = row_sq_norm[row_idx] + epsilon; + const double row_err = err_numerator / err_denominator; slice_err += row_err; - off += (size_t)n_per_row; + offset += (size_t)n_per_row; } // scale to full rows (nrows) @@ -942,14 +928,11 @@ static std::unordered_map target_bpw_type( std::vector all; all.reserve(tensors.size()); - for (const auto * tw : tensors) { std::vector workers; workers.reserve(std::max(1, nthread)); - ggml_tensor * t = tw->tensor; const std::string name = ggml_get_name(t); - if (!can_quantize(t)) { continue; } LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); @@ -959,37 +942,26 @@ static std::unordered_map target_bpw_type( } ml.load_data_for(t); - const int64_t nelem = ggml_nelements(t); - std::vector> f32_conv_buf; - const float * values_all = get_values(name); - const float * activations_all = get_activations(name); - // Dequantize only sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; + // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute + int sample_rows_per_expert = 512; + std::vector f32_sample; + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); + + // deterministic sampling seed based on tensor name + fixed constant + std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); + std::vector sample_rows_per_slice(ne2, 0); + const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); + const int64_t stride = std::max(1, nrows_total / sample_rows_max); + std::vector row_buffer(n_per_row); const ggml_type src_type = t->type; const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); - - std::vector f32_sample; - f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); - - std::vector values_sample; - std::vector activations_sample; - std::vector sample_rows_per_slice(ne2, 0); - - // deterministic sampling seed based on tensor name + fixed constant - std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); - - const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); - const int64_t stride = std::max(1, nrows_total / sample_rows_max); - - // Temporary buffer for one dequantized row - std::vector rowbuf((size_t)n_per_row); - for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; int64_t offset = 0; @@ -1004,19 +976,19 @@ static std::unordered_map target_bpw_type( f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_fp16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_bf16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (!src_traits || !src_traits->to_float) { throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); } - src_traits->to_float(qrow, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + src_traits->to_float(qrow, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else { throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); } @@ -1045,6 +1017,10 @@ static std::unordered_map target_bpw_type( } }; + const float * values_all = get_values(name); + const float * activations_all = get_activations(name); + std::vector values_sample; + 
std::vector activations_sample; if (values_all) { // get size from the map (not just the raw pointer) auto itv = values_data->find(remap_imatrix(name, mapped)); @@ -1057,6 +1033,7 @@ static std::unordered_map target_bpw_type( copy_or_broadcast(activations_all, sz, activations_sample); } + const int64_t nelem = ggml_nelements(t); tensor_info info; info.w = tw; info.n_elements = nelem; @@ -1067,12 +1044,12 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) std::vector quant_candidates; if (is_iq(params->ftype)) { - quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates)); + quant_candidates.assign(std::begin(iq_quants), std::end(iq_quants)); } else { - quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates)); + quant_candidates.assign(std::begin(k_quants), std::end(k_quants)); } - // Compute maximum row size among compatible candidates (to size qbuf once) + // Compute maximum row size among compatible candidates (to size quantized_buffer once) size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; std::vector compatible_candidates; @@ -1092,21 +1069,20 @@ static std::unordered_map target_bpw_type( compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); // Now evaluate candidates - std::vector cand_out(compatible_candidates.size()); - const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); - const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); - std::vector qbuf(max_row_sz * total_sampled_rows); - std::vector deq(f32_sample.size()); + std::vector eval_candidates(compatible_candidates.size()); + const float *values = values_sample.empty() ? nullptr : values_sample.data(); + const float *activations = activations_sample.empty() ? 
nullptr : activations_sample.data(); + std::vector quantized_buffer(max_row_sz * total_sampled_rows); + std::vector dequantised_buffer(f32_sample.size()); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; std::vector eval_workers; eval_workers.reserve(n_eval_threads); - for (int ti = 0; ti < n_eval_threads; ++ti) { eval_workers.emplace_back([&] { // thread-local scratch - std::vector tl_qbuf(qbuf.size()); - std::vector tl_deq(deq.size()); + std::vector tl_quantized_buffer(quantized_buffer.size()); + std::vector tl_dequantised_buffer(dequantised_buffer.size()); for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); @@ -1114,15 +1090,16 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); - const size_t bytes = total_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, tl_qbuf, tl_deq); - cand_out[i] = candidate_types{ tt, bpw, bytes, err }; + const size_t bytes = tensor_bytes(t, tt); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer); + eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); } + for (auto &th : eval_workers) { th.join(); } - for (auto &c : cand_out) { + for (auto &c : eval_candidates) { if (c.bytes > 0) { info.candidate.push_back(c); } } @@ -1132,7 +1109,7 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - // Remove dominated candidates: if A has >= bytes and >= error than B (and > in at least one), drop A. + // Keep only the Pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. 
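+        // A dominated candidate offers no lower error for the same or larger size, so keeping it would only
+        // add useless steps to the greedy upgrade search further below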
{ std::vector pruned; pruned.reserve(info.candidate.size()); @@ -1155,36 +1132,37 @@ static std::unordered_map target_bpw_type( info.candidate.swap(pruned); } - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bpw != b.bpw) { return a.bpw < b.bpw; } - if (a.error != b.error) { return a.error < b.error; } - return a.bytes < b.bytes; - }); - // Collapse candidates with identical storage size (bytes) { - std::vector uniq; - uniq.reserve(info.candidate.size()); + std::vector unique; + unique.reserve(info.candidate.size()); + // Sort by bpw asc, error asc, bytes asc + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bpw != b.bpw) { return a.bpw < b.bpw; } + if (a.error != b.error) { return a.error < b.error; } + return a.bytes < b.bytes; + }); for (size_t i = 0; i < info.candidate.size();) { - size_t j = i + 1; + size_t j = i + 1; candidate_types best = info.candidate[i]; // group same-byte entries, keep the one with the lowest error while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) { - if (info.candidate[j].error < best.error) { best = info.candidate[j]; } + if (info.candidate[j].error < best.error) { + best = info.candidate[j]; + } ++j; } - uniq.push_back(best); + unique.push_back(best); i = j; } - info.candidate.swap(uniq); + info.candidate.swap(unique); } // Initialize choice at the smallest bpw candidate info.choice = 0; info.min_bpw = info.candidate.front().bpw; info.max_bpw = info.candidate.back().bpw; - all.push_back(std::move(info)); } @@ -1196,6 +1174,7 @@ static std::unordered_map target_bpw_type( for (const auto & ti : all) { b += ti.candidate[ti.choice].bytes; } + return b; }; @@ -1204,6 +1183,7 @@ static std::unordered_map target_bpw_type( for (const auto & ti : all) { w += ti.n_elements; } + return w; }; @@ -1215,12 +1195,14 @@ static std::unordered_map target_bpw_type( // Precompute current bpw double bpw_now = current_bpw(); + float target_bpw = params->target_bpw; // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw) if (bpw_now >= target_bpw) { std::unordered_map overrides; for (const auto & ti : all) { overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } + return overrides; } @@ -1268,6 +1250,7 @@ static std::unordered_map target_bpw_type( best = upgrade{ i, j, err, delta_bytes, ratio }; } } + return best; }; @@ -1286,16 +1269,12 @@ static std::unordered_map target_bpw_type( } } - // We might still be below target but taking any single upgrade overshoots. - // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. 
+ // We might still be below target so we try to find the best upgrade one last time { - double under_gap = target_bpw - bpw_now; - upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; double best_over_gap = 1e300; - + double under_gap = target_bpw - bpw_now; size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int) all.size(); ++i) { const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } @@ -1305,19 +1284,16 @@ static std::unordered_map target_bpw_type( const auto & cur = ti.candidate[ti.choice]; const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } size_t over_bytes = now_bytes + delta_bytes; double bpw_over = (double)over_bytes * 8.0 / (double)tw; - - double over_gap = std::abs(bpw_over - (double)target_bpw); - double err = cur.error - nxt.error; if (err < 0.0) { err = 0.0; } double ratio = err / (double)(delta_bytes * 8ull); + double over_gap = std::abs(bpw_over - (double)target_bpw); if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { best_over_gap = over_gap; best_over = upgrade{ i, j, err, delta_bytes, ratio }; @@ -1339,6 +1315,7 @@ static std::unordered_map target_bpw_type( __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } + return overrides; } From ec0afbe79ff001af56846365f91f97240bd2dbf4 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 01:46:09 +0100 Subject: [PATCH 034/155] Include embeddings and output tensors --- src/llama-quant.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 85191a66ae..b9e3c19a89 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -733,9 +733,6 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; q &= !params->only_copy; - // TODO: Exclude embeddings and output tensors? - // q &= params->quantize_output_tensor || name != "output.weight"; - q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; }; From 35c1504441eb03b126b15a6ddd4625f094dc7dfe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:01:57 +0100 Subject: [PATCH 035/155] Fix byte count for 3d or higher tensors --- src/llama-quant.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b9e3c19a89..8cc5f221ea 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -676,10 +676,9 @@ static std::unordered_map target_bpw_type( auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t row_sz = ggml_row_size(typ, n_per_row); - return (size_t)ne2 * (size_t)nrows * row_sz; + const size_t row_sz = ggml_row_size(typ, n_per_row); + const int64_t nrows = ggml_nrows(t); + return (size_t)nrows * row_sz; }; auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { From bb0d912c1f93de2ef1af4ef9fb467c4862012898 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:02:56 +0100 Subject: [PATCH 036/155] Update comments --- src/llama-quant.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8cc5f221ea..4b846c7d0c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -703,6 +703,7 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { + // This list should be kept in sync with llama_tensor_quantize_impl() const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; q &= ggml_n_dims(t) >= 2; @@ -902,7 +903,7 @@ static std::unordered_map target_bpw_type( constexpr float bias_lambda = 1.0; //bias_lambda defines the weight of the bias term in the weigthed MSE error function // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, - // 2.0 means twice as much weight for bias, etc + // 2.0 means twice as much weight for bias, etc. Default is 1.0. if (activations && bias_lambda != 0.0) { const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); err_numerator += bias_lambda * proj; @@ -1192,7 +1193,7 @@ static std::unordered_map target_bpw_type( double bpw_now = current_bpw(); float target_bpw = params->target_bpw; - // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw) + // If minimal bpw is already above the target, we're constrained by the tensor's shape; return closest (min bpw) if (bpw_now >= target_bpw) { std::unordered_map overrides; for (const auto & ti : all) { From 2f13fee795639841de46b8f415a233062aa5d2b8 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:05:55 +0100 Subject: [PATCH 037/155] Parameterise type --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4b846c7d0c..e5e27da509 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -760,8 +760,8 @@ static std::unordered_map target_bpw_type( if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - std::vector row_sq_norm(sample_row_count, 0.0); - std::vector bias_denominator_per_slice(ne2, 0.0); + std::vector row_sq_norm(sample_row_count, 0.0); + std::vector bias_denominator_per_slice(ne2, 0.0); // Precompute bias denominator per slice const bool has_values = (values_sample != nullptr); From 47cdbe21552324cd79b9243485eeb455cab4673a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:11:11 +0100 Subject: [PATCH 038/155] Reduce sampling window to speedup process --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e5e27da509..5460669e7c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -945,7 +945,7 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute - int sample_rows_per_expert = 512; + constexpr int sample_rows_per_expert = 384; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); From 01c927fb94163ddb36365323683274071c034690 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:14:14 +0100 Subject: [PATCH 039/155] Improve pareto efficient candidate selection --- src/llama-quant.cpp | 49 +++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5460669e7c..14d9087f53 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1106,56 +1106,35 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - // Keep only the Pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. + // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. { std::vector pruned; pruned.reserve(info.candidate.size()); - // Sort by bytes asc, error asc - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + + // Sort by bytes ascending, error ascending + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bytes != b.bytes) { return a.bytes < b.bytes; } return a.error < b.error; }); double best_err = std::numeric_limits::infinity(); size_t last_bytes = std::numeric_limits::max(); - - for (const auto &c : info.candidate) { - if (c.error < best_err || c.bytes > last_bytes) { - pruned.push_back(c); - best_err = std::min(best_err, (double)c.error); + for (const auto & c : info.candidate) { + // Only keep the best error seen so far at strictly larger byte sizes + if (c.bytes != last_bytes) { + // first time we see this byte size last_bytes = c.bytes; + if (c.error < best_err) { + pruned.push_back(c); + best_err = c.error; + } + } else { + // same bytes: we already sorted by error; skip } } info.candidate.swap(pruned); } - // Collapse candidates with identical storage size (bytes) - { - std::vector unique; - unique.reserve(info.candidate.size()); - // Sort by bpw asc, error asc, bytes asc - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bpw != b.bpw) { return a.bpw < b.bpw; } - if (a.error != b.error) { return a.error < b.error; } - return a.bytes < b.bytes; - }); - - for (size_t i = 0; i < info.candidate.size();) { - size_t j = i + 1; - candidate_types best = info.candidate[i]; - // group same-byte entries, keep the one with the lowest error - while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) { - if (info.candidate[j].error < best.error) { - best = info.candidate[j]; - } - ++j; - } - unique.push_back(best); - i = j; - } - info.candidate.swap(unique); - } - // Initialize choice at the smallest bpw candidate info.choice = 0; info.min_bpw = info.candidate.front().bpw; From 897decbe8a062ded079f1f1a866392571ed7f95f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:15:11 +0100 Subject: [PATCH 040/155] Show skipped IQ tensors --- src/llama-quant.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 
14d9087f53..c5c19f3c5f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1019,7 +1019,6 @@ static std::unordered_map target_bpw_type( std::vector values_sample; std::vector activations_sample; if (values_all) { - // get size from the map (not just the raw pointer) auto itv = values_data->find(remap_imatrix(name, mapped)); const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); copy_or_broadcast(values_all, sz, values_sample); @@ -1053,7 +1052,7 @@ static std::unordered_map target_bpw_type( compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping IQ quantization for %s, no or mismatched imatrix provided\n", __func__, name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type) , name.c_str()); continue; } ggml_type tt = make_compatible(t, ts_type); @@ -1214,13 +1213,11 @@ static std::unordered_map target_bpw_type( const auto & cur = ti.candidate[ti.choice]; const auto & nxt = ti.candidate[j]; - const size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } double err = cur.error - nxt.error; err = std::max(err, 0.0); - double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { best = upgrade{ i, j, err, delta_bytes, ratio }; From f05c8483d8b138c58a41ecdf32f95947bb130be5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:17:58 +0100 Subject: [PATCH 041/155] Improve dequantized_buffer fill --- src/llama-quant.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c5c19f3c5f..db4a0e1a20 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -843,12 +843,9 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); return 1e35; } - - size_t done = 0; - while (done < sample_element_count) { - const size_t chunk = std::min((size_t)n_per_row, sample_element_count - done); - traits->to_float(quantized_buffer.data() + done / n_per_row * row_size, dequantized_buffer.data() + done, (int)chunk); - done += chunk; + const size_t row_size = ggml_row_size(quant_type, n_per_row); + for (size_t r = 0; r < sample_row_count; ++r) { + traits->to_float(quantized_buffer.data() + r * row_size, dequantized_buffer.data() + r * n_per_row, (int)n_per_row); } } } From fea99d051ad3a9f3cce3cdf084074e0655f47e14 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 16:57:58 +0100 Subject: [PATCH 042/155] Refactor and combine lambdas --- src/llama-quant.cpp | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index db4a0e1a20..10993e89c6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -660,20 +660,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q6_K }; - auto get_values = [&](const std::string & tensor_name) -> const float * { - if (!values_data) { return nullptr; } - const auto it = values_data->find(remap_imatrix(tensor_name, mapped)); - if (it == values_data->end()) { return nullptr; } - return it->second.data(); - }; - - auto get_activations = [&](const std::string & tensor_name) -> const float * { - if (!activations_data) { return nullptr; } - const auto it = 
activations_data->find(remap_imatrix(tensor_name, mapped)); - if (it == activations_data->end()) { return nullptr; } - return it->second.data(); - }; - auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); @@ -991,6 +977,15 @@ static std::unordered_map target_bpw_type( sample_rows_per_slice[slice] = current_sampled_rows; } + auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { + if (!m) { return {nullptr, 0}; } + const std::string key = remap_imatrix(tensor_name, mapped); + const auto it = m->find(key); + if (it == m->end()) { return {nullptr, 0}; } + return { it->second.data(), it->second.size() }; + }; + + // Copy this row's side data (values and activations), or broadcasts to all slices auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { const size_t want = (size_t)ne2 * (size_t)n_per_row; dst.clear(); @@ -1005,26 +1000,17 @@ static std::unordered_map target_bpw_type( std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } } else { - // Mismatch – safer to skip using it for this tensor LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", __func__, name.c_str(), src_sz, (size_t)n_per_row, want); } }; - const float * values_all = get_values(name); - const float * activations_all = get_activations(name); + const auto [values_all, values_sz] = side_data(values_data, name); + const auto [activations_all, activations_sz] = side_data(activations_data, name); std::vector values_sample; std::vector activations_sample; - if (values_all) { - auto itv = values_data->find(remap_imatrix(name, mapped)); - const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); - copy_or_broadcast(values_all, sz, values_sample); - } - if (activations_all) { - auto ita = activations_data->find(remap_imatrix(name, mapped)); - const size_t sz = ita == activations_data->end() ? 0 : ita->second.size(); - copy_or_broadcast(activations_all, sz, activations_sample); - } + if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } + if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } const int64_t nelem = ggml_nelements(t); tensor_info info; From 6d17889addf3aa18000334e1dd958111104cdf3e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 16:58:46 +0100 Subject: [PATCH 043/155] Log if override is from tensor-type or from bpw-target --- src/llama-quant.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 10993e89c6..721deaddad 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1049,8 +1049,8 @@ static std::unordered_map target_bpw_type( // Now evaluate candidates std::vector eval_candidates(compatible_candidates.size()); - const float *values = values_sample.empty() ? nullptr : values_sample.data(); - const float *activations = activations_sample.empty() ? nullptr : activations_sample.data(); + const float * values = values_sample.empty() ? nullptr : values_sample.data(); + const float * activations = activations_sample.empty() ? 
nullptr : activations_sample.data(); std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantised_buffer(f32_sample.size()); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); @@ -1656,15 +1656,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); // get bpw override const auto override = bpw_overrides.find(name); - if (override != bpw_overrides.end()) { new_type = override->second; } - // unless the user specifies a type, and the tensor geometry will not require fallback quantisation + if (override != bpw_overrides.end() && override->second != new_type) { + LLAMA_LOG_DEBUG("(bpw overriding %s) ", ggml_type_name(new_type)); + new_type = override->second; + } + // unless the user specifies a type, and the tensor shape will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); const std::string tensor_name(tensor->name); for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (qtype != new_type) { - LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type)); + LLAMA_LOG_DEBUG("(type overriding %s) ", ggml_type_name(new_type)); new_type = qtype; // if two or more types are specified for the same tensor, the last match wins } } From 9a4b1154974d5ddbfb9d9d3f785f5a29bb202fac Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 01:08:01 +0100 Subject: [PATCH 044/155] Explicitly adding include --- src/llama-quant.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 721deaddad..d17b21d008 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -4,6 +4,7 @@ #include "llama-model-loader.h" #include +#include #include #include #include From f75265f55bb1d4470dea57f4c9e3ad108cc343a1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 01:08:37 +0100 Subject: [PATCH 045/155] Fix typo --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d17b21d008..6e3aa3f83d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1535,7 +1535,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f) { - LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this opearation may take some time\n", __func__, params->target_bpw); + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } From 73124a9921b967fe9e5afbb9f48924a3d48983a6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 02:17:22 +0100 Subject: [PATCH 046/155] Refactor estimate_error() --- src/llama-quant.cpp | 131 ++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 65 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6e3aa3f83d..3c358fb67e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -742,38 +742,33 @@ static std::unordered_map target_bpw_type( const size_t sample_row_count = sample_element_count / (size_t)n_per_row; if 
(sample_row_count == 0) { return 0.0; } - const size_t row_size = ggml_row_size(quant_type, n_per_row); - const size_t buffer_size = row_size * sample_row_count; - if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } + const size_t row_sz = ggml_row_size(quant_type, n_per_row); + const size_t buffer_sz = row_sz * sample_row_count; + + if (quantized_buffer.size() < buffer_sz) { quantized_buffer.resize(buffer_sz); } if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - std::vector row_sq_norm(sample_row_count, 0.0); - std::vector bias_denominator_per_slice(ne2, 0.0); + const bool has_values = values_sample != nullptr; + const bool has_activations = activations_sample != nullptr; - // Precompute bias denominator per slice - const bool has_values = (values_sample != nullptr); - const bool has_activations = (activations_sample != nullptr); + // Bias denominators per slice (only needed if we have activations) + std::vector bias_denominator_per_slice(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { const float * values = has_values ? values_sample + s * n_per_row : nullptr; const float * activations = activations_sample + s * n_per_row; - double bias_denominator = 0.0; - if (has_values) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double a = activations[j]; - bias_denominator += values[j] * a * a; - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const double a = activations[j]; - bias_denominator += a * a; - } + double denom = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = activations[j]; + const double w = values ? values[j] : 1.0; + denom += w * a * a; } - bias_denominator_per_slice[s] = bias_denominator; + bias_denominator_per_slice[s] = denom; } } - // Compute squared norms of sampled rows + // Compute per-row squared norms with weighting (if values are provided) + std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; size_t row_idx = 0; @@ -784,18 +779,18 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + s * n_per_row : nullptr; for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * row = f32_sample.data() + offset; + const float * x = f32_sample.data() + offset; double rsn = 0.0; - if (has_values) { + if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; - const double x = row[j]; - rsn += v * x * x; + const double v = values[j]; + const double xx = x[j]; + rsn += v * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double x = row[j]; - rsn += x * x; + const double xx = x[j]; + rsn += xx * xx; } } row_sq_norm[row_idx] = rsn; @@ -805,35 +800,44 @@ static std::unordered_map target_bpw_type( } // Quantize sampled rows slice-by-slice into quantized_buffer - size_t quantised_offset = 0; - size_t floats_offset = 0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; - if (rs == 0) { continue; } + { + size_t q_offset = 0; + size_t f_offset = 0; + for (int64_t slice = 0; slice < ne2; ++slice) { + const int64_t rs = sample_rows_per_slice[slice]; + if (rs == 0) { continue; } - const float * value = values_sample ? values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(quant_type, f32_sample.data() + floats_offset, quantized_buffer.data() + quantised_offset, 0, rs, n_per_row, value); + const float * value = has_values ? 
values_sample + slice * n_per_row : nullptr; + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - quantised_offset += row_size * (size_t)rs; - floats_offset += (size_t)rs * (size_t)n_per_row; + q_offset += row_sz * (size_t)rs; + f_offset += (size_t)rs * (size_t)n_per_row; + } } // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (quant_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); - } else if (quant_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); - } else { - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); - return 1e35; - } - const size_t row_size = ggml_row_size(quant_type, n_per_row); - for (size_t r = 0; r < sample_row_count; ++r) { - traits->to_float(quantized_buffer.data() + r * row_size, dequantized_buffer.data() + r * n_per_row, (int)n_per_row); + auto row_to_float = [&](size_t r) { + uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + if (quant_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (quant_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else { + if (!traits || !traits->to_float) { + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); + return false; + } + traits->to_float(src, dst, (int)n_per_row); } + + return true; + }; + + for (size_t r = 0; r < sample_row_count; ++r) { + if (!row_to_float(r)) { return 1e35; } } } @@ -847,20 +851,22 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; - const double bias_denominator = has_activations ? bias_denominator_per_slice[slice] : 0.0; + const double bias_denom = has_activations ? 
bias_denominator_per_slice[slice] : 0.0; + double slice_err = 0.0; + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; double weighted_mse = 0.0; - double bias_numerator = 0.0; + double bias_num = 0.0; if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { const double v = values[j]; const double e = y[j] - x[j]; const double a = activations[j]; weighted_mse += v * e * e; - bias_numerator += v * e * a; + bias_num += v * e * a; } } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -873,7 +879,7 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; const double a = activations[j]; weighted_mse += e * e; - bias_numerator += e * a; + bias_num += e * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { @@ -882,24 +888,19 @@ static std::unordered_map target_bpw_type( } } - double err_numerator = weighted_mse; + constexpr float bias_lambda = 1.75f; constexpr double epsilon = 1e-12; - constexpr float bias_lambda = 1.0; - //bias_lambda defines the weight of the bias term in the weigthed MSE error function - // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, - // 2.0 means twice as much weight for bias, etc. Default is 1.0. - if (activations && bias_lambda != 0.0) { - const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); - err_numerator += bias_lambda * proj; + double err_num = weighted_mse; + if (activations && bias_lambda != 0.0f) { + const double proj = bias_num * bias_num / (bias_denom + epsilon); + err_num += (double)bias_lambda * proj; } - const double err_denominator = row_sq_norm[row_idx] + epsilon; - const double row_err = err_numerator / err_denominator; - slice_err += row_err; + const double err_den = row_sq_norm[row_idx] + epsilon; + slice_err += err_num / err_den; offset += (size_t)n_per_row; } - // scale to full rows (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } From 68ae5e66cea41457a3ed11018374b64e2f94d3d3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 02:50:55 +0100 Subject: [PATCH 047/155] Improve list of candidate types --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3c358fb67e..392a23b5ca 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1023,21 +1023,20 @@ static std::unordered_map target_bpw_type( size_t total_sampled_rows = f32_sample.size() / n_per_row; // Build list of candidate types first (compatible ones) - std::vector quant_candidates; - if (is_iq(params->ftype)) { - quant_candidates.assign(std::begin(iq_quants), std::end(iq_quants)); - } else { - quant_candidates.assign(std::begin(k_quants), std::end(k_quants)); - } + const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; + const size_t base_sz = is_iq(params->ftype) ? 
sizeof(iq_quants) / sizeof(iq_quants[0]) : sizeof(k_quants) / sizeof(k_quants[0]); - // Compute maximum row size among compatible candidates (to size quantized_buffer once) size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; + std::vector compatible_candidates; - compatible_candidates.reserve(quant_candidates.size()); - for (ggml_type ts_type : quant_candidates) { + compatible_candidates.reserve(base_sz); + + for (size_t i = 0; i < base_sz; ++i) { + ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type) , name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", + __func__, ggml_type_name(ts_type), name.c_str()); continue; } ggml_type tt = make_compatible(t, ts_type); From decafae27060ed923c69ce3b89db505538a9b230 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 11:30:11 +0100 Subject: [PATCH 048/155] Adjust bias_lambda --- src/llama-quant.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 392a23b5ca..4ce651723f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -888,7 +888,9 @@ static std::unordered_map target_bpw_type( } } - constexpr float bias_lambda = 1.75f; + // abias_lambda djusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce a smaller bias even if the MSE is slightly larger + constexpr float bias_lambda = 1.5f; constexpr double epsilon = 1e-12; double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { @@ -1024,7 +1026,7 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; - const size_t base_sz = is_iq(params->ftype) ? sizeof(iq_quants) / sizeof(iq_quants[0]) : sizeof(k_quants) / sizeof(k_quants[0]); + const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; From 3856d60328349c5b2a4e381d6fdff20d272415ab Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 14:45:07 +0100 Subject: [PATCH 049/155] Restrict quant types per family --- src/llama-quant.cpp | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ce651723f..7615376e31 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -628,11 +628,7 @@ static std::unordered_map target_bpw_type( constexpr ggml_type k_quants[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, - GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, @@ -646,19 +642,12 @@ static std::unordered_map target_bpw_type( constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, - GGML_TYPE_IQ1_M, - GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, - // TODO: add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it? 
- GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 }; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { @@ -888,8 +877,8 @@ static std::unordered_map target_bpw_type( } } - // abias_lambda djusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce a smaller bias even if the MSE is slightly larger + // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger constexpr float bias_lambda = 1.5f; constexpr double epsilon = 1e-12; double err_num = weighted_mse; From 61c0e01f500ef2610904045c6a7852956c7ba6ba Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 24 Aug 2025 13:36:03 +0100 Subject: [PATCH 050/155] Execute bpw_overrides() only if an imatrix file is provided --- src/llama-quant.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7615376e31..4ed9454068 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1525,9 +1525,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::unordered_map bpw_overrides = {}; - if (params->target_bpw != -1.0f) { - LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); + if (params->target_bpw != -1.0f && !params->only_copy) { + if (params->imatrix) { + if (params->activations) { + LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n", __func__); + } else { + LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); + } + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); + bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); + } else { + LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__); + } } int cur_split = -1; From d4ac2106fb5b9e1a98d6aef8a0931e73e46f324e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 24 Aug 2025 13:39:10 +0100 Subject: [PATCH 051/155] Improve logging and some minor code refactoring --- src/llama-quant.cpp | 26 +++++++++++++++----------- tools/quantize/quantize.cpp | 7 +------ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ed9454068..407a63d887 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -132,7 +132,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map< for (const auto & p : mapped) { if (p.second == blk) { - LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first); return new_name.replace(match.position(1), match.length(1), std::to_string(p.first)); } } @@ -1257,7 +1256,7 @@ static std::unordered_map target_bpw_type( // Build the override map std::unordered_map overrides; - LLAMA_LOG_INFO("%s: - estimated tensor quantization mix to achieve %.4f bpw at lowest ppl\n", __func__, target_bpw); + LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", __func__); for (const auto & ti : all) { 
LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); @@ -1352,7 +1351,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { - LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(values_data->size())); + LLAMA_LOG_INFO("================================ Have weights data with %d entries",int(values_data->size())); qs.has_imatrix = true; // check imatrix for nans or infs for (const auto & kv : *values_data) { @@ -1367,7 +1366,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->activations) { activations_data = static_cast>*>(params->activations); if (activations_data) { - LLAMA_LOG_INFO("================================ Have activations data with %d entries\n",int(activations_data->size())); + LLAMA_LOG_INFO(" and %d activations",int(activations_data->size())); qs.has_activations = true; // check activations for nans or infs for (const auto & kv : *activations_data) { @@ -1379,6 +1378,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } + LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -1655,12 +1655,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (!params->pure && ggml_is_quantized(default_type)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - // get bpw override - const auto override = bpw_overrides.find(name); - if (override != bpw_overrides.end() && override->second != new_type) { - LLAMA_LOG_DEBUG("(bpw overriding %s) ", ggml_type_name(new_type)); - new_type = override->second; + + // get quantization type overrides targeting a given bits per weight budget + if (params->target_bpw != -1.0f && !bpw_overrides.empty()) { + const auto override = bpw_overrides.find(name); + if (override != bpw_overrides.end() && override->second != new_type) { + LLAMA_LOG_DEBUG("(bpw override %s) ", ggml_type_name(new_type)); + new_type = override->second; + } } + // unless the user specifies a type, and the tensor shape will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); @@ -1668,7 +1672,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (qtype != new_type) { - LLAMA_LOG_DEBUG("(type overriding %s) ", ggml_type_name(new_type)); + LLAMA_LOG_DEBUG("(type override %s) ", ggml_type_name(new_type)); new_type = qtype; // if two or more types are specified for the same tensor, the last match wins } } @@ -1699,7 +1703,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (values_data) { auto it = values_data->find(remap_imatrix(tensor->name, mapped)); if (it == values_data->end()) { - LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); + LLAMA_LOG_INFO("\n====== %s: did not find weights for %s, ", __func__, tensor->name); } else { if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { imatrix = it->second.data(); diff --git 
a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index b907008cb4..77fa6b90ce 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -399,12 +399,7 @@ static int prepare_imatrix(const std::string & imatrix_file, values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); } - if (!values_data.empty()) { - printf("%s: have %d importance matrix value entries\n", __func__, int(values_data.size())); - } - if (!activations_data.empty()) { - printf("%s: have %d importance matrix activation entries\n", __func__, int(activations_data.size())); - } + return m_last_call; } From 4286690019f21cae3abb92a7903c6675a3367e5e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 26 Aug 2025 21:39:40 +0100 Subject: [PATCH 052/155] Minor comment update --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 407a63d887..cbbfdedfbd 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor type overrides to meet target BPW at lowest ppl +// Returns per-tensor type overrides to meet target BPW at lowest error static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, From 04946114c9009cd04f665ed98b55304e376e19d3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:01:03 +0100 Subject: [PATCH 053/155] Refactor epsilon into a function-wide variable --- src/llama-quant.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cbbfdedfbd..da1267ddbc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -649,6 +649,8 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q8_0 }; + constexpr double epsilon = 1e-12; + auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); @@ -1193,7 +1195,7 @@ static std::unordered_map target_bpw_type( double err = cur.error - nxt.error; err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); - if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { + if (ratio > best.ratio + epsilon || (std::abs(ratio - best.ratio) <= epsilon && delta_bytes < best.delta_bytes)) { best = upgrade{ i, j, err, delta_bytes, ratio }; } } @@ -1208,7 +1210,7 @@ static std::unordered_map target_bpw_type( size_t now_bytes = current_total_bytes(); size_t next_bytes = now_bytes + up.delta_bytes; double bpw_next = (double)next_bytes * 8.0 / (double)tw; - if (bpw_next <= target_bpw + 1e-12) { + if (bpw_next <= target_bpw + epsilon) { all[up.idx].choice = up.next; bpw_now = bpw_next; } else { @@ -1241,7 +1243,7 @@ static std::unordered_map target_bpw_type( double ratio = err / (double)(delta_bytes * 8ull); double over_gap = std::abs(bpw_over - (double)target_bpw); - if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { + if (over_gap < best_over_gap - epsilon || (std::abs(over_gap - best_over_gap) <= epsilon && ratio > best_over.ratio)) { best_over_gap = over_gap; best_over = upgrade{ i, j, err, delta_bytes, ratio }; } From 8df1d00ae4042a1eee38c1fc9ac06137d5ce5078 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:04:28 +0100 Subject: [PATCH 054/155] Add 
directional scaling --- src/llama-quant.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index da1267ddbc..a9621eab8e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -900,6 +900,27 @@ static std::unordered_map target_bpw_type( return std::isfinite(total_err) ? total_err : 1e35; }; + auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { + if (!activations) { return 1.0f; } + // Compute dominance = ||sqrt(v).*a||_2 / (RMS(a)*sqrt(sum(v))) + // If no values, use v=1 + double sum_v = 0.0; + double sum_aw2 = 0.0; + double sum_a2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double v = values ? std::max(0.0f, values[j]) : 1.0; + const double a = activations[j]; + sum_v += v; + sum_aw2 += v * a * a; + sum_a2 += a * a; + } + const double rms_a = std::sqrt(sum_a2 / std::max(1.0, (double)n_per_row)); + const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); + const double scale = denom > 0.0 ? std::sqrt(sum_aw2) / denom : 1.0; + + // Clamp to a reasonable range + return (float)std::clamp(scale, 0.5, 2.0); + }; std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { From 66aff8fa1ee1d34c7faaa0ff658a730a9554ef36 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:06:42 +0100 Subject: [PATCH 055/155] Add precise_lambda() --- src/llama-quant.cpp | 102 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a9621eab8e..662760fbe9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -921,6 +921,108 @@ static std::unordered_map target_bpw_type( // Clamp to a reasonable range return (float)std::clamp(scale, 0.5, 2.0); }; + + // Returns an adaptive lambda for this tensor using a small probe set + // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger + auto precise_lambda = [&](const ggml_tensor * t, + const std::vector & f32_sample, + const std::vector & sample_rows_per_slice, + const float * values, + const float * activations, + const std::vector & compatible_candidates) -> float + { + // No activations => no projection term + if (!activations) { return 0.0f; } + + // pick a tiny probe set: try to spread around mid-range types + std::vector probes; + probes.reserve(3); + auto push_if = [&](const ggml_type tiny) { + if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) { + probes.push_back(tiny); + } + }; + + // Prefer family-consistent probes; fall back to whatever exists + push_if(GGML_TYPE_Q4_K); + push_if(GGML_TYPE_Q3_K); + push_if(GGML_TYPE_Q5_K); + if (probes.empty() && !compatible_candidates.empty()) { + probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); + } + if (probes.size() == 1 && compatible_candidates.size() >= 2) { + probes.push_back(compatible_candidates.front()); + } + if (probes.empty()) { return 0.0f; } + + // Scratch buffers (reused) + const int64_t n_per_row = t->ne[0]; + const size_t total_sampled_rows = f32_sample.size() / n_per_row; + size_t max_row_sz = 0; + for (auto pt : probes) { + max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); + } + std::vector quantized_buffer(max_row_sz * total_sampled_rows); + std::vector 
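The dominance formula in directional_scale reads most naturally as a correlation check between importance and activation energy. With v the imatrix values and a the mean activations, it is equivalent to

    \text{scale} \;=\; \operatorname{clamp}\!\left(\frac{\lVert \sqrt{v}\odot a\rVert_2}{\sqrt{\sum_j v_j}\,\cdot\,\operatorname{RMS}(a)},\; 0.5,\; 2\right) \;=\; \operatorname{clamp}\!\left(\sqrt{\frac{\operatorname{mean}(v\,a^2)}{\operatorname{mean}(v)\cdot\operatorname{mean}(a^2)}},\; 0.5,\; 2\right)

so it sits at 1 whenever either v or a is flat and only rises above 1 when activation energy concentrates on high-importance channels (the epsilon guards in the code are dropped here for clarity).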
dequantized_buffer(f32_sample.size()); + + std::vector ratios; + ratios.reserve(probes.size()); + + for (const auto pt : probes) { + // err at lambda=0 => pure weighted MSE part + double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); + // err at lambda=1 => weighted MSE + projection penalty + const double err1 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 1.0f); + + const double p = std::max(0.0, err1 - err0); // projection term contribution + const double m = std::max(0.0, err0); // MSE term contribution + if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { + ratios.push_back(m / p); + } + } + + if (ratios.empty()) { return 0.0f; } + + std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); + double lambda = ratios[ratios.size() / 2]; + + // activations directional scale + const float scale = directional_scale(values, activations, n_per_row); + lambda *= scale; + + // clamp to safe range + lambda = std::clamp(lambda, 0.0, 8.0); + return (float)lambda; + }; + + auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { + if (!activations) { return 0.0f; } + double s = 0.0; + double s2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = values ? std::max(0.0f, values[j]) : 1.0; + const double aw = std::sqrt(w) * activations[j]; + const double aw2 = aw * aw; + s += aw2; + s2 += aw2 * aw2; + } + if (s2 <= 0.0) { return 0.0f; } + const auto d = (double)n_per_row; + //const double p = s * s / (d * s2 + epsilon); + //const double lambda = 8.0 * std::clamp(1.0 - p, 0.0, 1.0); + // Map p in (0,1] to lambda in [0,8] decreasing + double base = 1.0 - s * s / (d * s2 + epsilon); + base = std::clamp(base, 0.0, 1.0); + + // activations directional scale + const double scale = directional_scale(values, activations, n_per_row); + // clamp to safe range + const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; + + return (float)lambda; + }; + std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { From 556f6b04fed2092568e31948708af8102c9e5433 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:08:08 +0100 Subject: [PATCH 056/155] Add --precise-lambda option --- include/llama.h | 1 + src/llama-quant.cpp | 27 +++++++++++++++++---------- tools/quantize/quantize.cpp | 6 +++++- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/llama.h b/include/llama.h index 01c5b67c75..3a5bda32ea 100644 --- a/include/llama.h +++ b/include/llama.h @@ -357,6 +357,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) + bool precise_lambda; // use precise_lambda calculation - slow computation but very accurate } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 662760fbe9..98fc11d840 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -722,7 +722,8 @@ static std::unordered_map target_bpw_type( const float * values_sample, const float * activations_sample, std::vector & quantized_buffer, - std::vector & dequantized_buffer) -> double + std::vector & dequantized_buffer, + float bias_lambda) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; @@ -878,10 +879,6 
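fast_lambda avoids probe quantization entirely and looks only at how concentrated the activation energy is; the directional_scale factor folded in at this stage is removed again in a later patch, so it is left out below. A toy check of the concentration heuristic (simplified names, not the patch code):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // s^2 / (d * s2) is a participation ratio of the per-channel energy w_j * a_j^2:
    // it is 1.0 when that energy is spread evenly and about 1/d when one channel dominates,
    // so lambda = 8 * (1 - ratio) stays near 0 for flat activations and approaches 8 for spiky ones.
    static float fast_lambda_sketch(const std::vector<float> & w, const std::vector<float> & a) {
        double s = 0.0, s2 = 0.0;
        const size_t d = a.size();
        for (size_t j = 0; j < d; ++j) {
            const double aw2 = std::max(0.0f, w[j]) * (double) a[j] * (double) a[j];
            s += aw2; s2 += aw2 * aw2;
        }
        if (s2 <= 0.0) return 0.0f;
        const double base = std::clamp(1.0 - s * s / ((double) d * s2 + 1e-12), 0.0, 1.0);
        return (float) (8.0 * base);
    }

    int main() {
        const std::vector<float> w(256, 1.0f);
        std::vector<float> flat(256, 1.0f), spiky(256, 0.01f);
        spiky[0] = 10.0f;                                            // one dominant channel
        std::printf("flat:  %.2f\n", fast_lambda_sketch(w, flat));   // ~0.00
        std::printf("spiky: %.2f\n", fast_lambda_sketch(w, spiky));  // ~7.97
        return 0;
    }

precise_lambda, by contrast, quantizes a few mid-range probe types and takes the median of the MSE-to-projection ratios, which picks a lambda that makes the two error terms comparable in size.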
@@ static std::unordered_map target_bpw_type( } } - // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger - constexpr float bias_lambda = 1.5f; - constexpr double epsilon = 1e-12; double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { const double proj = bias_num * bias_num / (bias_denom + epsilon); @@ -1163,6 +1160,15 @@ static std::unordered_map target_bpw_type( std::sort(compatible_candidates.begin(), compatible_candidates.end()); compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); + // Compute adaptive bias_lambda for this tensor + float bias_lambda = 0.0f; + { + const float * values = values_sample.empty() ? nullptr : values_sample.data(); + const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); + bias_lambda = params->precise_lambda ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates) : + fast_lambda(values, activations, n_per_row); + } + // Now evaluate candidates std::vector eval_candidates(compatible_candidates.size()); const float * values = values_sample.empty() ? nullptr : values_sample.data(); @@ -1186,7 +1192,7 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); const size_t bytes = tensor_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); @@ -1301,7 +1307,6 @@ static std::unordered_map target_bpw_type( }; auto recompute_best_upgrade = [&]() -> upgrade { - const double eps = 1e-12; upgrade best{ -1, -1, 0.0, 0, -1.0 }; for (int i = 0; i < (int) all.size(); ++i) { const auto & ti = all[i]; @@ -1653,10 +1658,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { if (params->activations) { - LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n", __func__); + LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ",__func__); } else { - LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); + LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } + LLAMA_LOG_INFO("using %s\n", params->precise_lambda ? 
"precise lambda (slow)" : "fast lambda"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { @@ -1966,7 +1972,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, - /*.target_bpw =*/ -1.0f + /*.target_bpw =*/ -1.0f, + /*.precise_lambda =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 77fa6b90ce..0c9460513c 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,9 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0 \n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); + printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --precise-lambda: given a target bpw, use a high-precision error computation at the expense of longer processing times\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -538,6 +540,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--precise-lambda") == 0) { + params.precise_lambda = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From eab8708244db703c5c7219261b0c875c4b57825f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 30 Aug 2025 10:14:46 +0100 Subject: [PATCH 057/155] Minor factoring for efficiency and correctness --- src/llama-quant.cpp | 126 +++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 66 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 98fc11d840..db688fdf02 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor type overrides to meet target BPW at lowest error +// Returns tensor type overrides to meet a global bpw target static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, @@ -650,6 +650,7 @@ static std::unordered_map target_bpw_type( }; constexpr double epsilon = 1e-12; + constexpr double infinity = std::numeric_limits::infinity(); auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; @@ -680,7 +681,7 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { - // This list should be kept in sync with llama_tensor_quantize_impl() + // This list should be kept in sync with 
llama_tensor_quantize_impl() to avoid drift const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; q &= ggml_n_dims(t) >= 2; @@ -730,9 +731,15 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const size_t sample_element_count = f32_sample.size(); - const size_t sample_row_count = sample_element_count / (size_t)n_per_row; + const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; if (sample_row_count == 0) { return 0.0; } + size_t expected_rows = 0; + for (int64_t s = 0; s < ne2; ++s) { + expected_rows += (size_t)sample_rows_per_slice[s]; + } + if (expected_rows != sample_row_count) { return infinity; } + const size_t row_sz = ggml_row_size(quant_type, n_per_row); const size_t buffer_sz = row_sz * sample_row_count; @@ -750,15 +757,15 @@ static std::unordered_map target_bpw_type( const float * activations = activations_sample + s * n_per_row; double denom = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { + const double w = values ? std::max(0.0f, values[j]) : 1.0; const double a = activations[j]; - const double w = values ? values[j] : 1.0; denom += w * a * a; } bias_denominator_per_slice[s] = denom; } } - // Compute per-row squared norms with weighting (if values are provided) + // Per-row squared norms with weighting std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; @@ -768,15 +775,14 @@ static std::unordered_map target_bpw_type( if (rs == 0) { continue; } const float * values = has_values ? values_sample + s * n_per_row : nullptr; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; double rsn = 0.0; if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double xx = x[j]; - rsn += v * xx * xx; + rsn += w * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { @@ -790,7 +796,7 @@ static std::unordered_map target_bpw_type( } } - // Quantize sampled rows slice-by-slice into quantized_buffer + // Quantize sampled rows per slice -> quantized_buffer { size_t q_offset = 0; size_t f_offset = 0; @@ -800,35 +806,32 @@ static std::unordered_map target_bpw_type( const float * value = has_values ? 
values_sample + slice * n_per_row : nullptr; (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - q_offset += row_sz * (size_t)rs; f_offset += (size_t)rs * (size_t)n_per_row; } } - // Dequantize into dequantized_buffer + // quantized_buffer -> dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - auto row_to_float = [&](size_t r) { - uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - if (quant_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (quant_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - } else { - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); - return false; + + const bool is_fp16 = quant_type == GGML_TYPE_F16; + const bool is_bf16 = quant_type == GGML_TYPE_BF16; + if (!is_fp16 && !is_bf16 && traits && traits->to_float) { + traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_row_count * (size_t)n_per_row)); + } else { + for (size_t r = 0; r < sample_row_count; ++r) { + uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; + if (is_fp16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); + } else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); + } else { + if (!traits || !traits->to_float) { return infinity; } + traits->to_float(src, dst, (int)n_per_row); } - traits->to_float(src, dst, (int)n_per_row); } - - return true; - }; - - for (size_t r = 0; r < sample_row_count; ++r) { - if (!row_to_float(r)) { return 1e35; } } } @@ -836,6 +839,7 @@ static std::unordered_map target_bpw_type( size_t offset = 0; size_t row_idx = 0; double total_err = 0.0; + for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } @@ -843,9 +847,7 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; const double bias_denom = has_activations ? 
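The quantize/dequantize round trip at the heart of estimate_error can be exercised on its own. A minimal per-row probe along those lines, using the same ggml entry points the patch calls; this is a sketch rather than code from the patch, and the block-size and trait assumptions are stated in the comment:

    #include "ggml.h"

    #include <cstdint>
    #include <vector>

    // Assumes n is a multiple of the block size of `type` (256 for the k-quants) and that the
    // type has a to_float trait; imatrix may be nullptr for types that do not require one.
    static double row_rel_error(enum ggml_type type, const float * x, int64_t n, const float * imatrix) {
        std::vector<uint8_t> q(ggml_row_size(type, n));
        std::vector<float>   y((size_t) n);

        ggml_quantize_chunk(type, x, q.data(), /*start =*/ 0, /*nrows =*/ 1, /*n_per_row =*/ n, imatrix);
        ggml_get_type_traits(type)->to_float(q.data(), y.data(), n);

        double num = 0.0, den = 0.0;
        for (int64_t j = 0; j < n; ++j) {
            const double e = (double) y[j] - (double) x[j];
            num += e * e;
            den += (double) x[j] * x[j];
        }
        return num / (den + 1e-12);   // relative squared error for this row
    }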
bias_denominator_per_slice[slice] : 0.0; - double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; @@ -853,17 +855,17 @@ static std::unordered_map target_bpw_type( double bias_num = 0.0; if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double e = y[j] - x[j]; const double a = activations[j]; - weighted_mse += v * e * e; - bias_num += v * e * a; + weighted_mse += w * e * e; + bias_num += w * e * a; } } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double e = y[j] - x[j]; - weighted_mse += v * e * e; + weighted_mse += w * e * e; } } else if (activations) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -881,26 +883,28 @@ static std::unordered_map target_bpw_type( double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { - const double proj = bias_num * bias_num / (bias_denom + epsilon); - err_num += (double)bias_lambda * proj; + if (bias_denom > 0.0) { + const double proj = bias_num * bias_num / (bias_denom + epsilon); + err_num += bias_lambda * proj; + } } - const double err_den = row_sq_norm[row_idx] + epsilon; - slice_err += err_num / err_den; + const double denom = row_sq_norm[row_idx] + epsilon; + slice_err += err_num / denom; offset += (size_t)n_per_row; } const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; + if (!std::isfinite(total_err)) { return infinity; } } - return std::isfinite(total_err) ? total_err : 1e35; + return std::isfinite(total_err) ? total_err : infinity; }; + // Scaling factor to increase lambda when activations are concentrated auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { if (!activations) { return 1.0f; } - // Compute dominance = ||sqrt(v).*a||_2 / (RMS(a)*sqrt(sum(v))) - // If no values, use v=1 double sum_v = 0.0; double sum_aw2 = 0.0; double sum_a2 = 0.0; @@ -915,13 +919,10 @@ static std::unordered_map target_bpw_type( const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); const double scale = denom > 0.0 ? 
std::sqrt(sum_aw2) / denom : 1.0; - // Clamp to a reasonable range return (float)std::clamp(scale, 0.5, 2.0); }; - // Returns an adaptive lambda for this tensor using a small probe set - // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger + // Higher precision but much longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, @@ -929,10 +930,8 @@ static std::unordered_map target_bpw_type( const float * activations, const std::vector & compatible_candidates) -> float { - // No activations => no projection term if (!activations) { return 0.0f; } - // pick a tiny probe set: try to spread around mid-range types std::vector probes; probes.reserve(3); auto push_if = [&](const ggml_type tiny) { @@ -941,7 +940,6 @@ static std::unordered_map target_bpw_type( } }; - // Prefer family-consistent probes; fall back to whatever exists push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q3_K); push_if(GGML_TYPE_Q5_K); @@ -953,19 +951,18 @@ static std::unordered_map target_bpw_type( } if (probes.empty()) { return 0.0f; } - // Scratch buffers (reused) + // Scratch buffers const int64_t n_per_row = t->ne[0]; const size_t total_sampled_rows = f32_sample.size() / n_per_row; size_t max_row_sz = 0; for (auto pt : probes) { max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); } + std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantized_buffer(f32_sample.size()); - std::vector ratios; ratios.reserve(probes.size()); - for (const auto pt : probes) { // err at lambda=0 => pure weighted MSE part double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); @@ -984,17 +981,17 @@ static std::unordered_map target_bpw_type( std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); double lambda = ratios[ratios.size() / 2]; - // activations directional scale const float scale = directional_scale(values, activations, n_per_row); lambda *= scale; - - // clamp to safe range lambda = std::clamp(lambda, 0.0, 8.0); + return (float)lambda; }; + // Faster to compute but lower precision. Best option for the vast majority of models auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { if (!activations) { return 0.0f; } + double s = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { @@ -1004,17 +1001,13 @@ static std::unordered_map target_bpw_type( s += aw2; s2 += aw2 * aw2; } + if (s2 <= 0.0) { return 0.0f; } const auto d = (double)n_per_row; - //const double p = s * s / (d * s2 + epsilon); - //const double lambda = 8.0 * std::clamp(1.0 - p, 0.0, 1.0); - // Map p in (0,1] to lambda in [0,8] decreasing double base = 1.0 - s * s / (d * s2 + epsilon); base = std::clamp(base, 0.0, 1.0); - // activations directional scale const double scale = directional_scale(values, activations, n_per_row); - // clamp to safe range const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; return (float)lambda; @@ -1036,13 +1029,13 @@ static std::unordered_map target_bpw_type( } ml.load_data_for(t); - // Dequantize only sampled rows into f32_sample + // Dequantize sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute - constexpr int sample_rows_per_expert = 384; + // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute + constexpr int sample_rows_per_expert = 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); @@ -1096,6 +1089,7 @@ static std::unordered_map target_bpw_type( const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); if (it == m->end()) { return {nullptr, 0}; } + return { it->second.data(), it->second.size() }; }; @@ -1104,7 +1098,6 @@ static std::unordered_map target_bpw_type( const size_t want = (size_t)ne2 * (size_t)n_per_row; dst.clear(); if (!src || src_sz == 0) { return; } - if (src_sz == want) { dst.resize(want); std::memcpy(dst.data(), src, want * sizeof(float)); @@ -1160,7 +1153,8 @@ static std::unordered_map target_bpw_type( std::sort(compatible_candidates.begin(), compatible_candidates.end()); compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); - // Compute adaptive bias_lambda for this tensor + // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. + // Larger values favours quantisation types that produce smaller bias even if the MSE is slightly bigger float bias_lambda = 0.0f; { const float * values = values_sample.empty() ? nullptr : values_sample.data(); From 04c07b3272f067ba30d32fb82d693fb0013cc47d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 10 Sep 2025 18:00:56 +0100 Subject: [PATCH 058/155] Add better control over MSE and directional bias computation --- include/llama.h | 2 +- src/llama-quant.cpp | 41 +++++++++---------------------------- tools/quantize/quantize.cpp | 31 +++++++++++++++++++++++++--- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/include/llama.h b/include/llama.h index d0ca37dc65..ba6c185346 100644 --- a/include/llama.h +++ b/include/llama.h @@ -365,7 +365,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) - bool precise_lambda; // use precise_lambda calculation - slow computation but very accurate + int32_t bpw_bias; // type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow) } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index db688fdf02..74ceb3de9c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -902,26 +902,6 @@ static std::unordered_map target_bpw_type( return std::isfinite(total_err) ? total_err : infinity; }; - // Scaling factor to increase lambda when activations are concentrated - auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { - if (!activations) { return 1.0f; } - double sum_v = 0.0; - double sum_aw2 = 0.0; - double sum_a2 = 0.0; - for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values ? 
std::max(0.0f, values[j]) : 1.0; - const double a = activations[j]; - sum_v += v; - sum_aw2 += v * a * a; - sum_a2 += a * a; - } - const double rms_a = std::sqrt(sum_a2 / std::max(1.0, (double)n_per_row)); - const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); - const double scale = denom > 0.0 ? std::sqrt(sum_aw2) / denom : 1.0; - - return (float)std::clamp(scale, 0.5, 2.0); - }; - // Higher precision but much longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, @@ -979,11 +959,7 @@ static std::unordered_map target_bpw_type( if (ratios.empty()) { return 0.0f; } std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); - double lambda = ratios[ratios.size() / 2]; - - const float scale = directional_scale(values, activations, n_per_row); - lambda *= scale; - lambda = std::clamp(lambda, 0.0, 8.0); + const double lambda = std::clamp(ratios[ratios.size() / 2], 0.0, 8.0); return (float)lambda; }; @@ -1007,8 +983,7 @@ static std::unordered_map target_bpw_type( double base = 1.0 - s * s / (d * s2 + epsilon); base = std::clamp(base, 0.0, 1.0); - const double scale = directional_scale(values, activations, n_per_row); - const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; + const double lambda = std::clamp(base, 0.0, 1.0) * 8.0; return (float)lambda; }; @@ -1159,8 +1134,11 @@ static std::unordered_map target_bpw_type( { const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - bias_lambda = params->precise_lambda ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates) : - fast_lambda(values, activations, n_per_row); + if (params->bpw_bias == 1) { + bias_lambda = fast_lambda(values, activations, n_per_row); + } else if (params->bpw_bias == 2) { + bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates); + } } // Now evaluate candidates @@ -1656,7 +1634,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } - LLAMA_LOG_INFO("using %s\n", params->precise_lambda ? "precise lambda (slow)" : "fast lambda"); + const char* msg[] = {"no bias (MSE only)", "fast (default)", "precise (slow)"}; + LLAMA_LOG_INFO("using %s error estimation\n", msg[params->bpw_bias]); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { @@ -1967,7 +1946,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, - /*.precise_lambda =*/ false + /*.bpw_bias =*/ 1 }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0c9460513c..0fe65daea0 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -134,7 +134,7 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 16.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --precise-lambda: given a target bpw, use a high-precision error computation at the expense of longer processing times\n"); + printf(" --bpw_bias: type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow)\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -496,6 +496,27 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { return true; } +static bool parse_bpw_bias(const char * data, int & bpw_bias) { + if (!data) { + printf("\n%s: error bias type not provided\n\n", __func__); + return false; + } + + try { + bpw_bias = std::stoi(data); + if (bpw_bias < 0 || bpw_bias > 2) { + printf("\n%s: error bias type must be one of 0 (no bias, MSE only), 1 (fast), or 2 (precise, but slow)\n\n", __func__); + return false; + } + } + catch (const std::exception & e) { + printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); + return false; + } + + return true; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -510,6 +531,7 @@ int main(int argc, char ** argv) { std::vector tensor_types; std::vector prune_layers; float target_bpw = -1.0f; + int bpw_bias = 1; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -540,8 +562,11 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--precise-lambda") == 0) { - params.precise_lambda = true; + } else if (strcmp(argv[arg_idx], "--bpw-bias") == 0) { + if (arg_idx == argc-1 || !parse_bpw_bias(argv[++arg_idx], bpw_bias)) { + usage(argv[0]); + } + params.bpw_bias = bpw_bias; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From 886536d80ab5c227cd6c3f8813b8b5fbf5bea41d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:27:23 +0100 Subject: [PATCH 059/155] Increase error type precision --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 74ceb3de9c..c4c525c68e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -612,7 +612,7 @@ static std::unordered_map target_bpw_type( ggml_type type; float bpw; size_t bytes; - float error; + double error; }; struct tensor_info { From bc8762f27f185c5db1cbd0d8ec3bcc8e1771856d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:33:22 +0100 Subject: [PATCH 060/155] Capture surrounding function name --- src/llama-quant.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c4c525c68e..cae908803b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -651,6 +651,7 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); + const char * func = __func__; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { 
const int64_t n_per_row = t->ne[0]; @@ -1083,7 +1084,7 @@ static std::unordered_map target_bpw_type( } } else { LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", - __func__, name.c_str(), src_sz, (size_t)n_per_row, want); + func, name.c_str(), src_sz, (size_t)n_per_row, want); } }; From 4dff85fbe54336130155a8e4fa5e7f4db48f4451 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:41:37 +0100 Subject: [PATCH 061/155] Improve precise_lambda() efficiency --- src/llama-quant.cpp | 126 ++++++++++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 40 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cae908803b..1677b242d9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -725,7 +725,9 @@ static std::unordered_map target_bpw_type( const float * activations_sample, std::vector & quantized_buffer, std::vector & dequantized_buffer, - float bias_lambda) -> double + float bias_lambda, + double * out_mse = nullptr, + double * out_proj = nullptr) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; @@ -733,13 +735,23 @@ static std::unordered_map target_bpw_type( const size_t sample_element_count = f32_sample.size(); const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; - if (sample_row_count == 0) { return 0.0; } + if (sample_row_count == 0) { + if (out_mse) { *out_mse = 0.0; } + if (out_proj) { *out_proj = 0.0; } + + return 0.0; + } size_t expected_rows = 0; for (int64_t s = 0; s < ne2; ++s) { expected_rows += (size_t)sample_rows_per_slice[s]; } - if (expected_rows != sample_row_count) { return infinity; } + if (expected_rows != sample_row_count) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } const size_t row_sz = ggml_row_size(quant_type, n_per_row); const size_t buffer_sz = row_sz * sample_row_count; @@ -750,7 +762,7 @@ static std::unordered_map target_bpw_type( const bool has_values = values_sample != nullptr; const bool has_activations = activations_sample != nullptr; - // Bias denominators per slice (only needed if we have activations) + // Bias denominators per slice std::vector bias_denominator_per_slice(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { @@ -815,7 +827,6 @@ static std::unordered_map target_bpw_type( // quantized_buffer -> dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - const bool is_fp16 = quant_type == GGML_TYPE_F16; const bool is_bf16 = quant_type == GGML_TYPE_BF16; if (!is_fp16 && !is_bf16 && traits && traits->to_float) { @@ -825,12 +836,19 @@ static std::unordered_map target_bpw_type( uint8_t * src = quantized_buffer.data() + r * row_sz; float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; if (is_fp16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); - } else if (is_bf16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); - } else { - if (!traits || !traits->to_float) { return infinity; } - traits->to_float(src, dst, (int)n_per_row); + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int) n_per_row); + } + else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int) n_per_row); + } + else { + if (!traits || !traits->to_float) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } + traits->to_float(src, dst, (int) n_per_row); } } } @@ -839,8 
+857,8 @@ static std::unordered_map target_bpw_type( // Compute error size_t offset = 0; size_t row_idx = 0; - double total_err = 0.0; - + double total_mse = 0.0; + double total_proj = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } @@ -848,7 +866,11 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; const double bias_denom = has_activations ? bias_denominator_per_slice[slice] : 0.0; - double slice_err = 0.0; + std::vector row_mse_norm; + std::vector row_proj_norm; + row_mse_norm.reserve(rs); + if (activations) { row_proj_norm.reserve(rs); } + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; @@ -868,13 +890,6 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; weighted_mse += w * e * e; } - } else if (activations) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = y[j] - x[j]; - const double a = activations[j]; - weighted_mse += e * e; - bias_num += e * a; - } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; @@ -882,28 +897,64 @@ static std::unordered_map target_bpw_type( } } - double err_num = weighted_mse; - if (activations && bias_lambda != 0.0f) { + const double denom_x = row_sq_norm[row_idx]; + double m_norm = weighted_mse / (denom_x + epsilon); + row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); + + if (activations) { + double p_norm = 0.0; if (bias_denom > 0.0) { const double proj = bias_num * bias_num / (bias_denom + epsilon); - err_num += bias_lambda * proj; + p_norm = std::isfinite(proj) ? proj : 0.0; } + row_proj_norm.push_back(p_norm); } - - const double denom = row_sq_norm[row_idx] + epsilon; - slice_err += err_num / denom; offset += (size_t)n_per_row; } + // Trimmed sum to avoid outlier rows dominating the results + auto trimmed_sum = [&](std::vector & v) -> double { + if (v.empty()) { return 0.0; } + const int64_t n = (int64_t)v.size(); + if (n < 50) { + double s = 0.0; + for (const double z : v) { s += z; } + return s; + } + + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side + k = std::max(0, std::min(k, n / 32)); // but not more than 3.125% + std::nth_element(v.begin(), v.begin() + k, v.end()); + std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + double s = 0.0; + for (int64_t i = k; i < n - k; ++i) { + s += v[i]; + } + + return s; + }; + const double scale_rows = (double)nrows / std::max(1.0, (double)rs); - total_err += slice_err * scale_rows; - if (!std::isfinite(total_err)) { return infinity; } + + total_mse += trimmed_sum(row_mse_norm) * scale_rows; + if (activations) { total_proj += trimmed_sum(row_proj_norm) * scale_rows; } + + if (!std::isfinite(total_mse) || !std::isfinite(total_proj)) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } } + if (out_mse) { *out_mse = total_mse; } + if (out_proj) { *out_proj = total_proj; } + + const double total_err = total_mse + bias_lambda * total_proj; return std::isfinite(total_err) ? 
total_err : infinity; }; - // Higher precision but much longer to compute + // Higher precision but longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, @@ -936,22 +987,17 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const size_t total_sampled_rows = f32_sample.size() / n_per_row; size_t max_row_sz = 0; - for (auto pt : probes) { - max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); - } + for (auto pt : probes) max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantized_buffer(f32_sample.size()); + std::vector ratios; ratios.reserve(probes.size()); for (const auto pt : probes) { - // err at lambda=0 => pure weighted MSE part - double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); - // err at lambda=1 => weighted MSE + projection penalty - const double err1 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 1.0f); - - const double p = std::max(0.0, err1 - err0); // projection term contribution - const double m = std::max(0.0, err0); // MSE term contribution + double m = 0.0; + double p = 0.0; + (void)estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f, &m, &p); if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { ratios.push_back(m / p); } From 7d85993f268d9fa35bea9178f6acf2d72833dffa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:44:41 +0100 Subject: [PATCH 062/155] Minor refactoring --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1677b242d9..15ea36721e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -617,7 +617,7 @@ static std::unordered_map target_bpw_type( struct tensor_info { const llama_model_loader::llama_tensor_weight * w = nullptr; - std::vector candidate = {}; + std::vector candidate; int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; @@ -972,8 +972,8 @@ static std::unordered_map target_bpw_type( } }; - push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q3_K); + push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q5_K); if (probes.empty() && !compatible_candidates.empty()) { probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); @@ -1011,7 +1011,7 @@ static std::unordered_map target_bpw_type( return (float)lambda; }; - // Faster to compute but lower precision. Best option for the vast majority of models + // Faster to compute but may yield lower precision. Best option for the vast majority of cases auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { if (!activations) { return 0.0f; } @@ -1057,12 +1057,10 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute - constexpr int sample_rows_per_expert = 256; + const int sample_rows_per_expert = activations_data ? 
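The trimmed sum keeps a handful of pathological rows from dominating a tensor's error estimate: with 1,000 sampled rows, for example, k = min(floor(0.02*1000), 1000/32) = 20 rows are dropped from each end. A standalone illustration of the effect (a full sort stands in for the nth_element partitioning used in the patch):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    static double trimmed_sum(std::vector<double> v) {
        const long long n = (long long) v.size();
        if (n < 50) { double s = 0.0; for (double z : v) s += z; return s; }
        const long long k = std::min((long long) (0.02 * (double) n), n / 32);  // at most ~3.1% per side
        std::sort(v.begin(), v.end());
        double s = 0.0;
        for (long long i = k; i < n - k; ++i) s += v[i];
        return s;
    }

    int main() {
        std::vector<double> rows(1000, 1e-3);
        rows[500] = 50.0;                                 // one pathological row
        double plain = 0.0;
        for (double r : rows) plain += r;
        std::printf("plain: %.3f  trimmed: %.3f\n", plain, trimmed_sum(rows));  // 50.999 vs 0.960
        return 0;
    }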
512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); - // deterministic sampling seed based on tensor name + fixed constant - std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); std::vector sample_rows_per_slice(ne2, 0); const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); const int64_t stride = std::max(1, nrows_total / sample_rows_max); @@ -1072,6 +1070,7 @@ static std::unordered_map target_bpw_type( const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); for (int64_t slice = 0; slice < ne2; ++slice) { + std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); int64_t current_sampled_rows = 0; int64_t offset = 0; if (stride > 1) { @@ -1084,11 +1083,11 @@ static std::unordered_map target_bpw_type( const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { - const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { - const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { @@ -1211,7 +1210,7 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); const size_t bytes = tensor_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); + const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); @@ -1240,7 +1239,7 @@ static std::unordered_map target_bpw_type( return a.error < b.error; }); - double best_err = std::numeric_limits::infinity(); + double best_err = infinity; size_t last_bytes = std::numeric_limits::max(); for (const auto & c : info.candidate) { // Only keep the best error seen so far at strictly larger byte sizes From 12e816b51199b38a6571141d5f1e5f1039ebe706 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 09:24:23 +0100 Subject: [PATCH 063/155] Replace greedy allocator with lagrangian relaxation --- src/llama-quant.cpp | 278 ++++++++++++++++++++++++++------------------ 1 file changed, 162 insertions(+), 116 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 15ea36721e..a369d50ffe 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1266,152 +1266,198 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } - // Greedy allocation from minimum bpw upward to reach 
target_bpw - auto current_total_bytes = [&]() -> size_t { - size_t b = 0; + // Lagrangian relaxation to minimise error subject to a bpw target constraint + auto total_bytes = [&]() -> size_t { + size_t tb = 0; for (const auto & ti : all) { - b += ti.candidate[ti.choice].bytes; + tb += ti.candidate[ti.choice].bytes; } - return b; + return tb; }; - auto total_weights = [&]() -> size_t { - size_t w = 0; - for (const auto & ti : all) { - w += ti.n_elements; - } + size_t total_elems = 0; + size_t min_bytes = 0; + size_t max_bytes = 0; + for (const auto & ti : all) { + total_elems += (size_t)ti.n_elements; + min_bytes += ti.candidate.front().bytes; // smallest candidate per tensor + max_bytes += ti.candidate.back().bytes; // largest candidate per tensor + } - return w; - }; + if (total_elems == 0) { return {}; } - const size_t tw = total_weights(); - auto current_bpw = [&]() -> double { - return (double)current_total_bytes() * 8.0f / (double)tw; - }; + const double target_bpw = params->target_bpw; + size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); - // Precompute current bpw - double bpw_now = current_bpw(); - - float target_bpw = params->target_bpw; - // If minimal bpw is already above the target, we're constrained by the tensor's shape; return closest (min bpw) - if (bpw_now >= target_bpw) { + auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; + LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func); for (const auto & ti : all) { + LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", + func, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } return overrides; + }; + + if (budget_bytes <= min_bytes) { + for (auto & ti : all) { ti.choice = 0; } + + return emit_overrides(); + } + if (budget_bytes >= max_bytes) { + for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; } + + return emit_overrides(); } - struct upgrade { - int idx; - int next; - double err; - size_t delta_bytes; - double ratio; - }; - - // Find next strictly-larger candidate index for a tensor - auto next_distinct_idx = [&](const tensor_info & ti) -> int { - const auto & cand = ti.candidate; - const auto & cur = cand[ti.choice]; - int j = ti.choice + 1; - while (j < (int)cand.size() && cand[j].bytes == cur.bytes) { - ++j; - } - - return j < (int)cand.size() ? 
j : -1; - }; - - auto recompute_best_upgrade = [&]() -> upgrade { - upgrade best{ -1, -1, 0.0, 0, -1.0 }; - for (int i = 0; i < (int) all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - - const int j = next_distinct_idx(ti); - if (j < 0) { continue; } - - const auto & cur = ti.candidate[ti.choice]; - const auto & nxt = ti.candidate[j]; - const size_t delta_bytes = nxt.bytes - cur.bytes; - if (delta_bytes == 0) { continue; } - - double err = cur.error - nxt.error; - err = std::max(err, 0.0); - double ratio = err / (double)(delta_bytes * 8ull); - if (ratio > best.ratio + epsilon || (std::abs(ratio - best.ratio) <= epsilon && delta_bytes < best.delta_bytes)) { - best = upgrade{ i, j, err, delta_bytes, ratio }; + auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { + choice.resize(all.size()); + bytes = 0; + err = 0.0; + for (size_t i = 0; i < all.size(); ++i) { + const auto & cand = all[i].candidate; + int best_j = 0; + double best_val = infinity; + for (int j = 0; j < (int)cand.size(); ++j) { + const double bits = (double)cand[j].bytes * 8.0; + const double val = cand[j].error + mu * bits; + if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && cand[j].bytes < cand[best_j].bytes)) { + best_val = val; + best_j = j; + } } - } - return best; + choice[i] = best_j; + bytes += cand[best_j].bytes; + err += cand[best_j].error; + } }; - while (true) { - upgrade up = recompute_best_upgrade(); - if (up.idx < 0) { break; } + size_t bytes_lo = 0; + size_t bytes_hi = 0; + size_t bytes_mid = 0; + double mu_lo = 0.0; + double mu_hi = 1.0; + double err_lo = 0.0; + double err_hi = 0.0; + double err_mid = 0.0; + std::vector choice_lo; + std::vector choice_hi; + std::vector choice_mid; + std::vector best_under_choice; + std::vector best_over_choice; - size_t now_bytes = current_total_bytes(); - size_t next_bytes = now_bytes + up.delta_bytes; - double bpw_next = (double)next_bytes * 8.0 / (double)tw; - if (bpw_next <= target_bpw + epsilon) { - all[up.idx].choice = up.next; - bpw_now = bpw_next; - } else { - break; - } - } + lagrange_penalty(mu_lo, choice_lo, bytes_lo, err_lo); - // We might still be below target so we try to find the best upgrade one last time + // increase mu until we get under budget or hit a safety cap { - upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; - double best_over_gap = 1e300; - double under_gap = target_bpw - bpw_now; - size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int) all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - - int j = next_distinct_idx(ti); - if (j < 0) { continue; } - - const auto & cur = ti.candidate[ti.choice]; - const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; - if (delta_bytes == 0) { continue; } - - size_t over_bytes = now_bytes + delta_bytes; - double bpw_over = (double)over_bytes * 8.0 / (double)tw; - double err = cur.error - nxt.error; - if (err < 0.0) { err = 0.0; } - double ratio = err / (double)(delta_bytes * 8ull); - - double over_gap = std::abs(bpw_over - (double)target_bpw); - if (over_gap < best_over_gap - epsilon || (std::abs(over_gap - best_over_gap) <= epsilon && ratio > best_over.ratio)) { - best_over_gap = over_gap; - best_over = upgrade{ i, j, err, delta_bytes, ratio }; + int expand = 0; + while (true) { + lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); + if (bytes_hi <= budget_bytes) { + break; } - } - - if 
(best_over.idx >= 0) { - if (best_over_gap < under_gap) { - all[best_over.idx].choice = best_over.next; + mu_hi *= 2.0; + if (++expand > 60) { + break; } } } - // Build the override map - std::unordered_map overrides; - LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", __func__); - for (const auto & ti : all) { - LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", - __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); - overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; + double best_under_gap = infinity; + double best_over_gap = infinity; + double best_under_err = infinity; + double best_over_err = infinity; + for (int it = 0; it < 40; ++it) { + double mu = 0.5 * (mu_lo + mu_hi); + lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); + + const double gap = std::abs((double)bytes_mid - (double)budget_bytes); + + if (bytes_mid > budget_bytes) { + // Too big, need stronger penalty + mu_lo = mu; + + if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) { + best_over_gap = gap; + best_over_err = err_mid; + best_over_choice = choice_mid; + } + } else { + // Under budget, good candidate + mu_hi = mu; + + if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) { + best_under_gap = gap; + best_under_err = err_mid; + best_under_choice = choice_mid; + } + } } - return overrides; + if (!best_under_choice.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = best_under_choice[i]; + } + } else if (!best_over_choice.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = best_over_choice[i]; + } + } else { + // Pick whichever side we already have, or keep minimal + if (bytes_hi <= budget_bytes && !choice_hi.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = choice_hi[i]; + } + } else { + for (auto & ti : all) { + ti.choice = 0; + } + } + } + + // Spend any remaining budget with best upgrades that still fit (one pass) + { + auto cur_bytes = total_bytes(); + while (true) { + int best_i = -1; + int best_j = -1; + double best_ratio = -1.0; + size_t best_delta = 0; + + for (int i = 0; i < (int)all.size(); ++i) { + const auto & ti = all[i]; + if (ti.choice >= (int)ti.candidate.size() - 1) { + continue; + } + + int j = ti.choice + 1; + while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } + if (j >= (int)ti.candidate.size()) { continue; } + + size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; + if (cur_bytes + delta > budget_bytes) { continue; } + + double err_gain = std::max(0.0, (double)ti.candidate[ti.choice].error - (double)ti.candidate[j].error); + double ratio = err_gain / (double)(delta * 8); + if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { + best_ratio = ratio; + best_delta = delta; + best_i = i; + best_j = j; + } + } + + if (best_i < 0) { break; } + all[best_i].choice = best_j; + cur_bytes += best_delta; + } + } + + return emit_overrides(); } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { From 2b516068e2ef0e51373be32b1917eb7295bcfc54 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 09:41:52 +0100 Subject: [PATCH 064/155] "Convexify" candidate list --- 
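PATCH 063 above replaces the greedy upgrade loop with a Lagrangian relaxation: for a fixed multiplier mu every tensor independently picks the candidate that minimises error + mu * bits, and mu is then bisected until the selected mix fits the byte budget. PATCH 064 below trims each candidate list so the relaxation works on a well-behaved curve. A rough, self-contained sketch of the allocation idea only, using hypothetical cand_t/tensor_t structures in place of the patch's candidate_types/tensor_info (not the patch's actual code):

    #include <vector>

    struct cand_t   { double bits;  double error; };               // hypothetical: bits = bytes * 8 for the whole tensor
    struct tensor_t { std::vector<cand_t> cand; int choice = 0; };  // candidates sorted by size, assumed non-empty

    // For a fixed multiplier mu, every tensor independently minimises error + mu * bits.
    static double sweep(std::vector<tensor_t> & all, double mu) {
        double total_bits = 0.0;
        for (auto & t : all) {
            int best = 0;
            double best_val = t.cand[0].error + mu * t.cand[0].bits;
            for (int j = 1; j < (int) t.cand.size(); ++j) {
                const double val = t.cand[j].error + mu * t.cand[j].bits;
                if (val < best_val) { best_val = val; best = j; }
            }
            t.choice = best;
            total_bits += t.cand[best].bits;
        }
        return total_bits;
    }

    // Bisect mu so the chosen mix fits the budget: a larger mu penalises size more.
    static void allocate(std::vector<tensor_t> & all, double budget_bits) {
        double lo = 0.0, hi = 1.0;
        while (sweep(all, hi) > budget_bits && hi < 1e12) { hi *= 2.0; }   // expand until under budget
        for (int it = 0; it < 40; ++it) {
            const double mid = 0.5 * (lo + hi);
            if (sweep(all, mid) > budget_bits) { lo = mid; } else { hi = mid; }
        }
        sweep(all, hi);   // re-apply the last under-budget assignment into t.choice
    }

The real patch additionally remembers the best under- and over-budget assignments seen during the bisection and then spends any leftover bytes greedily, but the core selection rule is the same.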
src/llama-quant.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a369d50ffe..955e6c12fe 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1257,6 +1257,32 @@ static std::unordered_map target_bpw_type( info.candidate.swap(pruned); } + // Enforce convexity in (bytes, error) curve + { + const auto & c = info.candidate; + if (c.size() >= 3) { + std::vector convex; + convex.reserve(c.size()); + auto slope = [](const candidate_types & a, const candidate_types & b) -> double { + const double dx = (double)b.bytes - (double)a.bytes; + if (dx <= 0.0) { return infinity; } + + return ((double)b.error - (double)a.error) / dx; + }; + + for (const auto & p : c) { + while (convex.size() >= 2) { + double s1 = slope(convex[convex.size() - 2], convex[convex.size() - 1]); + double s2 = slope(convex[convex.size() - 1], p); + if (s2 + epsilon < s1) { convex.pop_back(); } + else { break; } + } + convex.push_back(p); + } + info.candidate.swap(convex); + } + } + // Initialize choice at the smallest bpw candidate info.choice = 0; info.min_bpw = info.candidate.front().bpw; From 8503d59ee44bc30b0d030cceb5e17590b334730d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 11:49:18 +0100 Subject: [PATCH 065/155] Increase IQ options --- src/llama-quant.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 955e6c12fe..41fd819f86 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -641,12 +641,21 @@ static std::unordered_map target_bpw_type( constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, + GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, + GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q8_0, + // TODO: find better way to handle F16/BF16 +#ifdef GGML_USE_METAL + GGML_TYPE_F16 +#else + GGML_TYPE_BF16 +#endif }; constexpr double epsilon = 1e-12; From c709e1a3353cbefbe58320c2eae1a1edafc0f618 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 14 Sep 2025 22:38:27 +0100 Subject: [PATCH 066/155] Fix MoE tensor estimation --- src/llama-quant.cpp | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 41fd819f86..1efb1c5eee 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1021,27 +1021,38 @@ static std::unordered_map target_bpw_type( }; // Faster to compute but may yield lower precision. Best option for the vast majority of cases - auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { + auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) { if (!activations) { return 0.0f; } - double s = 0.0; - double s2 = 0.0; - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = values ? std::max(0.0f, values[j]) : 1.0; - const double aw = std::sqrt(w) * activations[j]; - const double aw2 = aw * aw; - s += aw2; - s2 += aw2 * aw2; + double accum = 0.0; + int ns = 0; + + for (int64_t s = 0; s < std::max(1, ne2); ++s) { + const float * v = values ? values + s * n_per_row : nullptr; + const float * a = activations + s * n_per_row; + + double s1 = 0.0; + double s2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = v ? 
std::max(0.0f, v[j]) : 1.0; + const double aw = std::sqrt(w) * a[j]; + const double aw2 = aw * aw; + s1 += aw2; + s2 += aw2 * aw2; + } + + if (s1 > 0.0) { + const double n = (double)n_per_row; + double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); + double lambda = 8.0 * (c / (c + 1.0)); + accum += std::clamp(lambda, 0.0, 8.0); + ++ns; + } } - if (s2 <= 0.0) { return 0.0f; } - const auto d = (double)n_per_row; - double base = 1.0 - s * s / (d * s2 + epsilon); - base = std::clamp(base, 0.0, 1.0); + if (ns == 0) { return 0.0f; } - const double lambda = std::clamp(base, 0.0, 1.0) * 8.0; - - return (float)lambda; + return (float)(accum / ns); }; std::vector all; @@ -1190,7 +1201,7 @@ static std::unordered_map target_bpw_type( const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); if (params->bpw_bias == 1) { - bias_lambda = fast_lambda(values, activations, n_per_row); + bias_lambda = fast_lambda(values, activations, n_per_row, ne2); } else if (params->bpw_bias == 2) { bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates); } From 14fae69a7bb932fadbc5dd62072a254866512650 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 20 Sep 2025 21:31:31 +0100 Subject: [PATCH 067/155] General refactoring --- src/llama-quant.cpp | 75 +++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c6051a480c..6e5562379c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -729,19 +729,19 @@ static std::unordered_map target_bpw_type( auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, const std::vector & f32_sample, - const std::vector & sample_rows_per_slice, + const std::vector & rows_sample, const float * values_sample, const float * activations_sample, std::vector & quantized_buffer, std::vector & dequantized_buffer, - float bias_lambda, + float tensor_bias_lambda, + const float * slice_bias_lambda, double * out_mse = nullptr, double * out_proj = nullptr) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t sample_element_count = f32_sample.size(); const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; if (sample_row_count == 0) { @@ -753,8 +753,9 @@ static std::unordered_map target_bpw_type( size_t expected_rows = 0; for (int64_t s = 0; s < ne2; ++s) { - expected_rows += (size_t)sample_rows_per_slice[s]; + expected_rows += (size_t)rows_sample[s]; } + if (expected_rows != sample_row_count) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } @@ -783,17 +784,18 @@ static std::unordered_map target_bpw_type( const double a = activations[j]; denom += w * a * a; } + bias_denominator_per_slice[s] = denom; } } - // Per-row squared norms with weighting + // Weighted per-row squared norms std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; size_t row_idx = 0; for (int64_t s = 0; s < ne2; ++s) { - const int64_t rs = sample_rows_per_slice[s]; + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } const float * values = has_values ? 
values_sample + s * n_per_row : nullptr; @@ -823,7 +825,7 @@ static std::unordered_map target_bpw_type( size_t q_offset = 0; size_t f_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; + const int64_t rs = rows_sample[slice]; if (rs == 0) { continue; } const float * value = has_values ? values_sample + slice * n_per_row : nullptr; @@ -843,21 +845,19 @@ static std::unordered_map target_bpw_type( } else { for (size_t r = 0; r < sample_row_count; ++r) { uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; if (is_fp16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int) n_per_row); - } - else if (is_bf16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int) n_per_row); - } - else { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else { if (!traits || !traits->to_float) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } return infinity; } - traits->to_float(src, dst, (int) n_per_row); + traits->to_float(src, dst, (int)n_per_row); } } } @@ -1098,20 +1098,20 @@ static std::unordered_map target_bpw_type( offset = dist(rng); } - for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { + for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { if (src_type == GGML_TYPE_F32) { - const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; + const float * src_row = (const float *)tensor->data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { - const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { - const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { - const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + const uint8_t * qrow = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (!src_traits || !src_traits->to_float) { throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); } @@ -1120,9 +1120,11 @@ static std::unordered_map target_bpw_type( } else { throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); } + ++current_sampled_rows; } - sample_rows_per_slice[slice] = current_sampled_rows; + + rows_sample[slice] = current_sampled_rows; } auto side_data = [&](const std::unordered_map> * m, 
const std::string & tensor_name) -> std::pair { @@ -1160,7 +1162,7 @@ static std::unordered_map target_bpw_type( if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } - const int64_t nelem = ggml_nelements(t); + const int64_t nelem = ggml_nelements(tensor); tensor_info info; info.w = tw; info.n_elements = nelem; @@ -1185,8 +1187,9 @@ static std::unordered_map target_bpw_type( __func__, ggml_type_name(ts_type), name.c_str()); continue; } - ggml_type tt = make_compatible(t, ts_type); - if (!is_compatible(t, tt)) { continue; } + + ggml_type tt = make_compatible(tensor, ts_type); + if (!is_compatible(tensor, tt)) { continue; } compatible_candidates.push_back(tt); max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); } @@ -1222,16 +1225,16 @@ static std::unordered_map target_bpw_type( // thread-local scratch std::vector tl_quantized_buffer(quantized_buffer.size()); std::vector tl_dequantised_buffer(dequantised_buffer.size()); - for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); if (i >= compatible_candidates.size()) { break; } - const ggml_type tt = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(t, tt); - const size_t bytes = tensor_bytes(t, tt); - const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); - eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; + const ggml_type tensor_types = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(tensor, tensor_types); + const size_t bytes = tensor_bytes(tensor, tensor_types); + const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, + tl_quantized_buffer, tl_dequantised_buffer, tensor_lambda, slice_lambda); + eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err }; } }); } @@ -1244,8 +1247,8 @@ static std::unordered_map target_bpw_type( if (info.candidate.empty()) { // As a last resort, keep original type - float bpw = ggml_nbytes(t) * 8.0f / nelem; - info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); + float bpw = ggml_nbytes(tensor) * 8.0f / nelem; + info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. 
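As the comment above says, each tensor's candidate list is first reduced to its Pareto-optimal points in (bytes, error) and, from PATCH 064 onwards, to the lower convex hull of that curve. A minimal sketch of that pruning, assuming a simplified cand_t struct rather than the patch's candidate_types:

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    struct cand_t { size_t bytes; double error; };   // hypothetical stand-in for candidate_types

    // Sort by size, keep only candidates whose error strictly improves on every smaller one,
    // then keep the lower convex hull of the surviving (bytes, error) points.
    static std::vector<cand_t> prune(std::vector<cand_t> c) {
        std::sort(c.begin(), c.end(), [](const cand_t & a, const cand_t & b) {
            return a.bytes != b.bytes ? a.bytes < b.bytes : a.error < b.error;
        });

        std::vector<cand_t> pareto;                                  // Pareto filter
        double best_err = std::numeric_limits<double>::infinity();
        for (const auto & x : c) {
            if (x.error < best_err) { best_err = x.error; pareto.push_back(x); }
        }

        auto slope = [](const cand_t & a, const cand_t & b) {        // error change per extra byte
            return (b.error - a.error) / (double) (b.bytes - a.bytes);
        };
        std::vector<cand_t> hull;                                    // lower convex hull
        for (const auto & x : pareto) {
            while (hull.size() >= 2 && slope(hull[hull.size() - 2], hull.back()) > slope(hull.back(), x)) {
                hull.pop_back();
            }
            hull.push_back(x);
        }
        return hull;
    }

On the hull the error reduction per additional byte is non-increasing, which is what keeps the later per-candidate trade-off (the mu sweep and the final greedy top-up) well behaved.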
@@ -1274,6 +1277,7 @@ static std::unordered_map target_bpw_type( // same bytes: we already sorted by error; skip } } + info.candidate.swap(pruned); } @@ -1299,6 +1303,7 @@ static std::unordered_map target_bpw_type( } convex.push_back(p); } + info.candidate.swap(convex); } } @@ -1312,7 +1317,6 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } - // Lagrangian relaxation to minimise error subject to a bpw target constraint auto total_bytes = [&]() -> size_t { size_t tb = 0; for (const auto & ti : all) { @@ -1359,6 +1363,7 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } + // Lagrangian relaxation to minimise error subject to a bpw target constraint auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { choice.resize(all.size()); bytes = 0; @@ -1406,6 +1411,7 @@ static std::unordered_map target_bpw_type( if (bytes_hi <= budget_bytes) { break; } + mu_hi *= 2.0; if (++expand > 60) { break; @@ -1422,11 +1428,9 @@ static std::unordered_map target_bpw_type( lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); const double gap = std::abs((double)bytes_mid - (double)budget_bytes); - if (bytes_mid > budget_bytes) { // Too big, need stronger penalty mu_lo = mu; - if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) { best_over_gap = gap; best_over_err = err_mid; @@ -1435,7 +1439,6 @@ static std::unordered_map target_bpw_type( } else { // Under budget, good candidate mu_hi = mu; - if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) { best_under_gap = gap; best_under_err = err_mid; From a36946997e2c365e9317062f14e298af6e9928a9 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 20 Sep 2025 21:36:54 +0100 Subject: [PATCH 068/155] Replace fast_bias() for per slice version and remove precise_bias() --- src/llama-quant.cpp | 167 +++++++++++++++----------------------------- 1 file changed, 58 insertions(+), 109 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6e5562379c..fe10365772 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -868,8 +868,9 @@ static std::unordered_map target_bpw_type( size_t row_idx = 0; double total_mse = 0.0; double total_proj = 0.0; + double total_bias = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; + const int64_t rs = rows_sample[slice]; if (rs == 0) { continue; } const float * values = has_values ? 
values_sample + slice * n_per_row : nullptr; @@ -918,21 +919,24 @@ static std::unordered_map target_bpw_type( } row_proj_norm.push_back(p_norm); } + offset += (size_t)n_per_row; } // Trimmed sum to avoid outlier rows dominating the results auto trimmed_sum = [&](std::vector & v) -> double { if (v.empty()) { return 0.0; } + const int64_t n = (int64_t)v.size(); if (n < 50) { double s = 0.0; for (const double z : v) { s += z; } + return s; } - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::max(0, std::min(k, n / 32)); // but not more than 3.125% + int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side + k = std::max(0, std::min(k, n / 32)); // cap at ~3.125% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); double s = 0.0; @@ -944,11 +948,17 @@ static std::unordered_map target_bpw_type( }; const double scale_rows = (double)nrows / std::max(1.0, (double)rs); + const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows; + const double slice_proj = activations ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; - total_mse += trimmed_sum(row_mse_norm) * scale_rows; - if (activations) { total_proj += trimmed_sum(row_proj_norm) * scale_rows; } + total_mse += slice_mse; + total_proj += slice_proj; - if (!std::isfinite(total_mse) || !std::isfinite(total_proj)) { + // per-slice lambda if provided, otherwise use scalar + const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[slice]) : (double)tensor_bias_lambda; + total_bias += bl * slice_proj; + + if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } @@ -959,100 +969,42 @@ static std::unordered_map target_bpw_type( if (out_mse) { *out_mse = total_mse; } if (out_proj) { *out_proj = total_proj; } - const double total_err = total_mse + bias_lambda * total_proj; + const double total_err = slice_bias_lambda ? total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; + return std::isfinite(total_err) ? 
total_err : infinity; }; - // Higher precision but longer to compute - auto precise_lambda = [&](const ggml_tensor * t, - const std::vector & f32_sample, - const std::vector & sample_rows_per_slice, - const float * values, - const float * activations, - const std::vector & compatible_candidates) -> float + // Returns lambda per slice or 0.0 if no activations + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { - if (!activations) { return 0.0f; } - - std::vector probes; - probes.reserve(3); - auto push_if = [&](const ggml_type tiny) { - if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) { - probes.push_back(tiny); - } - }; - - push_if(GGML_TYPE_Q3_K); - push_if(GGML_TYPE_Q4_K); - push_if(GGML_TYPE_Q5_K); - if (probes.empty() && !compatible_candidates.empty()) { - probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); - } - if (probes.size() == 1 && compatible_candidates.size() >= 2) { - probes.push_back(compatible_candidates.front()); - } - if (probes.empty()) { return 0.0f; } - - // Scratch buffers - const int64_t n_per_row = t->ne[0]; - const size_t total_sampled_rows = f32_sample.size() / n_per_row; - size_t max_row_sz = 0; - for (auto pt : probes) max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); - - std::vector quantized_buffer(max_row_sz * total_sampled_rows); - std::vector dequantized_buffer(f32_sample.size()); - - std::vector ratios; - ratios.reserve(probes.size()); - for (const auto pt : probes) { - double m = 0.0; - double p = 0.0; - (void)estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f, &m, &p); - if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { - ratios.push_back(m / p); - } - } - - if (ratios.empty()) { return 0.0f; } - - std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); - const double lambda = std::clamp(ratios[ratios.size() / 2], 0.0, 8.0); - - return (float)lambda; - }; - - // Faster to compute but may yield lower precision. Best option for the vast majority of cases - auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) { - if (!activations) { return 0.0f; } - - double accum = 0.0; - int ns = 0; + std::vector lambdas(std::max(1, ne2), 0.0f); + if (!activations) { return lambdas; } for (int64_t s = 0; s < std::max(1, ne2); ++s) { const float * v = values ? values + s * n_per_row : nullptr; const float * a = activations + s * n_per_row; - double s1 = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { - const double w = v ? std::max(0.0f, v[j]) : 1.0; + const double w = v ? 
std::max(0.0f, v[j]) : 1.0; const double aw = std::sqrt(w) * a[j]; const double aw2 = aw * aw; s1 += aw2; s2 += aw2 * aw2; } + float l = 0.0f; if (s1 > 0.0) { - const double n = (double)n_per_row; - double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); + const auto n = (double)n_per_row; + const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); double lambda = 8.0 * (c / (c + 1.0)); - accum += std::clamp(lambda, 0.0, 8.0); - ++ns; + l = (float)std::clamp(lambda, 0.0, 12.0); } + + lambdas[(size_t)s] = l; } - if (ns == 0) { return 0.0f; } - - return (float)(accum / ns); + return lambdas; }; std::vector all; @@ -1060,32 +1012,33 @@ static std::unordered_map target_bpw_type( for (const auto * tw : tensors) { std::vector workers; workers.reserve(std::max(1, nthread)); - ggml_tensor * t = tw->tensor; - const std::string name = ggml_get_name(t); - if (!can_quantize(t)) { continue; } + ggml_tensor * tensor = tw->tensor; + const std::string name = ggml_get_name(tensor); + if (!can_quantize(tensor)) { continue; } - LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(tensor)); if (!ml.use_mmap) { - if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); } - t->data = buffer.data(); + if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); } + tensor->data = buffer.data(); } - ml.load_data_for(t); + + ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample - const int64_t n_per_row = t->ne[0]; - const int64_t nrows_total = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows_total = tensor->ne[1]; + const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute - const int sample_rows_per_expert = activations_data ? 512 : 256; + // Larger rows_sample_per_expert values may result in more accurate error estimates, but it will take much longer to compute + const int rows_sample_per_expert = activations_data ? 512 : 256; std::vector f32_sample; - f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); - std::vector sample_rows_per_slice(ne2, 0); - const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); - const int64_t stride = std::max(1, nrows_total / sample_rows_max); + std::vector rows_sample(ne2, 0); + const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); + const int64_t stride = std::max(1, nrows_total / rows_sample_max); std::vector row_buffer(n_per_row); - const ggml_type src_type = t->type; + const ggml_type src_type = tensor->type; const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); @@ -1199,23 +1152,20 @@ static std::unordered_map target_bpw_type( // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. 
// Larger values favours quantisation types that produce smaller bias even if the MSE is slightly bigger - float bias_lambda = 0.0f; - { - const float * values = values_sample.empty() ? nullptr : values_sample.data(); - const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - if (params->bpw_bias == 1) { - bias_lambda = fast_lambda(values, activations, n_per_row, ne2); - } else if (params->bpw_bias == 2) { - bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates); - } - } - - // Now evaluate candidates - std::vector eval_candidates(compatible_candidates.size()); + float tensor_lambda = 0.0f; const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); + auto lambdas = estimate_lambda(values, activations, n_per_row, ne2); + double acc = 0.0; + int ns = 0; + for (float l : lambdas) { acc += l; ++ns; } + tensor_lambda = ns ? (float)(acc / ns) : 0.0f; + + // Evaluate candidates + std::vector eval_candidates(compatible_candidates.size()); std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantised_buffer(f32_sample.size()); + const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data(); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; std::vector eval_workers; @@ -1476,7 +1426,6 @@ static std::unordered_map target_bpw_type( int best_j = -1; double best_ratio = -1.0; size_t best_delta = 0; - for (int i = 0; i < (int)all.size(); ++i) { const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { From 9e74f8341120d5f26939267e96fbaba04451d516 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 20 Sep 2025 23:06:37 +0100 Subject: [PATCH 069/155] Replace --bpw-bias flag with --no-bias --- include/llama.h | 2 +- src/llama-quant.cpp | 18 +++++++++------- tools/quantize/quantize.cpp | 42 ++++++++----------------------------- 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/include/llama.h b/include/llama.h index ba6c185346..502bedbb80 100644 --- a/include/llama.h +++ b/include/llama.h @@ -365,7 +365,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) - int32_t bpw_bias; // type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow) + bool no_bias; // use mean square error estimation only (no aligment bias) } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9d7a9f9742..9e7d9d295c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1153,13 +1153,16 @@ static std::unordered_map target_bpw_type( // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. // Larger values favours quantisation types that produce smaller bias even if the MSE is slightly bigger float tensor_lambda = 0.0f; + std::vector lambdas; const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - auto lambdas = estimate_lambda(values, activations, n_per_row, ne2); - double acc = 0.0; - int ns = 0; - for (float l : lambdas) { acc += l; ++ns; } - tensor_lambda = ns ? 
(float)(acc / ns) : 0.0f; + if (!params->no_bias) { + double acc = 0.0; + int ns = 0; + lambdas = estimate_lambda(values, activations, n_per_row, ne2); + for (float l : lambdas) { acc += l; ++ns; } + tensor_lambda = ns ? (float)(acc / ns) : 0.0f; + } // Evaluate candidates std::vector eval_candidates(compatible_candidates.size()); @@ -1726,8 +1729,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } - const char* msg[] = {"no bias (MSE only)", "fast (default)", "precise (slow)"}; - LLAMA_LOG_INFO("using %s error estimation\n", msg[params->bpw_bias]); + LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no aligment bias)" : "aligment bias (default)"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { @@ -2038,7 +2040,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, - /*.bpw_bias =*/ 1 + /*.no_bias =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0fe65daea0..03018cc301 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,12 +117,12 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable); - printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); + printf(" [--target-bpw n] [--no-bias] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); - printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); - printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); - printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); + printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); + printf(" --leave-output-tensor: will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); + printf(" --pure: disable k-quant mixtures and quantize all tensors to the same type\n"); printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n"); printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); @@ -134,7 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --bpw_bias: type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow)\n"); + printf(" --no-bias: use mean square error estimation only (no alignment bias)\n"); + printf(" Advanced option to use MSE only and disable alignment bias error estimation\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -496,27 +497,6 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { return true; } -static bool parse_bpw_bias(const char * data, int & bpw_bias) { - if (!data) { - printf("\n%s: error bias type not provided\n\n", __func__); - return false; - } - - try { - bpw_bias = std::stoi(data); - if (bpw_bias < 0 || bpw_bias > 2) { - printf("\n%s: error bias type must be one of 0 (no bias, MSE only), 1 (fast), or 2 (precise, but slow)\n\n", __func__); - return false; - } - } - catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. 
Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); - return false; - } - - return true; -} - int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -531,7 +511,6 @@ int main(int argc, char ** argv) { std::vector tensor_types; std::vector prune_layers; float target_bpw = -1.0f; - int bpw_bias = 1; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -562,11 +541,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--bpw-bias") == 0) { - if (arg_idx == argc-1 || !parse_bpw_bias(argv[++arg_idx], bpw_bias)) { - usage(argv[0]); - } - params.bpw_bias = bpw_bias; + } else if (strcmp(argv[arg_idx], "--no-bias") == 0) { + params.no_bias = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From e8e2aed17a4ade7b14021e05f2a55f9b8f26510f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:41:44 +0100 Subject: [PATCH 070/155] Refactor row sampling --- src/llama-quant.cpp | 49 +++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9e7d9d295c..4a8c08e68f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1029,7 +1029,6 @@ static std::unordered_map target_bpw_type( const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - // Larger rows_sample_per_expert values may result in more accurate error estimates, but it will take much longer to compute const int rows_sample_per_expert = activations_data ? 
512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); @@ -1037,11 +1036,30 @@ static std::unordered_map target_bpw_type( std::vector rows_sample(ne2, 0); const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); const int64_t stride = std::max(1, nrows_total / rows_sample_max); - std::vector row_buffer(n_per_row); const ggml_type src_type = tensor->type; - const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); + const ggml_type_traits * src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); + + std::vector row_buffer(n_per_row); + auto row_to_fp32 = [&](const uint8_t * src, float * dst) { + if (src_type == GGML_TYPE_F32) { + std::memcpy(dst, src, sizeof(float) * (size_t)n_per_row); + } else if (src_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (src_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else if (src_is_quant) { + if (!src_traits || !src_traits->to_float) { + throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); + } + + src_traits->to_float(src, dst, (int)n_per_row); + } else { + throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + } + }; + for (int64_t slice = 0; slice < ne2; ++slice) { std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); int64_t current_sampled_rows = 0; @@ -1052,31 +1070,18 @@ static std::unordered_map target_bpw_type( } for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { + const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (src_type == GGML_TYPE_F32) { - const float * src_row = (const float *)tensor->data + slice * (n_per_row * nrows_total) + r * n_per_row; - f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); - } else if (src_type == GGML_TYPE_F16) { - const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); - } else if (src_type == GGML_TYPE_BF16) { - const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); - } else if (src_is_quant) { - const uint8_t * qrow = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; - if (!src_traits || !src_traits->to_float) { - throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); - } - src_traits->to_float(qrow, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + auto src_f32 = (const float *)src_row; + f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); } else { - throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + row_to_fp32(src_row, row_buffer.data()); + f32_sample.insert(f32_sample.end(), 
row_buffer.begin(), row_buffer.end()); } ++current_sampled_rows; } - + rows_sample[slice] = current_sampled_rows; } From bdefdb673c0d28b59c23d505307536b4f1724858 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:07 +0100 Subject: [PATCH 071/155] Refactor copy_or_broadcast() --- src/llama-quant.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4a8c08e68f..b1302df431 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1087,6 +1087,7 @@ static std::unordered_map target_bpw_type( auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { if (!m) { return {nullptr, 0}; } + const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); if (it == m->end()) { return {nullptr, 0}; } @@ -1095,22 +1096,27 @@ static std::unordered_map target_bpw_type( }; // Copy this row's side data (values and activations), or broadcasts to all slices - auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { - const size_t want = (size_t)ne2 * (size_t)n_per_row; + auto copy_or_broadcast = [&](const float * src, size_t src_sz, std::vector & dst) { dst.clear(); if (!src || src_sz == 0) { return; } + + const size_t want = (size_t)ne2 * (size_t)n_per_row; if (src_sz == want) { dst.resize(want); std::memcpy(dst.data(), src, want * sizeof(float)); - } else if (src_sz == (size_t)n_per_row) { + + return; + } + if (src_sz == (size_t)n_per_row) { dst.resize(want); for (int64_t s = 0; s < ne2; ++s) { std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } - } else { - LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", - func, name.c_str(), src_sz, (size_t)n_per_row, want); + + return; } + + LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want); }; const auto [values_all, values_sz] = side_data(values_data, name); From 6b8cedf3bcd2282e9f31b00026178d6bb393fc3e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:31 +0100 Subject: [PATCH 072/155] Refactor estimate_lambda() --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b1302df431..ebacf68806 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -975,30 +975,29 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector - { - std::vector lambdas(std::max(1, ne2), 0.0f); + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + const int64_t ns = std::max(1, ne2); + std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } - for (int64_t s = 0; s < std::max(1, ne2); ++s) { + for (int64_t s = 0; s < ns; ++s) { const float * v = values ? values + s * n_per_row : nullptr; const float * a = activations + s * n_per_row; double s1 = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { const double w = v ? 
std::max(0.0f, v[j]) : 1.0; - const double aw = std::sqrt(w) * a[j]; - const double aw2 = aw * aw; - s1 += aw2; - s2 += aw2 * aw2; + const double aw2 = std::sqrt(w) * a[j]; + const double z = aw2 * aw2; + s1 += z; + s2 += z * z; } float l = 0.0f; if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - double lambda = 8.0 * (c / (c + 1.0)); - l = (float)std::clamp(lambda, 0.0, 12.0); + l = (float) std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From c466c53808e566f5eb81a654c9f131064246cdaf Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:54 +0100 Subject: [PATCH 073/155] Refactor pareto pruning and convexification --- src/llama-quant.cpp | 93 +++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 50 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index ebacf68806..ab6601a8bf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1146,8 +1146,7 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", - __func__, ggml_type_name(ts_type), name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type), name.c_str()); continue; } @@ -1214,60 +1213,54 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } - // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. + // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve { - std::vector pruned; - pruned.reserve(info.candidate.size()); + auto & candidates = info.candidate; + if (!candidates.empty()) { + std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - // Sort by bytes ascending, error ascending - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - return a.error < b.error; - }); + return a.error < b.error; + }); - double best_err = infinity; - size_t last_bytes = std::numeric_limits::max(); - for (const auto & c : info.candidate) { - // Only keep the best error seen so far at strictly larger byte sizes - if (c.bytes != last_bytes) { - // first time we see this byte size - last_bytes = c.bytes; - if (c.error < best_err) { - pruned.push_back(c); - best_err = c.error; + std::vector pareto; + pareto.reserve(candidates.size()); + double best_err = infinity; + size_t last_bytes = std::numeric_limits::max(); + for (const auto & c : candidates) { + if (c.bytes != last_bytes) { + last_bytes = c.bytes; + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); + } } - } else { - // same bytes: we already sorted by error; skip - } - } - - info.candidate.swap(pruned); - } - - // Enforce convexity in (bytes, error) curve - { - const auto & c = info.candidate; - if (c.size() >= 3) { - std::vector convex; - convex.reserve(c.size()); - auto slope = [](const candidate_types & a, const candidate_types & b) -> double { - const double dx = (double)b.bytes - (double)a.bytes; - if (dx <= 0.0) { return infinity; } - - return ((double)b.error - 
(double)a.error) / dx; - }; - - for (const auto & p : c) { - while (convex.size() >= 2) { - double s1 = slope(convex[convex.size() - 2], convex[convex.size() - 1]); - double s2 = slope(convex[convex.size() - 1], p); - if (s2 + epsilon < s1) { convex.pop_back(); } - else { break; } - } - convex.push_back(p); } - info.candidate.swap(convex); + candidates.swap(pareto); + + if (candidates.size() >= 3) { + std::vector hull; + hull.reserve(candidates.size()); + auto slope = [](const candidate_types & a, const candidate_types & b) { + const double dx = b.bytes - a.bytes; + + return dx <= 0.0 ? infinity : (b.error - a.error) / dx; + }; + + for (const auto & p : candidates) { + while (hull.size() >= 2) { + double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); + double s2 = slope(hull[hull.size() - 1], p); + if (s2 + epsilon < s1) { hull.pop_back(); } + else { break; } + } + + hull.push_back(p); + } + + candidates.swap(hull); + } } } From b433fd95472c39c4974892aa9100e3cdc7b9c63d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:43:09 +0100 Subject: [PATCH 074/155] Refactor last budget pass --- src/llama-quant.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index ab6601a8bf..e062b2dc6a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1433,19 +1433,16 @@ static std::unordered_map target_bpw_type( double best_ratio = -1.0; size_t best_delta = 0; for (int i = 0; i < (int)all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { - continue; - } - + const auto &ti = all[i]; int j = ti.choice + 1; + // skip same-bytes entries while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; if (cur_bytes + delta > budget_bytes) { continue; } - double err_gain = std::max(0.0, (double)ti.candidate[ti.choice].error - (double)ti.candidate[j].error); + double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); double ratio = err_gain / (double)(delta * 8); if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { best_ratio = ratio; @@ -1454,7 +1451,6 @@ static std::unordered_map target_bpw_type( best_j = j; } } - if (best_i < 0) { break; } all[best_i].choice = best_j; cur_bytes += best_delta; From b6c008fd8a12a9b1970c4810585cbd540bf0737e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:04:13 +0100 Subject: [PATCH 075/155] Refactor helper lambdas --- src/llama-quant.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e062b2dc6a..d31552ea23 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -665,28 +665,23 @@ static std::unordered_map target_bpw_type( auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); - const int64_t nrows = ggml_nrows(t); - return (size_t)nrows * row_sz; + return (size_t)ggml_nrows(t) * row_sz; }; auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { - const int64_t nelem = ggml_nelements(t); const size_t bytes = tensor_bytes(t, typ); - return (double)bytes * 8.0 / (double)nelem; + return (double)bytes * 8.0 / (double)ggml_nelements(t); }; auto is_compatible = 
[&](const ggml_tensor * t, const ggml_type typ) -> bool { - const int64_t n_per_row = t->ne[0]; const int64_t blck = ggml_blck_size(typ); - if (blck <= 1) { return true; } - return n_per_row % blck == 0; + return blck <= 1 || (t->ne[0] % blck) == 0; }; auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) { return typ; } + if (is_compatible(t, typ)) return typ; ggml_type fb = fallback_type(typ); - if (is_compatible(t, fb)) { return fb; } - return GGML_TYPE_F16; + return is_compatible(t, fb) ? fb : GGML_TYPE_F16; }; auto name_tn = LLM_TN(model.arch); @@ -1080,7 +1075,7 @@ static std::unordered_map target_bpw_type( ++current_sampled_rows; } - + rows_sample[slice] = current_sampled_rows; } From 7386d4eadd64006ac7f0fbc992d7d4bcb195bd6c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:18:26 +0100 Subject: [PATCH 076/155] Refactor row sampling --- src/llama-quant.cpp | 87 +++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d31552ea23..f2dab6a898 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1019,64 +1019,73 @@ static std::unordered_map target_bpw_type( ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample + const int rows_sample_per_expert = activations_data ? 512 : 256; const int64_t n_per_row = tensor->ne[0]; const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - - const int rows_sample_per_expert = activations_data ? 512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); - std::vector rows_sample(ne2, 0); - const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); - const int64_t stride = std::max(1, nrows_total / rows_sample_max); const ggml_type src_type = tensor->type; const ggml_type_traits * src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); - std::vector row_buffer(n_per_row); + // Convert a single row to fp32 auto row_to_fp32 = [&](const uint8_t * src, float * dst) { - if (src_type == GGML_TYPE_F32) { + const ggml_type t = src_type; + if (t == GGML_TYPE_F32) { std::memcpy(dst, src, sizeof(float) * (size_t)n_per_row); - } else if (src_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (src_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - } else if (src_is_quant) { - if (!src_traits || !src_traits->to_float) { - throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); - } - - src_traits->to_float(src, dst, (int)n_per_row); - } else { - throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + return; } + if (t == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); + return; + } + if (t == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); + return; + } + + if (src_is_quant) { + GGML_ASSERT(src_traits && src_traits->to_float); + src_traits->to_float(src, dst, (int) n_per_row); + return; + } + + throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(t))); }; - for (int64_t slice = 0; slice < ne2; 
++slice) { - std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); - int64_t current_sampled_rows = 0; - int64_t offset = 0; - if (stride > 1) { - std::uniform_int_distribution dist(0, stride - 1); - offset = dist(rng); - } - - for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { - const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; - if (src_type == GGML_TYPE_F32) { - auto src_f32 = (const float *)src_row; - f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); - } else { - row_to_fp32(src_row, row_buffer.data()); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + // Sample rows randomly per slice + { + f32_sample.clear(); + std::vector row_buffer(n_per_row); + for (int64_t slice = 0; slice < ne2; ++slice) { + std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); + const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); + const int64_t stride = std::max(1, nrows_total / rows_sample_max); + int64_t offset = 0; + if (stride > 1) { + std::uniform_int_distribution dist(0, stride - 1); + offset = dist(rng); } - ++current_sampled_rows; - } + int64_t current = 0; + for (int64_t r = offset; r < nrows_total && current < rows_sample_max; r += stride) { + const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + if (src_type == GGML_TYPE_F32) { + auto src_f32 = (const float *)src_row; + f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); + } else { + row_to_fp32(src_row, row_buffer.data()); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + } - rows_sample[slice] = current_sampled_rows; + ++current; + } + + rows_sample[slice] = current; + } } auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { From 08146fd67f5ec6b93e2406340afaaa5aa336596a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:03 +0100 Subject: [PATCH 077/155] Refactor side_data() and copy_or_broadcast() --- src/llama-quant.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f2dab6a898..b8eb12690e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1088,14 +1088,12 @@ static std::unordered_map target_bpw_type( } } - auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { - if (!m) { return {nullptr, 0}; } + auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) { + if (!m) { return std::pair{nullptr, 0}; } const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); - if (it == m->end()) { return {nullptr, 0}; } - - return { it->second.data(), it->second.size() }; + return it == m->end() ? 
std::pair{nullptr, 0} : std::pair{ it->second.data(), it->second.size() }; }; // Copy this row's side data (values and activations), or broadcasts to all slices @@ -1105,9 +1103,7 @@ static std::unordered_map target_bpw_type( const size_t want = (size_t)ne2 * (size_t)n_per_row; if (src_sz == want) { - dst.resize(want); - std::memcpy(dst.data(), src, want * sizeof(float)); - + dst.assign(src, src + want); return; } if (src_sz == (size_t)n_per_row) { @@ -1115,7 +1111,6 @@ static std::unordered_map target_bpw_type( for (int64_t s = 0; s < ne2; ++s) { std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } - return; } From 17be7615ce070af61cd1a0f80b38947c3fea5709 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:28 +0100 Subject: [PATCH 078/155] Refactor candidate types build --- src/llama-quant.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b8eb12690e..beac311d50 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1133,19 +1133,17 @@ static std::unordered_map target_bpw_type( size_t total_sampled_rows = f32_sample.size() / n_per_row; // Build list of candidate types first (compatible ones) + const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; + size_t max_row_sz = 0; const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); - - size_t max_row_sz = 0; - const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; - std::vector compatible_candidates; compatible_candidates.reserve(base_sz); for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type), name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str()); continue; } From b09662f86aefb5750842c9d68dac42db9054e90c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:49 +0100 Subject: [PATCH 079/155] Refactor estimate_lambda() --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index beac311d50..63779ded48 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -982,8 +982,8 @@ static std::unordered_map target_bpw_type( double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { const double w = v ? 
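// Here z_j = w_j * a_j^2, so s1 is the slice's total weighted activation energy and
// s2 the sum of its squares. The ratio s2 / s1^2 ranges from 1/n (energy spread
// evenly across channels) to 1 (a single channel dominates), so c acts as a rough
// concentration measure: lambda stays near 0 for flat activation profiles and moves
// toward the cap when a few channels carry most of the energy, which gives the
// alignment-bias term more weight for exactly those tensors.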
std::max(0.0f, v[j]) : 1.0; - const double aw2 = std::sqrt(w) * a[j]; - const double z = aw2 * aw2; + const double aw = std::sqrt(w) * a[j]; + const double z = aw * aw; s1 += z; s2 += z * z; } @@ -992,7 +992,7 @@ static std::unordered_map target_bpw_type( if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float) std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); + l = (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From a7ee915e19d9acd7a1187ba7d8d772d3a52a8f0d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:20:06 +0100 Subject: [PATCH 080/155] Refactor trimmed_sum() --- src/llama-quant.cpp | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 63779ded48..67de29df87 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -920,26 +920,15 @@ static std::unordered_map target_bpw_type( // Trimmed sum to avoid outlier rows dominating the results auto trimmed_sum = [&](std::vector & v) -> double { - if (v.empty()) { return 0.0; } - const int64_t n = (int64_t)v.size(); - if (n < 50) { - double s = 0.0; - for (const double z : v) { s += z; } - - return s; - } + if (n == 0) { return 0.0; } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side - k = std::max(0, std::min(k, n / 32)); // cap at ~3.125% + k = std::clamp(k, 0, n / 32); // cap at ~3.125% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); - double s = 0.0; - for (int64_t i = k; i < n - k; ++i) { - s += v[i]; - } - - return s; + return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; const double scale_rows = (double)nrows / std::max(1.0, (double)rs); From 1a3e9ea4c88c40b7fea3a94ff45522531f31f005 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:21:00 +0100 Subject: [PATCH 081/155] Refactor estimate_error() --- src/llama-quant.cpp | 191 ++++++++++++++++++++------------------------ 1 file changed, 85 insertions(+), 106 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 67de29df87..b3e4b3cbf7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -737,12 +737,12 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t sample_element_count = f32_sample.size(); - const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; - if (sample_row_count == 0) { + const size_t sample_elems = f32_sample.size(); + const size_t sample_rows = n_per_row > 0 ? 
sample_elems / (size_t)n_per_row : 0; + + if (sample_rows == 0) { if (out_mse) { *out_mse = 0.0; } if (out_proj) { *out_proj = 0.0; } - return 0.0; } @@ -751,105 +751,102 @@ static std::unordered_map target_bpw_type( expected_rows += (size_t)rows_sample[s]; } - if (expected_rows != sample_row_count) { + if (expected_rows != sample_rows) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } const size_t row_sz = ggml_row_size(quant_type, n_per_row); - const size_t buffer_sz = row_sz * sample_row_count; + const size_t buf_sz = row_sz * sample_rows; - if (quantized_buffer.size() < buffer_sz) { quantized_buffer.resize(buffer_sz); } - if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } + if (quantized_buffer.size() < buf_sz) { quantized_buffer.resize(buf_sz); } + if (dequantized_buffer.size() < sample_elems) { dequantized_buffer.resize(sample_elems); } const bool has_values = values_sample != nullptr; const bool has_activations = activations_sample != nullptr; // Bias denominators per slice - std::vector bias_denominator_per_slice(ne2, 0.0); + std::vector bias_denom(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { - const float * values = has_values ? values_sample + s * n_per_row : nullptr; - const float * activations = activations_sample + s * n_per_row; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + const float * a = activations_sample + s * n_per_row; double denom = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { - const double w = values ? std::max(0.0f, values[j]) : 1.0; - const double a = activations[j]; - denom += w * a * a; + const double w = v ? std::max(0.0f, v[j]) : 1.0; + const double aj = a[j]; + denom += w * aj * aj; } - bias_denominator_per_slice[s] = denom; + bias_denom[s] = denom; } } - // Weighted per-row squared norms - std::vector row_sq_norm(sample_row_count, 0.0); + // Row squared norms (weighted if values present) + std::vector row_sq_norm(sample_rows, 0.0); { - size_t offset = 0; - size_t row_idx = 0; + size_t off = 0; + size_t ridx = 0; for (int64_t s = 0; s < ne2; ++s) { const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * values = has_values ? values_sample + s * n_per_row : nullptr; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + offset; - double rsn = 0.0; - if (values) { + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + for (int64_t r = 0; r < rs; ++r, ++ridx) { + const float * x = f32_sample.data() + off; + double sum = 0.0; + if (v) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); + const double w = std::max(0.0f, v[j]); const double xx = x[j]; - rsn += w * xx * xx; + sum += w * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double xx = x[j]; - rsn += xx * xx; + sum += xx * xx; } } - row_sq_norm[row_idx] = rsn; - offset += (size_t)n_per_row; + + row_sq_norm[ridx] = sum; + off += (size_t)n_per_row; } } } - // Quantize sampled rows per slice -> quantized_buffer + // Quantize per slice into quantized_buffer { - size_t q_offset = 0; - size_t f_offset = 0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = rows_sample[slice]; + size_t qoff = 0; + size_t foff = 0; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * value = has_values ? 
values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - q_offset += row_sz * (size_t)rs; - f_offset += (size_t)rs * (size_t)n_per_row; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + foff, quantized_buffer.data() + qoff, 0, rs, n_per_row, v); + qoff += row_sz * (size_t)rs; + foff += (size_t)rs * (size_t)n_per_row; } } - // quantized_buffer -> dequantized_buffer + // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - const bool is_fp16 = quant_type == GGML_TYPE_F16; - const bool is_bf16 = quant_type == GGML_TYPE_BF16; - if (!is_fp16 && !is_bf16 && traits && traits->to_float) { - traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_row_count * (size_t)n_per_row)); + if (traits && traits->to_float && quant_type != GGML_TYPE_F16 && quant_type != GGML_TYPE_BF16) { + traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_rows * (size_t)n_per_row)); } else { - for (size_t r = 0; r < sample_row_count; ++r) { - uint8_t * src = quantized_buffer.data() + r * row_sz; + for (size_t r = 0; r < sample_rows; ++r) { + const uint8_t * src = quantized_buffer.data() + r * row_sz; float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - if (is_fp16) { + if (quant_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (is_bf16) { + } else if (quant_type == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); } else { if (!traits || !traits->to_float) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } traits->to_float(src, dst, (int)n_per_row); @@ -858,94 +855,77 @@ static std::unordered_map target_bpw_type( } } - // Compute error - size_t offset = 0; - size_t row_idx = 0; + // Compute error per slice with trimmed aggregation + auto trimmed_sum = [&](std::vector & v) -> double { + const int64_t n = (int64_t)v.size(); + if (n == 0) { return 0.0; } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } + int64_t k = (int64_t) std::floor(0.02 * (double) n); // trim 2% on each side + k = std::clamp(k, 0, n / 32); // but no more than ~3% + std::nth_element(v.begin(), v.begin() + k, v.end()); + std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); + }; + + size_t off = 0; + size_t ridx = 0; double total_mse = 0.0; double total_proj = 0.0; double total_bias = 0.0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = rows_sample[slice]; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * values = has_values ? values_sample + slice * n_per_row : nullptr; - const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; - const double bias_denom = has_activations ? bias_denominator_per_slice[slice] : 0.0; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + const float * a = has_activations ? activations_sample + s * n_per_row : nullptr; + const double denom_bias = has_activations ? 
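// For each sampled row, m_norm below is the imatrix-weighted MSE normalized by the
// row's weighted squared norm, and proj is (sum_j w_j e_j a_j)^2 / (sum_j w_j a_j^2),
// i.e. the squared projection of the quantization error e onto the activation
// direction under the weighted inner product. In effect it isolates the part of the
// error that shifts the layer's output in a consistent direction rather than
// averaging out, which is the part the lambda-scaled bias term penalizes.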
bias_denom[s] : 0.0; std::vector row_mse_norm; - std::vector row_proj_norm; row_mse_norm.reserve(rs); - if (activations) { row_proj_norm.reserve(rs); } + std::vector row_proj_norm; + if (a) { row_proj_norm.reserve(rs); } - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + offset; - const float * y = dequantized_buffer.data() + offset; - double weighted_mse = 0.0; + for (int64_t r = 0; r < rs; ++r, ++ridx) { + const float * x = f32_sample.data() + off; + const float * y = dequantized_buffer.data() + off; + double w_mse = 0.0; double bias_num = 0.0; - if (values && activations) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); - const double e = y[j] - x[j]; - const double a = activations[j]; - weighted_mse += w * e * e; - bias_num += w * e * a; - } - } else if (values) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); - const double e = y[j] - x[j]; - weighted_mse += w * e * e; - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = y[j] - x[j]; - weighted_mse += e * e; - } + for (int64_t j = 0; j < n_per_row; ++j) { + const double wj = v ? std::max(0.0f, v[j]) : 1.0; + const double e = y[j] - x[j]; + w_mse += wj * e * e; + if (a) { bias_num += wj * e * a[j]; } } - const double denom_x = row_sq_norm[row_idx]; - double m_norm = weighted_mse / (denom_x + epsilon); + const double denom_x = row_sq_norm[ridx]; + const double m_norm = w_mse / (denom_x + epsilon); row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); - if (activations) { + if (a) { double p_norm = 0.0; - if (bias_denom > 0.0) { - const double proj = bias_num * bias_num / (bias_denom + epsilon); + if (denom_bias > 0.0) { + const double proj = bias_num * bias_num / (denom_bias + epsilon); p_norm = std::isfinite(proj) ? proj : 0.0; } + row_proj_norm.push_back(p_norm); } - offset += (size_t)n_per_row; + off += (size_t)n_per_row; } - // Trimmed sum to avoid outlier rows dominating the results - auto trimmed_sum = [&](std::vector & v) -> double { - const int64_t n = (int64_t)v.size(); - if (n == 0) { return 0.0; } - if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } - - int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side - k = std::clamp(k, 0, n / 32); // cap at ~3.125% - std::nth_element(v.begin(), v.begin() + k, v.end()); - std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); - return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); - }; - const double scale_rows = (double)nrows / std::max(1.0, (double)rs); const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows; - const double slice_proj = activations ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; + const double slice_proj = a ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; total_mse += slice_mse; total_proj += slice_proj; - // per-slice lambda if provided, otherwise use scalar - const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[slice]) : (double)tensor_bias_lambda; + const double bl = slice_bias_lambda ? 
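// The trimmed sum above drops roughly the extreme 2% of rows at each tail so a few
// outlier rows cannot dominate a slice's error, and scale_rows extrapolates the
// sampled-row total back to the tensor's full row count so tensors sampled at
// different rates remain comparable when the per-tensor errors are summed later.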
(double)std::max(0.0f, slice_bias_lambda[s]) : (double)tensor_bias_lambda; total_bias += bl * slice_proj; if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } } @@ -954,7 +934,6 @@ static std::unordered_map target_bpw_type( if (out_proj) { *out_proj = total_proj; } const double total_err = slice_bias_lambda ? total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; - return std::isfinite(total_err) ? total_err : infinity; }; From 9a1656eb975fa9f1024a8de029e22a762e49719b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:21:35 +0100 Subject: [PATCH 082/155] Refactor pareto optimise and convexify --- src/llama-quant.cpp | 86 ++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b3e4b3cbf7..751a26c63a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1179,55 +1179,53 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - { - auto & candidates = info.candidate; - if (!candidates.empty()) { - std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + auto pareto_convex = [](std::vector & candidates) { + if (candidates.empty()) return; - return a.error < b.error; - }); + std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + return a.error < b.error; + }); - std::vector pareto; - pareto.reserve(candidates.size()); - double best_err = infinity; - size_t last_bytes = std::numeric_limits::max(); - for (const auto & c : candidates) { - if (c.bytes != last_bytes) { - last_bytes = c.bytes; - if (c.error < best_err) { - best_err = c.error; - pareto.push_back(c); - } + // Pareto by bytes -> error + std::vector pareto; + pareto.reserve(candidates.size()); + double best_err = std::numeric_limits::infinity(); + size_t last_b = std::numeric_limits::max(); + for (const auto & c : candidates) { + if (c.bytes != last_b) { + last_b = c.bytes; + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); } } - - candidates.swap(pareto); - - if (candidates.size() >= 3) { - std::vector hull; - hull.reserve(candidates.size()); - auto slope = [](const candidate_types & a, const candidate_types & b) { - const double dx = b.bytes - a.bytes; - - return dx <= 0.0 ? infinity : (b.error - a.error) / dx; - }; - - for (const auto & p : candidates) { - while (hull.size() >= 2) { - double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); - double s2 = slope(hull[hull.size() - 1], p); - if (s2 + epsilon < s1) { hull.pop_back(); } - else { break; } - } - - hull.push_back(p); - } - - candidates.swap(hull); - } } - } + + candidates.swap(pareto); + if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull + + // Convex hull (lower envelope) + auto slope = [](const candidate_types & a, const candidate_types & b) { + const double dx = b.bytes - a.bytes; + return dx <= 0.0 ? 
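// Keeping only the lower convex hull is lossless for the later selection step: for
// any penalty mu >= 0, the candidate minimizing error + mu * bits lies on the lower
// envelope of the (bytes, error) points, so dominated or non-convex candidates can
// never be chosen and pruning them merely shortens each tensor's candidate list.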
infinity : (b.error - a.error) / dx; + }; + + std::vector hull; hull.reserve(candidates.size()); + for (const auto & p : candidates) { + while (hull.size() >= 2) { + const double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); + const double s2 = slope(hull[hull.size() - 1], p); + if (s2 + epsilon < s1) hull.pop_back(); + else { break; } + } + + hull.push_back(p); + } + candidates.swap(hull); + }; + + pareto_convex(info.candidate); // Initialize choice at the smallest bpw candidate info.choice = 0; From 0d5f18303e25e6b4e4dc21f963ca6672b9b12d0f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:22:00 +0100 Subject: [PATCH 083/155] Refactor lagrange_penalty() --- src/llama-quant.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 751a26c63a..204fbfecad 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1288,21 +1288,21 @@ static std::unordered_map target_bpw_type( bytes = 0; err = 0.0; for (size_t i = 0; i < all.size(); ++i) { - const auto & cand = all[i].candidate; + const auto & candidate = all[i].candidate; int best_j = 0; double best_val = infinity; - for (int j = 0; j < (int)cand.size(); ++j) { - const double bits = (double)cand[j].bytes * 8.0; - const double val = cand[j].error + mu * bits; - if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && cand[j].bytes < cand[best_j].bytes)) { + for (int j = 0; j < (int)candidate.size(); ++j) { + const double bits = (double)candidate[j].bytes * 8.0; + const double val = candidate[j].error + mu * bits; + if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && candidate[j].bytes < candidate[best_j].bytes)) { best_val = val; best_j = j; } } choice[i] = best_j; - bytes += cand[best_j].bytes; - err += cand[best_j].error; + bytes += candidate[best_j].bytes; + err += candidate[best_j].error; } }; From 814f6b66be4b5ebbe286201eafe8361a37d39a98 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:45:09 +0100 Subject: [PATCH 084/155] Minor general refactoring --- src/llama-quant.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 204fbfecad..93b5fb0eba 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -860,7 +860,8 @@ static std::unordered_map target_bpw_type( const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } - int64_t k = (int64_t) std::floor(0.02 * (double) n); // trim 2% on each side + + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side k = std::clamp(k, 0, n / 32); // but no more than ~3% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); @@ -1190,7 +1191,7 @@ static std::unordered_map target_bpw_type( // Pareto by bytes -> error std::vector pareto; pareto.reserve(candidates.size()); - double best_err = std::numeric_limits::infinity(); + double best_err = infinity; size_t last_b = std::numeric_limits::max(); for (const auto & c : candidates) { if (c.bytes != last_b) { @@ -1273,12 +1274,10 @@ static std::unordered_map target_bpw_type( if (budget_bytes <= min_bytes) { for (auto & ti : all) { ti.choice = 0; } - return emit_overrides(); } if (budget_bytes >= max_bytes) { for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; } - return emit_overrides(); } @@ -1327,14 +1326,10 @@ static std::unordered_map 
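// For reference, a minimal self-contained sketch of the selection scheme that
// lagrange_penalty() and the mu search in the surrounding hunks implement: per
// tensor, pick the candidate minimizing error + mu * bits, then grow and bisect mu
// until the total size fits the byte budget. Names, types and the simplified
// bisection below are illustrative only and assume every tensor has at least one
// candidate; they are not part of the patch.
#include <cstddef>
#include <vector>

struct bpw_cand { std::size_t bytes; double error; };

// Choose the best candidate per tensor for a given penalty mu; returns total bytes.
static std::size_t pick_for_mu(const std::vector<std::vector<bpw_cand>> & cands, double mu, std::vector<int> & choice) {
    std::size_t total = 0;
    for (std::size_t i = 0; i < cands.size(); ++i) {
        int best = 0;
        double best_val = cands[i][0].error + mu * 8.0 * (double) cands[i][0].bytes;
        for (int j = 1; j < (int) cands[i].size(); ++j) {
            const double val = cands[i][j].error + mu * 8.0 * (double) cands[i][j].bytes;
            if (val < best_val) { best_val = val; best = j; }
        }
        choice[i] = best;
        total += cands[i][best].bytes;
    }
    return total;
}

// Bisect mu so the chosen mix fits budget_bytes (a larger mu favours smaller types).
static std::vector<int> solve_budget(const std::vector<std::vector<bpw_cand>> & cands, std::size_t budget_bytes) {
    std::vector<int> choice(cands.size(), 0);
    double lo = 0.0;
    double hi = 1.0;
    while (pick_for_mu(cands, hi, choice) > budget_bytes && hi < 1e30) { hi *= 2.0; } // expand until feasible
    for (int it = 0; it < 40; ++it) {
        const double mid = 0.5 * (lo + hi);
        if (pick_for_mu(cands, mid, choice) > budget_bytes) { lo = mid; } else { hi = mid; }
    }
    pick_for_mu(cands, hi, choice); // hi is the smallest tested mu that stays within budget
    return choice;
}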
target_bpw_type( int expand = 0; while (true) { lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); - if (bytes_hi <= budget_bytes) { - break; - } + if (bytes_hi <= budget_bytes) { break; } mu_hi *= 2.0; - if (++expand > 60) { - break; - } + if (++expand > 60) { break; } // safety cap } } From e92db008bc848b109f2931162a69c7010f675b70 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 17:20:48 +0100 Subject: [PATCH 085/155] Refactor quantisation checks into its own function --- src/llama-quant.cpp | 140 ++++++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 83 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 93b5fb0eba..3544653a56 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -21,6 +21,60 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; +static bool is_quantizable(const std::string & name, const llm_arch arch, const llama_model_quantize_params * params) { + if (params->only_copy) { return false; } + + const auto tn = LLM_TN(arch); + + // This used to be a regex, but has an extreme cost to compile times. + bool q = name.size() >= 6 && name.rfind("weight") == name.size() - 6; // ends with 'weight'? + + // Do not quantize norm tensors + q &= name.find("_norm.weight") == std::string::npos; + + // Do not quantize expert gating tensors + // NOTE: can't use LLM_TN here because the layer number is not known + q &= name.find("ffn_gate_inp.weight") == std::string::npos; + + // These are very small (e.g. 4x4) + q &= name.find("altup") == std::string::npos; + q &= name.find("laurel") == std::string::npos; + + // These are not too big so keep them as it is + q &= name.find("per_layer_model_proj") == std::string::npos; + + // Do not quantize positional embeddings and token types (BERT) + q &= name != tn(LLM_TENSOR_POS_EMBD, "weight"); + q &= name != tn(LLM_TENSOR_TOKEN_TYPES, "weight"); + + // Do not quantize Jamba, Mamba, LFM2's small yet 2D weights + // NOTE: can't use LLM_TN here because the layer number is not known + q &= name.find("ssm_conv1d.weight") == std::string::npos; + q &= name.find("shortconv.conv.weight") == std::string::npos; + + // Do not quantize ARWKV, RWKV's small yet 2D weights + q &= name.find("time_mix_first.weight") == std::string::npos; + q &= name.find("time_mix_w0.weight") == std::string::npos; + q &= name.find("time_mix_w1.weight") == std::string::npos; + q &= name.find("time_mix_w2.weight") == std::string::npos; + q &= name.find("time_mix_v0.weight") == std::string::npos; + q &= name.find("time_mix_v1.weight") == std::string::npos; + q &= name.find("time_mix_v2.weight") == std::string::npos; + q &= name.find("time_mix_a0.weight") == std::string::npos; + q &= name.find("time_mix_a1.weight") == std::string::npos; + q &= name.find("time_mix_a2.weight") == std::string::npos; + q &= name.find("time_mix_g1.weight") == std::string::npos; + q &= name.find("time_mix_g2.weight") == std::string::npos; + q &= name.find("time_mix_decay_w1.weight") == std::string::npos; + q &= name.find("time_mix_decay_w2.weight") == std::string::npos; + q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + + // Do not quantize relative position bias (T5) + q &= name.find("attn_rel_b.weight") == std::string::npos; + + return q; +} + static bool is_iq(const enum ggml_type t) { switch (t) { case GGML_TYPE_IQ1_S: @@ -684,40 +738,9 @@ static std::unordered_map target_bpw_type( return is_compatible(t, fb) ? 
fb : GGML_TYPE_F16; }; - auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { - // This list should be kept in sync with llama_tensor_quantize_impl() to avoid drift - const std::string name = ggml_get_name(t); - bool q = name.rfind("weight") == name.size() - 6; - q &= ggml_n_dims(t) >= 2; - q &= name.find("_norm.weight") == std::string::npos; - q &= name.find("ffn_gate_inp.weight") == std::string::npos; - q &= name.find("altup") == std::string::npos; - q &= name.find("laurel") == std::string::npos; - q &= name.find("per_layer_model_proj") == std::string::npos; - q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight"); - q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight"); - q &= name.find("ssm_conv1d.weight") == std::string::npos; - q &= name.find("shortconv.conv.weight") == std::string::npos; - q &= name.find("time_mix_first.weight") == std::string::npos; - q &= name.find("time_mix_w0.weight") == std::string::npos; - q &= name.find("time_mix_w1.weight") == std::string::npos; - q &= name.find("time_mix_w2.weight") == std::string::npos; - q &= name.find("time_mix_v0.weight") == std::string::npos; - q &= name.find("time_mix_v1.weight") == std::string::npos; - q &= name.find("time_mix_v2.weight") == std::string::npos; - q &= name.find("time_mix_a0.weight") == std::string::npos; - q &= name.find("time_mix_a1.weight") == std::string::npos; - q &= name.find("time_mix_a2.weight") == std::string::npos; - q &= name.find("time_mix_g1.weight") == std::string::npos; - q &= name.find("time_mix_g2.weight") == std::string::npos; - q &= name.find("time_mix_decay_w1.weight") == std::string::npos; - q &= name.find("time_mix_decay_w2.weight") == std::string::npos; - q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= !params->only_copy; - - return q; + if (ggml_n_dims(t) < 2) { return false; } + return is_quantizable(ggml_get_name(t), model.arch, params); }; // Estimate error for a given type using a sampled subset of rows @@ -1747,57 +1770,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", ++idx, ml.n_tensors, ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type)); - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? - - // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); - - // do not quantize norm tensors - quantize &= name.find("_norm.weight") == std::string::npos; - + bool quantize = ggml_n_dims(tensor) >= 2 && is_quantizable(name, model.arch, params); quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= !params->only_copy; - - // do not quantize expert gating tensors - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; - - // these are very small (e.g. 
4x4) - quantize &= name.find("altup") == std::string::npos; - quantize &= name.find("laurel") == std::string::npos; - - // these are not too big so keep them as it is - quantize &= name.find("per_layer_model_proj") == std::string::npos; - - // do not quantize positional embeddings and token types (BERT) - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - - // do not quantize Mamba's small yet 2D weights - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d.weight") == std::string::npos; - quantize &= name.find("shortconv.conv.weight") == std::string::npos; - - // do not quantize RWKV's small yet 2D weights - quantize &= name.find("time_mix_first.weight") == std::string::npos; - quantize &= name.find("time_mix_w0.weight") == std::string::npos; - quantize &= name.find("time_mix_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_v0.weight") == std::string::npos; - quantize &= name.find("time_mix_v1.weight") == std::string::npos; - quantize &= name.find("time_mix_v2.weight") == std::string::npos; - quantize &= name.find("time_mix_a0.weight") == std::string::npos; - quantize &= name.find("time_mix_a1.weight") == std::string::npos; - quantize &= name.find("time_mix_a2.weight") == std::string::npos; - quantize &= name.find("time_mix_g1.weight") == std::string::npos; - quantize &= name.find("time_mix_g2.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - - // do not quantize relative position bias (T5) - quantize &= name.find("attn_rel_b.weight") == std::string::npos; ggml_type new_type; void * new_data; From fecc472c6175bc65217d6f29855acf81477a5125 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 17:26:38 +0100 Subject: [PATCH 086/155] Fix typos in variable names --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3544653a56..8a709ddfdd 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1165,7 +1165,7 @@ static std::unordered_map target_bpw_type( // Evaluate candidates std::vector eval_candidates(compatible_candidates.size()); std::vector quantized_buffer(max_row_sz * total_sampled_rows); - std::vector dequantised_buffer(f32_sample.size()); + std::vector dequantized_buffer(f32_sample.size()); const float * slice_lambda = lambdas.empty() ? 
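// Candidate evaluation is parallel across quant types: each worker pulls the next
// candidate index from the shared atomic counter and keeps its own thread-local
// quantize/dequantize scratch buffers, so no locking is needed and every result is
// written to its own slot in eval_candidates.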
nullptr : lambdas.data(); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; @@ -1175,7 +1175,7 @@ static std::unordered_map target_bpw_type( eval_workers.emplace_back([&] { // thread-local scratch std::vector tl_quantized_buffer(quantized_buffer.size()); - std::vector tl_dequantised_buffer(dequantised_buffer.size()); + std::vector tl_dequantized_buffer(dequantized_buffer.size()); for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); if (i >= compatible_candidates.size()) { break; } @@ -1184,7 +1184,7 @@ static std::unordered_map target_bpw_type( const auto bpw = (float)tensor_bpw(tensor, tensor_types); const size_t bytes = tensor_bytes(tensor, tensor_types); const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, - tl_quantized_buffer, tl_dequantised_buffer, tensor_lambda, slice_lambda); + tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda); eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err }; } }); From 896cdc21217ab4d0b2bcb8b18938d3c0efc94dc1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 22:03:36 +0100 Subject: [PATCH 087/155] Refactor potential overflow --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8a709ddfdd..52d7984e2a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1002,7 +1002,7 @@ static std::unordered_map target_bpw_type( const std::string name = ggml_get_name(tensor); if (!can_quantize(tensor)) { continue; } - LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(tensor)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor)); if (!ml.use_mmap) { if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); } tensor->data = buffer.data(); From b748a1efa7dd0ab0d4064574530b4b045b27bbfc Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 22:03:54 +0100 Subject: [PATCH 088/155] Fix typo --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 52d7984e2a..2652f5c86e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1709,7 +1709,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } - LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no aligment bias)" : "aligment bias (default)"); + LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? 
"MSE only (no alignment bias)" : "alignment bias (default)"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { From c855094dff509c97f6cc268e28f123262e67b6f7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:09:11 +0100 Subject: [PATCH 089/155] Exit loop if no better solution found --- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2652f5c86e..8ee052a8e5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1347,9 +1347,12 @@ static std::unordered_map target_bpw_type( // increase mu until we get under budget or hit a safety cap { int expand = 0; + size_t prev_bytes_hi = std::numeric_limits::max(); while (true) { lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); if (bytes_hi <= budget_bytes) { break; } + if (bytes_hi >= prev_bytes_hi) { break; } + prev_bytes_hi = bytes_hi; mu_hi *= 2.0; if (++expand > 60) { break; } // safety cap From 1fbc59f867b283d1f66a87a8b1f45d265cf69fca Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:10:10 +0100 Subject: [PATCH 090/155] Replace slope with cross product --- src/llama-quant.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8ee052a8e5..0b2f15f0a6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1230,22 +1230,27 @@ static std::unordered_map target_bpw_type( if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull // Convex hull (lower envelope) - auto slope = [](const candidate_types & a, const candidate_types & b) { - const double dx = b.bytes - a.bytes; - return dx <= 0.0 ? infinity : (b.error - a.error) / dx; - }; - std::vector hull; hull.reserve(candidates.size()); - for (const auto & p : candidates) { + for (const auto & c : candidates) { + auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { + const double dx1 = (double)h1.bytes - (double)h0.bytes; + const double dy1 = h1.error - h0.error; + const double dx2 = (double)p.bytes - (double)h0.bytes; + const double dy2 = p.error - h0.error; + return dx1 * dy2 - dx2 * dy1; + }; + while (hull.size() >= 2) { - const double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); - const double s2 = slope(hull[hull.size() - 1], p); - if (s2 + epsilon < s1) hull.pop_back(); - else { break; } + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { + hull.pop_back(); + } else { + break; + } } - hull.push_back(p); + hull.push_back(c); } + candidates.swap(hull); }; From f184450806163bd1af0eecaff5c31639cf3eaf8f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:10:42 +0100 Subject: [PATCH 091/155] Fix minor logic flaw --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0b2f15f0a6..4c0ec3063a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -957,7 +957,7 @@ static std::unordered_map target_bpw_type( if (out_mse) { *out_mse = total_mse; } if (out_proj) { *out_proj = total_proj; } - const double total_err = slice_bias_lambda ? 
total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; + const double total_err = total_mse + total_bias; return std::isfinite(total_err) ? total_err : infinity; }; From d79ade2e8e45057d9006b0b096888501ae639aab Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:11:26 +0100 Subject: [PATCH 092/155] Adjust for small vector size --- src/llama-quant.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4c0ec3063a..08e1c97185 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -885,9 +885,8 @@ static std::unordered_map target_bpw_type( if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::clamp(k, 0, n / 32); // but no more than ~3% - std::nth_element(v.begin(), v.begin() + k, v.end()); - std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // but no more than ~3% or n/2 if small + std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; From 7ba6001ec8fda89e7d513ced2da7b9aa3532cb70 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:11:54 +0100 Subject: [PATCH 093/155] Simplify candidates sorting --- src/llama-quant.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 08e1c97185..f4c0ea0fcd 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1209,6 +1209,10 @@ static std::unordered_map target_bpw_type( if (a.bytes != b.bytes) { return a.bytes < b.bytes; } return a.error < b.error; }); + const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + return a.bytes == b.bytes; + }); + candidates.erase(last, candidates.end()); // Pareto by bytes -> error std::vector pareto; From d36ee0a0a86a65e1d730e788d735c1606ebeb49a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:41:56 +0100 Subject: [PATCH 094/155] Add comments to explain magic numbers --- src/llama-quant.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f4c0ea0fcd..93007f281e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -739,7 +739,7 @@ static std::unordered_map target_bpw_type( }; auto can_quantize = [&](const ggml_tensor * t) -> bool { - if (ggml_n_dims(t) < 2) { return false; } + if (ggml_n_dims(t) < 2) { return false; } // skip 1D tensors return is_quantizable(ggml_get_name(t), model.arch, params); }; @@ -882,10 +882,10 @@ static std::unordered_map target_bpw_type( auto trimmed_sum = [&](std::vector & v) -> double { const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } - if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // but no more than ~3% or n/2 if small + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% from each tail of the distribution + k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // cap trimming at ~3% (1/32) or half the samples - 1 std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; @@ -1289,7 +1289,7 @@ static 
std::unordered_map target_bpw_type( if (total_elems == 0) { return {}; } const double target_bpw = params->target_bpw; - size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); + size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); // convert bpw to bytes auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; @@ -1362,8 +1362,8 @@ static std::unordered_map target_bpw_type( if (bytes_hi >= prev_bytes_hi) { break; } prev_bytes_hi = bytes_hi; - mu_hi *= 2.0; - if (++expand > 60) { break; } // safety cap + mu_hi *= 2.0; // double the penalty multiplier to reduce tensor sizes + if (++expand > 60) { break; } // safety cap to prevent an infinite loop } } @@ -1371,8 +1371,8 @@ static std::unordered_map target_bpw_type( double best_over_gap = infinity; double best_under_err = infinity; double best_over_err = infinity; - for (int it = 0; it < 40; ++it) { - double mu = 0.5 * (mu_lo + mu_hi); + for (int it = 0; it < 40; ++it) { // binary search iterations for optimal Lagrange multiplier (40 ≈ 1e-12 precision) + double mu = 0.5 * (mu_lo + mu_hi); // midpoint of current bounds lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); const double gap = std::abs((double)bytes_mid - (double)budget_bytes); @@ -1435,7 +1435,7 @@ static std::unordered_map target_bpw_type( if (cur_bytes + delta > budget_bytes) { continue; } double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); - double ratio = err_gain / (double)(delta * 8); + double ratio = err_gain / (double)(delta * 8); // error reduction per bit if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { best_ratio = ratio; best_delta = delta; From 8eedcf74bc4df64eb7fe5b4935390dc9ad73d104 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:42:37 +0100 Subject: [PATCH 095/155] Increase scale multiplier --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 93007f281e..0f05c8f956 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -983,7 +983,7 @@ static std::unordered_map target_bpw_type( if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); + l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From a74b410f5f6bd11ff42cc1f40fa93242d0f67940 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Sep 2025 19:49:47 +0100 Subject: [PATCH 096/155] Move is_iq() into a lambda and remove unused variables --- src/llama-quant.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0f05c8f956..af564ce03e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -727,11 +727,28 @@ static std::unordered_map target_bpw_type( return (double)bytes * 8.0 / (double)ggml_nelements(t); }; - auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { + auto is_compatible = [](const ggml_tensor * t, const ggml_type typ) -> bool { const int64_t blck = ggml_blck_size(typ); return blck <= 1 || (t->ne[0] % blck) == 0; }; + auto is_iq = [](const enum ggml_type t) { + switch (t) { + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_NL: + 
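// The IQ ("i-quant") types grouped here generally rely on an importance matrix
// during quantization; when no usable imatrix is available for a tensor they are
// skipped as candidates (see the "no or mismatched imatrix" warning above).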
case GGML_TYPE_IQ4_XS: + return true; + default: + return false; + } + }; + auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { if (is_compatible(t, typ)) return typ; ggml_type fb = fallback_type(typ); @@ -995,8 +1012,6 @@ static std::unordered_map target_bpw_type( std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { - std::vector workers; - workers.reserve(std::max(1, nthread)); ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); if (!can_quantize(tensor)) { continue; } From dbdd179a92426c2031e4bee1ba0ccace45ea29fe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Sep 2025 19:50:20 +0100 Subject: [PATCH 097/155] Combine quant types --- src/llama-quant.cpp | 75 ++++++++------------------------------------- 1 file changed, 13 insertions(+), 62 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index af564ce03e..f36b9202d5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -75,43 +75,6 @@ static bool is_quantizable(const std::string & name, const llm_arch arch, const return q; } -static bool is_iq(const enum ggml_type t) { - switch (t) { - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - return true; - default: - return false; - } -} - -static bool is_iq(const enum llama_ftype t) { - switch (t) { - case LLAMA_FTYPE_MOSTLY_IQ1_S: - case LLAMA_FTYPE_MOSTLY_IQ1_M: - case LLAMA_FTYPE_MOSTLY_IQ2_XXS: - case LLAMA_FTYPE_MOSTLY_IQ2_XS: - case LLAMA_FTYPE_MOSTLY_IQ2_S: - case LLAMA_FTYPE_MOSTLY_IQ2_M: - case LLAMA_FTYPE_MOSTLY_IQ3_XXS: - case LLAMA_FTYPE_MOSTLY_IQ3_XS: - case LLAMA_FTYPE_MOSTLY_IQ3_S: - case LLAMA_FTYPE_MOSTLY_IQ3_M: - case LLAMA_FTYPE_MOSTLY_IQ4_XS: - case LLAMA_FTYPE_MOSTLY_IQ4_NL: - return true; - default: - return false; - } -} - static enum ggml_type fallback_type(const enum ggml_type new_type) { switch (new_type) { case GGML_TYPE_TQ1_0: @@ -678,33 +641,21 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; - constexpr ggml_type k_quants[] = { - GGML_TYPE_Q2_K, - GGML_TYPE_Q3_K, - GGML_TYPE_Q4_K, - GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0, -// TODO: find better way to handle F16/BF16 -#ifdef GGML_USE_METAL - GGML_TYPE_F16 -#else - GGML_TYPE_BF16 -#endif - }; - - constexpr ggml_type iq_quants[] = { + // subset of quantization types with the best accuracy/size tradeoff + constexpr ggml_type quant_types[] = { GGML_TYPE_IQ1_S, + GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, - GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_S, + GGML_TYPE_Q2_K, + GGML_TYPE_IQ3_XXS, + GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, - // TODO: find better way to handle F16/BF16 #ifdef GGML_USE_METAL GGML_TYPE_F16 #else @@ -896,7 +847,7 @@ static std::unordered_map target_bpw_type( } // Compute error per slice with trimmed aggregation - auto trimmed_sum = [&](std::vector & v) -> double { + auto trimmed_sum = [](std::vector & v) -> double { const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets @@ -978,7 +929,7 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [&](const float * values, const 
float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + auto estimate_lambda = [](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { const int64_t ns = std::max(1, ne2); std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } @@ -1141,8 +1092,8 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; size_t max_row_sz = 0; - const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; - const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); + const ggml_type * base_arr = quant_types; + const size_t base_sz = std::size(quant_types); std::vector compatible_candidates; compatible_candidates.reserve(base_sz); From dd4f4bd0b88c4d59613033ba941d85e7ce1d9547 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:23:48 +0100 Subject: [PATCH 098/155] Reduce bpw range --- src/llama-quant.cpp | 7 +------ tools/quantize/quantize.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f36b9202d5..0386352014 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -655,12 +655,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0, -#ifdef GGML_USE_METAL - GGML_TYPE_F16 -#else - GGML_TYPE_BF16 -#endif + GGML_TYPE_Q8_0 }; constexpr double epsilon = 1e-12; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 03018cc301..69e03179b3 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); printf(" --no-bias: use mean square error estimation only (no aligment bias)\n"); printf(" Advanced option use MSE only and disable aligment bias error estimation\n"); @@ -484,13 +484,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { try { target_bpw = std::stof(data); - if (target_bpw < 0.0f || target_bpw > 16.0f) { - printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__); + if (target_bpw < 0.0f || target_bpw > 8.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); return false; } } catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); + printf("\n%s: '%s' is not valid. 
Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); return false; } From d16945730eac146d87d158a97ef053f845921f01 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:25:29 +0100 Subject: [PATCH 099/155] Refactor outlier trimming --- src/llama-quant.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0386352014..df36a705c2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -847,8 +847,7 @@ static std::unordered_map target_bpw_type( if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% from each tail of the distribution - k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // cap trimming at ~3% (1/32) or half the samples - 1 + int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 2.5% from each tail of the distribution std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; From 87cba659089342ef4e4c2209d9a750555ae140e3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:26:30 +0100 Subject: [PATCH 100/155] Tighten worker allocator --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index df36a705c2..90931f25e7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1136,7 +1136,7 @@ static std::unordered_map target_bpw_type( std::vector tl_quantized_buffer(quantized_buffer.size()); std::vector tl_dequantized_buffer(dequantized_buffer.size()); for (;;) { - const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); + const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel); if (i >= compatible_candidates.size()) { break; } const ggml_type tensor_types = compatible_candidates[i]; From 8a2c71f471842a9b2dcc0bc33592cd7adb8b8dfe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:27:29 +0100 Subject: [PATCH 101/155] Check for direction reversal --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 90931f25e7..601b9ada42 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1204,7 +1204,7 @@ static std::unordered_map target_bpw_type( }; while (hull.size() >= 2) { - if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= -1 * epsilon) { // very small negative tolerance hull.pop_back(); } else { break; From 3d75b14c0f2fc605fb39a3cb425c4c2482b8d8f5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:27:58 +0100 Subject: [PATCH 102/155] Simplify dequantisation --- src/llama-quant.cpp | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 601b9ada42..316dd35fa8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -819,25 +819,16 @@ static std::unordered_map target_bpw_type( // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (traits && traits->to_float && quant_type != GGML_TYPE_F16 && quant_type != GGML_TYPE_BF16) { - traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_rows * (size_t)n_per_row)); - } else { - for (size_t r = 0; r < sample_rows; ++r) { - const 
uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - if (quant_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (quant_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - } else { - if (!traits || !traits->to_float) { - if (out_mse) { *out_mse = infinity; } - if (out_proj) { *out_proj = 0.0; } - return infinity; - } - traits->to_float(src, dst, (int)n_per_row); - } - } + if (!traits || !traits->to_float) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + return infinity; + } + + for (size_t r = 0; r < sample_rows; ++r) { + const uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + traits->to_float(src, dst, (int)n_per_row); } } From e49e241d37e7fd7f25142ee514c9e129c304083b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:28:39 +0100 Subject: [PATCH 103/155] Calculate bpw over all tensors --- src/llama-quant.cpp | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 316dd35fa8..699264553a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1219,6 +1219,18 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } + // Compute total elements across all tensors and bytes for non-quantizable tensors + size_t nq_elements = 0; + size_t nq_bytes = 0; + for (const auto & it : ml.weights_map) { + const ggml_tensor * tensor = it.second.tensor; + const std::string name = it.first; + nq_elements += (size_t)ggml_nelements(tensor); + if (!is_quantizable(name, model.arch, params)) { + nq_bytes += ggml_nbytes(tensor); + } + } + auto total_bytes = [&]() -> size_t { size_t tb = 0; for (const auto & ti : all) { @@ -1228,19 +1240,20 @@ static std::unordered_map target_bpw_type( return tb; }; - size_t total_elems = 0; + size_t q_elements = 0; size_t min_bytes = 0; size_t max_bytes = 0; for (const auto & ti : all) { - total_elems += (size_t)ti.n_elements; + q_elements += (size_t)ti.n_elements; min_bytes += ti.candidate.front().bytes; // smallest candidate per tensor max_bytes += ti.candidate.back().bytes; // largest candidate per tensor } - if (total_elems == 0) { return {}; } + if (q_elements == 0) { return {}; } const double target_bpw = params->target_bpw; - size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); // convert bpw to bytes + size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0); + size_t budget_bytes = target_total_bytes >= nq_bytes ? 
target_total_bytes - nq_bytes : min_bytes; auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; @@ -1374,29 +1387,35 @@ static std::unordered_map target_bpw_type( int best_i = -1; int best_j = -1; double best_ratio = -1.0; - size_t best_delta = 0; + double best_gain = -1.0; + for (int i = 0; i < (int)all.size(); ++i) { const auto &ti = all[i]; int j = ti.choice + 1; - // skip same-bytes entries while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } - size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; - if (cur_bytes + delta > budget_bytes) { continue; } + size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; + if (cur_bytes + delta_bytes > budget_bytes) { continue; } double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); - double ratio = err_gain / (double)(delta * 8); // error reduction per bit - if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { + if (err_gain < epsilon) { continue; } // no real improvement + + double ratio = err_gain / (double)delta_bytes; // error reduction per byte + // For tie-breaking, prioritize the largest absolute error improvement. + if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) { best_ratio = ratio; - best_delta = delta; + best_gain = err_gain; best_i = i; best_j = j; } } - if (best_i < 0) { break; } + + if (best_i < 0) { break; } // no more upgrades within budget found + + size_t upgrade_cost = all[best_i].candidate[best_j].bytes - all[best_i].candidate[all[best_i].choice].bytes; all[best_i].choice = best_j; - cur_bytes += best_delta; + cur_bytes += upgrade_cost; } } From b3b8a111a58a8a1586c763382463ccdf9bba3f6a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 28 Sep 2025 18:45:25 +0100 Subject: [PATCH 104/155] Compute rows based on tensor shape and slice count --- src/llama-quant.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 699264553a..7bfb8751ae 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -650,9 +650,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ3_XXS, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, - GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0 @@ -961,10 +959,24 @@ static std::unordered_map target_bpw_type( ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample - const int rows_sample_per_expert = activations_data ? 512 : 256; const int64_t n_per_row = tensor->ne[0]; const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; + + // Compute rows based on tensor shape and slice count + auto sample_rows = [](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) -> int64_t { + const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024; + const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large nrt + const double slice_budget = tensor_budget * scale_rows / std::max(1, n2); + const int64_t min_rows = has_acts ? 
128 : 64; + const int64_t max_rows = 4096; + int64_t total_rows = std::llround(slice_budget / std::max(1, n)); + total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows))); + if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors + return total_rows; + }; + + const int64_t rows_sample_per_expert = sample_rows(n_per_row, nrows_total, ne2, activations_data != nullptr); std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); std::vector rows_sample(ne2, 0); From f5d8811ddde7533c561ad77d358d1d509a57ff9f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 1 Oct 2025 19:04:43 +0100 Subject: [PATCH 105/155] Prioritise important tensors --- src/llama-quant.cpp | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7bfb8751ae..a93d982e63 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -656,6 +656,13 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q8_0 }; + const char * important_tensors[] = { + ".output.weight", + ".attn_output.weight", + ".ffn_down.weight", + ".ffn_down_shexp.weight" + }; + constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); const char * func = __func__; @@ -1288,6 +1295,13 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } + auto is_important = [&](const std::string & tensor_name) -> bool { + return std::any_of(std::begin(important_tensors), std::end(important_tensors), [&](const char* imp) { + return tensor_name.find(imp) != std::string::npos; + } + ); + }; + // Lagrangian relaxation to minimise error subject to a bpw target constraint auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { choice.resize(all.size()); @@ -1295,11 +1309,15 @@ static std::unordered_map target_bpw_type( err = 0.0; for (size_t i = 0; i < all.size(); ++i) { const auto & candidate = all[i].candidate; + const std::string tensor_name = ggml_get_name(all[i].w->tensor); + double effective_mu = mu; + if (is_important(tensor_name)) { effective_mu *= 0.1; } // important tensors get 10x lower penalty + int best_j = 0; double best_val = infinity; for (int j = 0; j < (int)candidate.size(); ++j) { const double bits = (double)candidate[j].bytes * 8.0; - const double val = candidate[j].error + mu * bits; + const double val = candidate[j].error + effective_mu * bits; if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && candidate[j].bytes < candidate[best_j].bytes)) { best_val = val; best_j = j; @@ -1402,18 +1420,21 @@ static std::unordered_map target_bpw_type( double best_gain = -1.0; for (int i = 0; i < (int)all.size(); ++i) { - const auto &ti = all[i]; + const auto & ti = all[i]; + const std::string tensor_name = ggml_get_name(ti.w->tensor); int j = ti.choice + 1; while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } - if (j >= (int)ti.candidate.size()) { continue; } + if (j >= (int)ti.candidate.size()) { continue; } // no upgrade available size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; - if (cur_bytes + delta_bytes > budget_bytes) { continue; } + if (cur_bytes + delta_bytes > budget_bytes) { continue; } // won't fit in budget double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); - if (err_gain < epsilon) { continue; } // no real improvement 
+ if (err_gain < epsilon) { continue; } // no error improvement double ratio = err_gain / (double)delta_bytes; // error reduction per byte + if (is_important(tensor_name)) { ratio *= 2.0; } // important tensors get 2x boost + // For tie-breaking, prioritize the largest absolute error improvement. if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) { best_ratio = ratio; From 940db63144d7369f88145a099370cf1bd33a45b7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 3 Oct 2025 11:08:02 +0100 Subject: [PATCH 106/155] Select quantization type if target_bpw is set unless user specifies type and threads --- tools/quantize/quantize.cpp | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 69e03179b3..89cf0fbf80 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -497,6 +497,24 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { return true; } +static const char * get_ftype(const float bpw) { + const std::map quant_bpw = { + {1.5625, "IQ1_S"}, + {1.7500, "IQ1_M"}, + {2.0625, "IQ2_XXS"}, + {2.6250, "Q2_K"}, + {3.0625, "IQ3_XXS"}, + {3.4375, "Q3_K"}, + {4.2500, "IQ4_XS"}, + {4.5000, "Q4_K"}, + {5.5000, "Q5_K"}, + {6.5625, "Q6_K"}, + {8.5000, "Q8_0"} + }; + + return quant_bpw.lower_bound(bpw)->second; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -655,6 +673,7 @@ int main(int argc, char ** argv) { std::string ftype_str; std::string suffix = ".gguf"; + std::vector tmp_argv(argv, argv + argc); if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { std::string fpath; const size_t pos = fname_inp.find_last_of("/\\"); @@ -678,7 +697,21 @@ int main(int argc, char ** argv) { } arg_idx++; - if (argc <= arg_idx) { + // select quantization type if target_bpw is set unless user specifies type and threads + if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) { + auto * ftype = const_cast(get_ftype(params.target_bpw)); + if (argc == arg_idx) { + tmp_argv.push_back(ftype); + tmp_argv.push_back(nullptr); + argv = const_cast(tmp_argv.data()); + argc++; + } else { + tmp_argv.insert(tmp_argv.end() - 1, ftype); + tmp_argv.push_back(nullptr); + argv = const_cast(tmp_argv.data()); + argc++; + } + } else if (argc <= arg_idx) { fprintf(stderr, "%s: missing ftype\n", __func__); return 1; } From 66d4aed173aba8b3b4e05c6d7b46ca8911ec7ddf Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 4 Oct 2025 08:21:01 +0100 Subject: [PATCH 107/155] Minor refactoring --- tools/quantize/quantize.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 89cf0fbf80..d355f97274 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -700,17 +700,11 @@ int main(int argc, char ** argv) { // select quantization type if target_bpw is set unless user specifies type and threads if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) { auto * ftype = const_cast(get_ftype(params.target_bpw)); - if (argc == arg_idx) { - tmp_argv.push_back(ftype); - tmp_argv.push_back(nullptr); - argv = const_cast(tmp_argv.data()); - argc++; - } else { - tmp_argv.insert(tmp_argv.end() - 1, ftype); - tmp_argv.push_back(nullptr); - argv = const_cast(tmp_argv.data()); - argc++; - } + if (argc == arg_idx) { tmp_argv.push_back(ftype); } + else { tmp_argv.insert(tmp_argv.end() - 1, ftype); } + 
tmp_argv.push_back(nullptr); + argv = const_cast(tmp_argv.data()); + argc++; } else if (argc <= arg_idx) { fprintf(stderr, "%s: missing ftype\n", __func__); return 1; From 560e8c9d70964320a0283936b0d8e9fd198356ee Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 14:41:42 +0100 Subject: [PATCH 108/155] Relax lambda clamping --- src/llama-quant.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a93d982e63..422c929f0c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -701,7 +701,7 @@ static std::unordered_map target_bpw_type( }; auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) return typ; + if (is_compatible(t, typ)) { return typ; } ggml_type fb = fallback_type(typ); return is_compatible(t, fb) ? fb : GGML_TYPE_F16; }; @@ -941,7 +941,7 @@ static std::unordered_map target_bpw_type( if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 12.0); + l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 16.0); } lambdas[(size_t)s] = l; @@ -1035,7 +1035,7 @@ static std::unordered_map target_bpw_type( for (int64_t r = offset; r < nrows_total && current < rows_sample_max; r += stride) { const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (src_type == GGML_TYPE_F32) { - auto src_f32 = (const float *)src_row; + const auto *src_f32 = (const float *)src_row; f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); } else { row_to_fp32(src_row, row_buffer.data()); @@ -1173,7 +1173,7 @@ static std::unordered_map target_bpw_type( // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve auto pareto_convex = [](std::vector & candidates) { - if (candidates.empty()) return; + if (candidates.empty()) { return; } std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bytes != b.bytes) { return a.bytes < b.bytes; } From 533cda3076b5ae26d120f04b7aaa813f7b7a5ac7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:16:33 +0100 Subject: [PATCH 109/155] Add signal handler --- src/llama-quant.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 422c929f0c..50c8dbf423 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -613,6 +614,12 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } +static std::atomic bpw_stop{ false }; + +static void signal_handler(int) { + bpw_stop.store(true, std::memory_order_relaxed); +} + // Returns tensor type overrides to meet a global bpw target static std::unordered_map target_bpw_type( llama_model_loader & ml, @@ -711,6 +718,22 @@ static std::unordered_map target_bpw_type( return is_quantizable(ggml_get_name(t), model.arch, params); }; + auto install_signal_handlers = [] { + static std::once_flag once; + std::call_once(once, [] { + std::signal(SIGINT, signal_handler); + std::signal(SIGTERM, signal_handler); + }); + }; + + auto uninstall_signal_handlers = [] { + static std::once_flag once; + std::call_once(once, [] { + std::signal(SIGINT, SIG_DFL); + std::signal(SIGTERM, SIG_DFL); + }); + }; + // Estimate error for a given type using a 
sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, From e48ca32f19095ba0c47058dc7a703c1bb52977e0 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:17:27 +0100 Subject: [PATCH 110/155] Add save_bpw_state() --- src/llama-quant.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 50c8dbf423..3080b0ed71 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -734,6 +734,56 @@ static std::unordered_map target_bpw_type( }); }; + // Saved state per tensor + struct saved_info { + std::vector candidate; + int choice = -1; + float min_bpw = 0.0f; + float max_bpw = 0.0f; + size_t n_elements = 0; + }; + + auto save_bpw_state = [&](const std::vector & all_vec) { + const std::string tmp = checkpoint_file + ".tmp"; + std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); + if (!ofs) { return; } // best-effort + const float target_bpw = params->target_bpw; + const uint8_t bias_mode = params->no_bias ? 1 : 0; + ofs.write((const char *)&file_magic, sizeof(file_magic)); + ofs.write((const char *)&target_bpw, sizeof(target_bpw)); + ofs.write((const char *)&bias_mode, sizeof(bias_mode)); + const uint64_t n = all_vec.size(); + ofs.write((const char *)&n, sizeof(n)); + for (const auto & ti : all_vec) { + const std::string name = ggml_get_name(ti.w->tensor); + const uint32_t len = (uint32_t)name.size(); + ofs.write((const char *)&len, sizeof(len)); + ofs.write(name.data(), len); + + const uint64_t cn = ti.candidate.size(); + ofs.write((const char *)&cn, sizeof(cn)); + ofs.write((const char *)&ti.choice, sizeof(ti.choice)); + ofs.write((const char *)&ti.min_bpw, sizeof(ti.min_bpw)); + ofs.write((const char *)&ti.max_bpw, sizeof(ti.max_bpw)); + const uint64_t ne = ti.n_elements; + ofs.write((const char *)&ne, sizeof(ne)); + + for (const auto & c : ti.candidate) { + const int32_t t = c.type; + const uint64_t b = c.bytes; + ofs.write((const char *)&t, sizeof(t)); + ofs.write((const char *)&c.bpw, sizeof(c.bpw)); + ofs.write((const char *)&b, sizeof(b)); + ofs.write((const char *)&c.error, sizeof(c.error)); + } + } + + ofs.close(); + std::remove(checkpoint_file.c_str()); // TODO: handle errors + std::rename(tmp.c_str(), checkpoint_file.c_str()); + LLAMA_LOG_INFO("%s: saved bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); + }; + // Estimate error for a given type using a sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, From 02c3073b81cc7fa26219419c517331b3e3243379 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:18:36 +0100 Subject: [PATCH 111/155] Add load_bpw_state() --- src/llama-quant.cpp | 64 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3080b0ed71..4d0dc6a36e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -672,7 +672,9 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); + constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; + const std::string checkpoint_file = ml.arch_name + ".bpw_state"; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; @@ -784,6 +786,68 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_INFO("%s: saved bpw progress for %lu 
tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); }; + auto load_bpw_state = [&]() -> std::unordered_map { + std::unordered_map out; + std::ifstream ifs(checkpoint_file, std::ios::binary); + if (!ifs) { return out; } + + uint32_t magic = 0; + float target_bpw = 0.0f; + uint8_t bias_mode = 0; + ifs.read((char *)&magic, sizeof(magic)); + ifs.read((char *)&target_bpw, sizeof(target_bpw)); + ifs.read((char *)&bias_mode, sizeof(bias_mode)); + if (magic != file_magic) { + LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); + return out; + } + if (target_bpw != params->target_bpw) { + LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, target_bpw, checkpoint_file.c_str()); + return out; + } + if (bias_mode != (params->no_bias ? 1 : 0)) { + LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str()); + return out; + } + + uint64_t n = 0; + ifs.read((char *)&n, sizeof(n)); + for (uint64_t i = 0; i < n; ++i) { + uint32_t len = 0; + ifs.read((char *)&len, sizeof(len)); + std::string name(len, '\0'); + ifs.read(name.data(), len); + + uint64_t cn = 0; + ifs.read((char *)&cn, sizeof(cn)); + + saved_info si; + ifs.read((char *)&si.choice, sizeof(si.choice)); + ifs.read((char *)&si.min_bpw, sizeof(si.min_bpw)); + ifs.read((char *)&si.max_bpw, sizeof(si.max_bpw)); + uint64_t ne = 0; + ifs.read((char *)&ne, sizeof(ne)); + si.n_elements = (size_t)ne; + + si.candidate.resize(cn); + for (size_t j = 0; j < si.candidate.size(); ++j) { + int32_t t = 0; + uint64_t b = 0; + ifs.read((char *)&t, sizeof(t)); + si.candidate[j].type = (ggml_type)t; + ifs.read((char *)&si.candidate[j].bpw, sizeof(si.candidate[j].bpw)); + ifs.read((char *)&b, sizeof(b)); + si.candidate[j].bytes = (size_t)b; + ifs.read((char *)&si.candidate[j].error, sizeof(si.candidate[j].error)); + } + + out.emplace(std::move(name), std::move(si)); + } + + LLAMA_LOG_INFO("%s: loaded bpw state for %lu tensors from %s\n", func, out.size(), checkpoint_file.c_str()); + return out; + }; + // Estimate error for a given type using a sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, From 74c62ed4e63e4e95f031875b6ead5718f5fb900a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:19:03 +0100 Subject: [PATCH 112/155] Add delete_bpw_state() --- src/llama-quant.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4d0dc6a36e..9212c88563 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -848,6 +848,19 @@ static std::unordered_map target_bpw_type( return out; }; + auto delete_bpw_state = [&] { + LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); + std::remove(checkpoint_file.c_str()); + }; + + auto check_signal_handler = [&](const std::vector & all_vec) { + if (bpw_stop.load(std::memory_order_relaxed)) { + LLAMA_LOG_INFO("\n%s: saving bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); + save_bpw_state(all_vec); + throw std::runtime_error("user interrupted the process"); + } + }; + // Estimate error for a given type using a sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, From 46706cec28ad83b8ab10781493b84343b5b0f048 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:20:28 +0100 Subject: [PATCH 113/155] Persist progress --- src/llama-quant.cpp | 25 ++++++++++++++++++++++++- 1 file 
changed, 24 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9212c88563..640672aec7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1100,12 +1100,28 @@ static std::unordered_map target_bpw_type( return lambdas; }; + install_signal_handlers(); + auto bpw_data = load_bpw_state(); std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); if (!can_quantize(tensor)) { continue; } + check_signal_handler(all); + + // If we already have fully evaluatedd this tensor then reuse it + if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) { + tensor_info info; + info.w = tw; + info.candidate = it_saved->second.candidate; + info.choice = it_saved->second.choice; + info.min_bpw = it_saved->second.min_bpw; + info.max_bpw = it_saved->second.max_bpw; + info.n_elements = it_saved->second.n_elements ? it_saved->second.n_elements : (size_t)ggml_nelements(tensor); + all.push_back(std::move(info)); + continue; + } LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor)); if (!ml.use_mmap) { @@ -1296,6 +1312,7 @@ static std::unordered_map target_bpw_type( std::vector tl_quantized_buffer(quantized_buffer.size()); std::vector tl_dequantized_buffer(dequantized_buffer.size()); for (;;) { + if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel); if (i >= compatible_candidates.size()) { break; } @@ -1311,6 +1328,11 @@ static std::unordered_map target_bpw_type( for (auto &th : eval_workers) { th.join(); } + // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry + if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) { + check_signal_handler(all); + } + for (auto &c : eval_candidates) { if (c.bytes > 0) { info.candidate.push_back(c); } } @@ -1384,6 +1406,7 @@ static std::unordered_map target_bpw_type( info.min_bpw = info.candidate.front().bpw; info.max_bpw = info.candidate.back().bpw; all.push_back(std::move(info)); + check_signal_handler(all); // save after each tensor } if (all.empty()) { return {}; } @@ -1441,7 +1464,7 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } if (budget_bytes >= max_bytes) { - for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; } + for (auto & ti : all) { ti.choice = (int)ti.candidate.size() - 1; } return emit_overrides(); } From 84ada44894dec721124613820bf640b97ac3e784 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:20:56 +0100 Subject: [PATCH 114/155] Uninstall signal handler and cleanup --- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 640672aec7..eb5c9124b5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1625,6 +1625,9 @@ static std::unordered_map target_bpw_type( } } + delete_bpw_state(); // we're done, clear any checkpoint + uninstall_signal_handlers(); + return emit_overrides(); } From 044fa783c7e5e87bddf667fbe7396628e827b455 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 6 Oct 2025 21:40:37 +0100 Subject: [PATCH 115/155] Fix trimming logic --- src/llama-quant.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 
eb5c9124b5..aeb1542607 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -849,8 +849,12 @@ static std::unordered_map target_bpw_type( }; auto delete_bpw_state = [&] { - LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); - std::remove(checkpoint_file.c_str()); + std::ifstream ifs(checkpoint_file); + if (ifs.good()) { + LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); + std::remove(checkpoint_file.c_str()); + } + }; auto check_signal_handler = [&](const std::vector & all_vec) { @@ -988,14 +992,16 @@ static std::unordered_map target_bpw_type( } // Compute error per slice with trimmed aggregation - auto trimmed_sum = [](std::vector & v) -> double { + auto trimmed_mean = [](std::vector & v) -> double { const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } - if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets - - int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 2.5% from each tail of the distribution + double sum = std::accumulate(v.begin(), v.end(), 0.0); + if (n < 50) { return sum / (double)n; } // too few elements to trim + int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 5% (2.5% each side) std::sort(v.begin(), v.end()); - return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); + const auto num = (double)(n - 2 * k); + sum = std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); + return sum / std::max(1.0, num); }; size_t off = 0; @@ -1028,7 +1034,7 @@ static std::unordered_map target_bpw_type( } const double denom_x = row_sq_norm[ridx]; - const double m_norm = w_mse / (denom_x + epsilon); + const double m_norm = w_mse / (denom_x + epsilon); row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); if (a) { @@ -1044,9 +1050,8 @@ static std::unordered_map target_bpw_type( off += (size_t)n_per_row; } - const double scale_rows = (double)nrows / std::max(1.0, (double)rs); - const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows; - const double slice_proj = a ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; + const double slice_mse = trimmed_mean(row_mse_norm) * (double)nrows; + const double slice_proj = a ? trimmed_mean(row_proj_norm) * (double)nrows : 0.0; total_mse += slice_mse; total_proj += slice_proj; From c11184a3c11917aba2c3d360a9cbb3bf3ebaf38a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 9 Oct 2025 11:58:01 +0100 Subject: [PATCH 116/155] Generate model ID hash --- src/llama-quant.cpp | 51 +++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index aeb1542607..5388d5a072 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -674,7 +674,6 @@ static std::unordered_map target_bpw_type( constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; - const std::string checkpoint_file = ml.arch_name + ".bpw_state"; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; @@ -745,6 +744,26 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; + auto djb2_hash = [](const uint8_t * data, size_t n) -> uint64_t { + uint64_t h = 5381; + for (size_t i = 0; i < n; ++i) { + h = (h << 5) + h + data[i]; + } + return h ? 
h : 0xeabada55cafed00d; + }; + + auto metadata_id = [&](const gguf_context * ctx) -> uint64_t { + const size_t sz = gguf_get_meta_size(ctx); + std::vector buf(sz); + gguf_get_meta_data(ctx, buf.data()); + return djb2_hash(buf.data(), buf.size()); + }; + + char hex[17]; + const uint64_t model_id = metadata_id(ml.meta.get()); + std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id); + const std::string checkpoint_file = ml.arch_name + "-" + std::string(hex) + ".bpw_state"; + auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); @@ -752,6 +771,7 @@ static std::unordered_map target_bpw_type( const float target_bpw = params->target_bpw; const uint8_t bias_mode = params->no_bias ? 1 : 0; ofs.write((const char *)&file_magic, sizeof(file_magic)); + ofs.write((const char *)&model_id, sizeof(model_id)); ofs.write((const char *)&target_bpw, sizeof(target_bpw)); ofs.write((const char *)&bias_mode, sizeof(bias_mode)); const uint64_t n = all_vec.size(); @@ -781,9 +801,9 @@ static std::unordered_map target_bpw_type( } ofs.close(); - std::remove(checkpoint_file.c_str()); // TODO: handle errors + std::remove(checkpoint_file.c_str()); std::rename(tmp.c_str(), checkpoint_file.c_str()); - LLAMA_LOG_INFO("%s: saved bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); + LLAMA_LOG_INFO("%s: saved progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); }; auto load_bpw_state = [&]() -> std::unordered_map { @@ -792,22 +812,27 @@ static std::unordered_map target_bpw_type( if (!ifs) { return out; } uint32_t magic = 0; - float target_bpw = 0.0f; - uint8_t bias_mode = 0; + uint64_t id = 0; + float bpw = 0.0f; + uint8_t bias = 0; ifs.read((char *)&magic, sizeof(magic)); - ifs.read((char *)&target_bpw, sizeof(target_bpw)); - ifs.read((char *)&bias_mode, sizeof(bias_mode)); + ifs.read((char *)&id, sizeof(id)); + ifs.read((char *)&bpw, sizeof(bpw)); + ifs.read((char *)&bias, sizeof(bias)); if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } - if (target_bpw != params->target_bpw) { - LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, target_bpw, checkpoint_file.c_str()); + } else if (id != model_id) { + LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } - if (bias_mode != (params->no_bias ? 1 : 0)) { + } else if (bpw != params->target_bpw) { + LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str()); + return out; + } else if (bias != (params->no_bias ? 
1 : 0)) { LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str()); return out; + } else { + LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); } uint64_t n = 0; @@ -859,7 +884,7 @@ static std::unordered_map target_bpw_type( auto check_signal_handler = [&](const std::vector & all_vec) { if (bpw_stop.load(std::memory_order_relaxed)) { - LLAMA_LOG_INFO("\n%s: saving bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); + LLAMA_LOG_INFO("\n%s: saving progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); save_bpw_state(all_vec); throw std::runtime_error("user interrupted the process"); } From 3a3d807fc3aacc01715047bcc893f925f5343c6b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 10 Oct 2025 13:10:42 +0100 Subject: [PATCH 117/155] Remove bias mode computation --- src/llama-quant.cpp | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5388d5a072..7b3e956193 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -769,11 +769,9 @@ static std::unordered_map target_bpw_type( std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); if (!ofs) { return; } // best-effort const float target_bpw = params->target_bpw; - const uint8_t bias_mode = params->no_bias ? 1 : 0; ofs.write((const char *)&file_magic, sizeof(file_magic)); ofs.write((const char *)&model_id, sizeof(model_id)); ofs.write((const char *)&target_bpw, sizeof(target_bpw)); - ofs.write((const char *)&bias_mode, sizeof(bias_mode)); const uint64_t n = all_vec.size(); ofs.write((const char *)&n, sizeof(n)); for (const auto & ti : all_vec) { @@ -814,11 +812,9 @@ static std::unordered_map target_bpw_type( uint32_t magic = 0; uint64_t id = 0; float bpw = 0.0f; - uint8_t bias = 0; ifs.read((char *)&magic, sizeof(magic)); ifs.read((char *)&id, sizeof(id)); ifs.read((char *)&bpw, sizeof(bpw)); - ifs.read((char *)&bias, sizeof(bias)); if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; @@ -828,9 +824,6 @@ static std::unordered_map target_bpw_type( } else if (bpw != params->target_bpw) { LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str()); return out; - } else if (bias != (params->no_bias ? 1 : 0)) { - LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str()); - return out; } else { LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); } @@ -1319,13 +1312,11 @@ static std::unordered_map target_bpw_type( std::vector lambdas; const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - if (!params->no_bias) { - double acc = 0.0; - int ns = 0; - lambdas = estimate_lambda(values, activations, n_per_row, ne2); - for (float l : lambdas) { acc += l; ++ns; } - tensor_lambda = ns ? (float)(acc / ns) : 0.0f; - } + double acc = 0.0; + int ns = 0; + lambdas = estimate_lambda(values, activations, n_per_row, ne2); + for (float l : lambdas) { acc += l; ++ns; } + tensor_lambda = ns ? 
(float)(acc / ns) : 0.0f; // Evaluate candidates std::vector eval_candidates(compatible_candidates.size()); @@ -1925,11 +1916,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { if (params->activations) { - LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ",__func__); + LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n",__func__); } else { - LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); + LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); } - LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no alignment bias)" : "alignment bias (default)"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { From c93131cef6dbb4e415fd2b3625f644c6714e7465 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 10 Oct 2025 13:26:51 +0100 Subject: [PATCH 118/155] Remove --no-bias option --- include/llama.h | 1 - src/llama-quant.cpp | 3 +-- tools/quantize/quantize.cpp | 6 +----- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/llama.h b/include/llama.h index 16f6124727..1df8f96920 100644 --- a/include/llama.h +++ b/include/llama.h @@ -365,7 +365,6 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) - bool no_bias; // use mean square error estimation only (no aligment bias) } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7b3e956193..4ad5124d1a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -2180,8 +2180,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, - /*.target_bpw =*/ -1.0f, - /*.no_bias =*/ false + /*.target_bpw =*/ -1.0f }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index d355f97274..c254c3f6b2 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); - printf(" [--target-bpw n] [--no-bias] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--target-bpw n] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); @@ -134,8 +134,6 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --no-bias: use mean square error estimation only (no aligment bias)\n"); - printf(" Advanced option use MSE only and disable aligment bias error estimation\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -559,8 +557,6 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--no-bias") == 0) { - params.no_bias = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From 5b0d3f6d5ad46596e0f30c967c00e2dc2b93d8da Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 11 Oct 2025 10:04:48 +0100 Subject: [PATCH 119/155] Automatically determine if bias error is significant --- src/llama-quant.cpp | 52 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ad5124d1a..07a88f0fd6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -637,6 +637,8 @@ static std::unordered_map target_bpw_type( float bpw; size_t bytes; double error; + double mse = 0.0; + double proj = 0.0; }; struct tensor_info { @@ -1340,9 +1342,11 @@ static std::unordered_map target_bpw_type( const ggml_type tensor_types = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(tensor, tensor_types); const size_t bytes = tensor_bytes(tensor, tensor_types); + double mse = 0.0; + double proj = 0.0; const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, - tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda); - eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err }; + tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); + eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj }; } }); } @@ -1354,8 +1358,48 @@ static std::unordered_map target_bpw_type( check_signal_handler(all); } - for (auto &c : eval_candidates) { - if (c.bytes > 0) { info.candidate.push_back(c); } + // Check if biasing is needed + bool bias_needed = false; + if (!lambdas.empty()) { + int min_mse = -1; + int min_bias = -1; + { + double best_mse = std::numeric_limits::infinity(); + double best_err = std::numeric_limits::infinity(); + for (int i = 0; i < (int)eval_candidates.size(); ++i) { + const auto & c = eval_candidates[i]; + if (c.bytes == 0) { continue; } + if (c.mse < best_mse) { + best_mse = c.mse; + min_mse = i; + } + if (c.error < best_err) { + best_err = c.error; + min_bias = i; + } + } + } + + if (min_mse != min_bias) { + bias_needed = true; + } else { + double max_rel_bias = 0.0; + for (const auto & c : eval_candidates) { + if (c.bytes == 0) { continue; } + const double mse = std::max(c.mse, epsilon); + const double bias_term = std::max(0.0, 
c.error - c.mse); + const double rel = bias_term / mse; + max_rel_bias = std::max(rel, max_rel_bias); + } + + bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE? + } + } + + for (auto & c : eval_candidates) { + if (c.bytes == 0) { continue; } + const double final_err = bias_needed ? c.error : c.mse; + info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); } if (info.candidate.empty()) { From 12e0524f3a24d4d5c8a81546fff83fee81e0d3e1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 12 Oct 2025 15:12:15 +0100 Subject: [PATCH 120/155] Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 --- src/llama-quant.cpp | 187 +++++++++++++++++++++++--------------------- 1 file changed, 100 insertions(+), 87 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 07a88f0fd6..c607651b05 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -15,6 +15,7 @@ #include #include #include +#include // Quantization types. Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { @@ -623,7 +624,6 @@ static void signal_handler(int) { // Returns tensor type overrides to meet a global bpw target static std::unordered_map target_bpw_type( llama_model_loader & ml, - std::vector> & buffer, const llama_model & model, const std::vector & tensors, const std::map & mapped, @@ -659,6 +659,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ3_XXS, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, + GGML_TYPE_IQ4_NL, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, @@ -1127,16 +1128,22 @@ static std::unordered_map target_bpw_type( install_signal_handlers(); auto bpw_data = load_bpw_state(); - std::vector all; - all.reserve(tensors.size()); - for (const auto * tw : tensors) { + + // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 + auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, + std::vector> & thread_local_buffer, + std::mutex & loader_mutex, + std::mutex & log_mutex) -> std::optional + { ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); - if (!can_quantize(tensor)) { continue; } - check_signal_handler(all); + if (bpw_stop.load(std::memory_order_relaxed)) { + return std::nullopt; + } - // If we already have fully evaluatedd this tensor then reuse it - if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) { + // check for pre-computed results from a checkpoint file. + auto it_saved = bpw_data.find(name); + if (it_saved != bpw_data.end()) { tensor_info info; info.w = tw; info.candidate = it_saved->second.candidate; @@ -1144,17 +1151,21 @@ static std::unordered_map target_bpw_type( info.min_bpw = it_saved->second.min_bpw; info.max_bpw = it_saved->second.max_bpw; info.n_elements = it_saved->second.n_elements ? 
it_saved->second.n_elements : (size_t)ggml_nelements(tensor); - all.push_back(std::move(info)); - continue; + return info; + } + { + std::lock_guard lock(log_mutex); + LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s \t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor)); } - LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor)); if (!ml.use_mmap) { - if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); } - tensor->data = buffer.data(); + if (thread_local_buffer.size() < ggml_nbytes(tensor)) { thread_local_buffer.resize(ggml_nbytes(tensor)); } + tensor->data = thread_local_buffer.data(); + } + { + std::lock_guard lock(loader_mutex); + ml.load_data_for(tensor); } - - ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample const int64_t n_per_row = tensor->ne[0]; @@ -1170,7 +1181,7 @@ static std::unordered_map target_bpw_type( const int64_t max_rows = 4096; int64_t total_rows = std::llround(slice_budget / std::max(1, n)); total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows))); - if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors + if (rows <= min_rows * 2) { total_rows = rows; } return total_rows; }; @@ -1191,17 +1202,16 @@ static std::unordered_map target_bpw_type( return; } if (t == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); return; } if (t == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); return; } - if (src_is_quant) { GGML_ASSERT(src_traits && src_traits->to_float); - src_traits->to_float(src, dst, (int) n_per_row); + src_traits->to_float(src, dst, (int)n_per_row); return; } @@ -1266,6 +1276,7 @@ static std::unordered_map target_bpw_type( return; } + std::lock_guard lock(log_mutex); LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want); }; @@ -1276,12 +1287,9 @@ static std::unordered_map target_bpw_type( if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } - const int64_t nelem = ggml_nelements(tensor); tensor_info info; info.w = tw; - info.n_elements = nelem; - - // Prepare scratch buffers sized for the largest candidate row size + info.n_elements = ggml_nelements(tensor); size_t total_sampled_rows = f32_sample.size() / n_per_row; // Build list of candidate types first (compatible ones) @@ -1295,7 +1303,8 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str()); + std::lock_guard lock(log_mutex); + LLAMA_LOG_WARN("\t%s: skipping %s for %s, no or mismatched imatrix\n", func, ggml_type_name(ts_type), name.c_str()); continue; } @@ -1325,58 +1334,38 @@ static std::unordered_map target_bpw_type( std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantized_buffer(f32_sample.size()); const float * slice_lambda = lambdas.empty() ? 
nullptr : lambdas.data(); - int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); - std::atomic cidx{0}; - std::vector eval_workers; - eval_workers.reserve(n_eval_threads); - for (int ti = 0; ti < n_eval_threads; ++ti) { - eval_workers.emplace_back([&] { - // thread-local scratch - std::vector tl_quantized_buffer(quantized_buffer.size()); - std::vector tl_dequantized_buffer(dequantized_buffer.size()); - for (;;) { - if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived - const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel); - if (i >= compatible_candidates.size()) { break; } + for (size_t i = 0; i < compatible_candidates.size(); ++i) { + if (bpw_stop.load(std::memory_order_relaxed)) { break; } - const ggml_type tensor_types = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(tensor, tensor_types); - const size_t bytes = tensor_bytes(tensor, tensor_types); - double mse = 0.0; - double proj = 0.0; - const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, - tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); - eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj }; - } - }); + const ggml_type tensor_types = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(tensor, tensor_types); + const size_t bytes = tensor_bytes(tensor, tensor_types); + double mse = 0.0; + double proj = 0.0; + const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, + quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); + eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj }; } - for (auto &th : eval_workers) { th.join(); } - - // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry - if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) { - check_signal_handler(all); - } + if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } // Check if biasing is needed bool bias_needed = false; if (!lambdas.empty()) { int min_mse = -1; int min_bias = -1; - { - double best_mse = std::numeric_limits::infinity(); - double best_err = std::numeric_limits::infinity(); - for (int i = 0; i < (int)eval_candidates.size(); ++i) { - const auto & c = eval_candidates[i]; - if (c.bytes == 0) { continue; } - if (c.mse < best_mse) { - best_mse = c.mse; - min_mse = i; - } - if (c.error < best_err) { - best_err = c.error; - min_bias = i; - } + double best_mse = std::numeric_limits::infinity(); + double best_err = std::numeric_limits::infinity(); + for (int i = 0; i < (int)eval_candidates.size(); ++i) { + const auto & c = eval_candidates[i]; + if (c.bytes == 0) { continue; } + if (c.mse < best_mse) { + best_mse = c.mse; + min_mse = i; + } + if (c.error < best_err) { + best_err = c.error; + min_bias = i; } } @@ -1388,8 +1377,7 @@ static std::unordered_map target_bpw_type( if (c.bytes == 0) { continue; } const double mse = std::max(c.mse, epsilon); const double bias_term = std::max(0.0, c.error - c.mse); - const double rel = bias_term / mse; - max_rel_bias = std::max(rel, max_rel_bias); + max_rel_bias = std::max(bias_term / mse, max_rel_bias); } bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE? 
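For readers tracking the candidate-selection logic across these hunks, the following is a minimal standalone sketch of the Pareto filtering idea that the next hunk reworks: per-tensor (bytes, error) candidates are sorted by size and only those that strictly reduce error are kept. The names `cand` and `pareto_front` are illustrative only and do not appear in the patch itself.

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    struct cand { std::size_t bytes; double error; };

    // Keep only candidates whose error is strictly lower than every cheaper candidate,
    // so error becomes non-increasing as bytes grow.
    static std::vector<cand> pareto_front(std::vector<cand> v) {
        std::sort(v.begin(), v.end(), [](const cand & a, const cand & b) {
            if (a.bytes != b.bytes) { return a.bytes < b.bytes; }
            return a.error < b.error;
        });
        std::vector<cand> front;
        double best_err = std::numeric_limits<double>::infinity();
        for (const cand & c : v) {
            if (c.error < best_err) {
                best_err = c.error;
                front.push_back(c);
            }
        }
        return front;
    }

A monotone front like this is what lets the later budget search (the greedy upgrades or the Lagrangian sweep) treat every extra byte spent on a tensor as buying a non-negative error reduction.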
@@ -1404,7 +1392,7 @@ static std::unordered_map target_bpw_type( if (info.candidate.empty()) { // As a last resort, keep original type - float bpw = ggml_nbytes(tensor) * 8.0f / nelem; + float bpw = ggml_nbytes(tensor) * 8.0f / info.n_elements; info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } @@ -1416,26 +1404,18 @@ static std::unordered_map target_bpw_type( if (a.bytes != b.bytes) { return a.bytes < b.bytes; } return a.error < b.error; }); - const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + candidates.erase(std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { return a.bytes == b.bytes; - }); - candidates.erase(last, candidates.end()); - - // Pareto by bytes -> error + }), candidates.end()); std::vector pareto; pareto.reserve(candidates.size()); double best_err = infinity; - size_t last_b = std::numeric_limits::max(); for (const auto & c : candidates) { - if (c.bytes != last_b) { - last_b = c.bytes; - if (c.error < best_err) { - best_err = c.error; - pareto.push_back(c); - } + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); } } - candidates.swap(pareto); if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull @@ -1470,10 +1450,43 @@ static std::unordered_map target_bpw_type( info.choice = 0; info.min_bpw = info.candidate.front().bpw; info.max_bpw = info.candidate.back().bpw; - all.push_back(std::move(info)); - check_signal_handler(all); // save after each tensor + + return info; + }; + + std::vector all; // this vector will be populated by the parallel workers + { + std::atomic tensor_idx{0}; // shared work queue index for all threads + const size_t num_tensors_to_process = tensors.size(); + std::mutex loader_mutex; + std::mutex log_mutex; + std::mutex results_mutex; + std::vector workers; + int num_threads_to_spawn = std::max(1, std::min(nthread, (int)num_tensors_to_process)); + + for (int i = 0; i < num_threads_to_spawn; ++i) { + workers.emplace_back([&]() { + std::vector> thread_local_buffer; + while (true) { + const size_t current_idx = tensor_idx.fetch_add(1); + if (current_idx >= num_tensors_to_process) { break; } + const auto * tw = tensors[current_idx]; + if (!can_quantize(tw->tensor)) { continue; } + // Execute the main processing logic for this tensor + std::optional result_info = process_tensor(tw, thread_local_buffer, loader_mutex, log_mutex); + if (result_info) { + std::lock_guard lock(results_mutex); + all.push_back(std::move(*result_info)); + } + } + }); + } + + for (auto & w : workers) { w.join(); } } + check_signal_handler(all); + if (all.empty()) { return {}; } // Compute total elements across all tensors and bytes for non-quantizable tensors @@ -1965,7 +1978,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); + bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); } else { LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__); } From b6094a97bfbd831a715ca366200f8b9372a26a0d Mon Sep 17 
00:00:00 2001 From: Ed Addario Date: Sun, 12 Oct 2025 16:30:35 +0100 Subject: [PATCH 121/155] Add quant types --- src/llama-quant.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c607651b05..56e63f9bb7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -655,8 +655,11 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, + GGML_TYPE_IQ2_S, GGML_TYPE_Q2_K, GGML_TYPE_IQ3_XXS, + GGML_TYPE_IQ3_S, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, @@ -1155,7 +1158,7 @@ static std::unordered_map target_bpw_type( } { std::lock_guard lock(log_mutex); - LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s \t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", func, name.c_str(), ggml_nelements(tensor)); } if (!ml.use_mmap) { @@ -1457,19 +1460,19 @@ static std::unordered_map target_bpw_type( std::vector all; // this vector will be populated by the parallel workers { std::atomic tensor_idx{0}; // shared work queue index for all threads - const size_t num_tensors_to_process = tensors.size(); + const size_t tensors_to_process = tensors.size(); std::mutex loader_mutex; std::mutex log_mutex; std::mutex results_mutex; std::vector workers; - int num_threads_to_spawn = std::max(1, std::min(nthread, (int)num_tensors_to_process)); + int threads_to_spawn = std::max(1, std::min(nthread, (int)tensors_to_process)); - for (int i = 0; i < num_threads_to_spawn; ++i) { + for (int i = 0; i < threads_to_spawn; ++i) { workers.emplace_back([&]() { std::vector> thread_local_buffer; while (true) { const size_t current_idx = tensor_idx.fetch_add(1); - if (current_idx >= num_tensors_to_process) { break; } + if (current_idx >= tensors_to_process) { break; } const auto * tw = tensors[current_idx]; if (!can_quantize(tw->tensor)) { continue; } // Execute the main processing logic for this tensor From ca282302b5cde95945f8337e6df264d92e878501 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 12 Oct 2025 18:23:23 +0100 Subject: [PATCH 122/155] Add --keep-bpw-state option --- include/llama.h | 1 + src/llama-quant.cpp | 16 +++++----------- tools/quantize/quantize.cpp | 5 ++++- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/llama.h b/include/llama.h index 14e12d7c51..f745e2110b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -366,6 +366,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) + bool keep_bpw_state; // keep bpw state file } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 56e63f9bb7..4b243f1f55 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -659,7 +659,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ2_S, GGML_TYPE_Q2_K, GGML_TYPE_IQ3_XXS, - GGML_TYPE_IQ3_S, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, @@ -773,11 +772,9 @@ static std::unordered_map target_bpw_type( auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); - if (!ofs) { return; } // best-effort - const float target_bpw = params->target_bpw; + if (!ofs) { return; } ofs.write((const char *)&file_magic, 
sizeof(file_magic)); ofs.write((const char *)&model_id, sizeof(model_id)); - ofs.write((const char *)&target_bpw, sizeof(target_bpw)); const uint64_t n = all_vec.size(); ofs.write((const char *)&n, sizeof(n)); for (const auto & ti : all_vec) { @@ -817,19 +814,14 @@ static std::unordered_map target_bpw_type( uint32_t magic = 0; uint64_t id = 0; - float bpw = 0.0f; ifs.read((char *)&magic, sizeof(magic)); ifs.read((char *)&id, sizeof(id)); - ifs.read((char *)&bpw, sizeof(bpw)); if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; } else if (id != model_id) { LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else if (bpw != params->target_bpw) { - LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str()); - return out; } else { LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); } @@ -874,7 +866,7 @@ static std::unordered_map target_bpw_type( auto delete_bpw_state = [&] { std::ifstream ifs(checkpoint_file); - if (ifs.good()) { + if (ifs.good() && !params->keep_bpw_state) { LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); std::remove(checkpoint_file.c_str()); } @@ -1489,6 +1481,7 @@ static std::unordered_map target_bpw_type( } check_signal_handler(all); + if (params->keep_bpw_state) { save_bpw_state(all); } if (all.empty()) { return {}; } @@ -2240,7 +2233,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, - /*.target_bpw =*/ -1.0f + /*.target_bpw =*/ -1.0f, + /*.keep_bpw_state =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index c254c3f6b2..ad2563a48d 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); - printf(" [--target-bpw n] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--target-bpw n] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); @@ -134,6 +134,7 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --keep-bpw-state: preserve the bpw computations in a state file\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -557,6 +558,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { + params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From b1b58e67df30453edd64706abda76d3c42f0bb03 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 14:54:32 +0100 Subject: [PATCH 123/155] Refactor signal handlers --- src/llama-quant.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4b243f1f55..d1fa429553 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -632,6 +632,22 @@ static std::unordered_map target_bpw_type( const llama_model_quantize_params * params, int nthread ) { + // RAII guard for signal handlers + bpw_stop.store(false, std::memory_order_relaxed); + struct signal_scope_guard { + using handler_t = void (*)(int); + handler_t prev_int = SIG_DFL; + handler_t prev_term = SIG_DFL; + signal_scope_guard() { + prev_int = std::signal(SIGINT, signal_handler); + prev_term = std::signal(SIGTERM, signal_handler); + } + ~signal_scope_guard() { + std::signal(SIGINT, prev_int); + std::signal(SIGTERM, prev_term); + } + } _signal_guard; + struct candidate_types { ggml_type type; float bpw; @@ -724,22 +740,6 @@ static std::unordered_map target_bpw_type( return is_quantizable(ggml_get_name(t), model.arch, params); }; - auto install_signal_handlers = [] { - static std::once_flag once; - std::call_once(once, [] { - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - }); - }; - - auto uninstall_signal_handlers = [] { - static std::once_flag once; - std::call_once(once, [] { - std::signal(SIGINT, SIG_DFL); - std::signal(SIGTERM, SIG_DFL); - }); - }; - // Saved state per tensor struct saved_info { std::vector candidate; @@ -1121,7 +1121,6 @@ static std::unordered_map target_bpw_type( return lambdas; }; - install_signal_handlers(); auto bpw_data = load_bpw_state(); // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 @@ -1700,7 +1699,6 @@ static std::unordered_map target_bpw_type( } delete_bpw_state(); // we're done, clear any checkpoint - uninstall_signal_handlers(); return emit_overrides(); } From cd734b89ce3b2af611fd168975a5921f33b475eb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 15:15:23 +0100 Subject: [PATCH 124/155] Update quant types --- src/llama-quant.cpp | 3 ++- tools/quantize/quantize.cpp | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d1fa429553..7543ec6961 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -681,7 +681,8 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q8_0, + 
GGML_TYPE_F16 }; const char * important_tensors[] = { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index ad2563a48d..e67649beb9 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -501,6 +501,8 @@ static const char * get_ftype(const float bpw) { {1.5625, "IQ1_S"}, {1.7500, "IQ1_M"}, {2.0625, "IQ2_XXS"}, + {2.3125, "IQ2_XS"}, + {2.5625, "IQ2_S"}, {2.6250, "Q2_K"}, {3.0625, "IQ3_XXS"}, {3.4375, "Q3_K"}, From b7911f14314387e4101957d4eb4df9650660c877 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 17:46:45 +0100 Subject: [PATCH 125/155] Minor refactoring --- src/llama-quant.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7543ec6961..0f256eface 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1122,9 +1122,9 @@ static std::unordered_map target_bpw_type( return lambdas; }; - auto bpw_data = load_bpw_state(); + const auto bpw_data = load_bpw_state(); - // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 + // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, @@ -1330,7 +1330,7 @@ static std::unordered_map target_bpw_type( std::vector dequantized_buffer(f32_sample.size()); const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data(); for (size_t i = 0; i < compatible_candidates.size(); ++i) { - if (bpw_stop.load(std::memory_order_relaxed)) { break; } + if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } const ggml_type tensor_types = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(tensor, tensor_types); @@ -1383,6 +1383,8 @@ static std::unordered_map target_bpw_type( if (c.bytes == 0) { continue; } const double final_err = bias_needed ? 
c.error : c.mse; info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); + // LLAMA_LOG_INFO("\t%s: %35s \t%10s \t%1.4f bpw \t%10zu bytes \t mse: %1.8e \t err: %1.8e\n", + // func, name.c_str(), ggml_type_name(c.type), c.bpw, c.bytes, c.mse, final_err); } if (info.candidate.empty()) { @@ -1426,7 +1428,7 @@ static std::unordered_map target_bpw_type( }; while (hull.size() >= 2) { - if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= -1 * epsilon) { // very small negative tolerance + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { hull.pop_back(); } else { break; @@ -1670,7 +1672,6 @@ static std::unordered_map target_bpw_type( const auto & ti = all[i]; const std::string tensor_name = ggml_get_name(ti.w->tensor); int j = ti.choice + 1; - while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } // no upgrade available size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; From a6853ea2ae7d828e535874e6f2244786921df594 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 11:20:24 +0100 Subject: [PATCH 126/155] Add tensor type and depth heuristics --- src/llama-quant.cpp | 94 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0f256eface..38d20e3d0f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -16,6 +16,7 @@ #include #include #include +#include // Quantization types. Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { @@ -685,13 +686,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_F16 }; - const char * important_tensors[] = { - ".output.weight", - ".attn_output.weight", - ".ffn_down.weight", - ".ffn_down_shexp.weight" - }; - constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 @@ -1544,11 +1538,89 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto is_important = [&](const std::string & tensor_name) -> bool { - return std::any_of(std::begin(important_tensors), std::end(important_tensors), [&](const char* imp) { - return tensor_name.find(imp) != std::string::npos; + auto tensor_importance = [&](const std::vector & all_vec) -> std::unordered_map { + std::unordered_map scores; + for (const auto & ti : all_vec) { + const std::string name = ggml_get_name(ti.w->tensor); + float total_score = 0.0f; + float depth_score = 0.0f; + float type_score = 0.0f; + + // Depth component: output, embeddings & early/late layers are important + if (name.find("output.weight") != std::string::npos || + name.find("token_embd.weight") != std::string::npos) { + depth_score = 1.0f; } - ); + else if (name.find(".attn_output.weight") != std::string::npos) { + depth_score = 0.9f; + } else { + static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); + std::smatch match; + if (std::regex_search(name, match, layer_pattern)) { + const int layer = std::stoi(match[1]); + const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); + const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; + depth_score = 0.2f + 0.6f * center_dist; + } + } + + // Type component: certain tensor types are more important + if (name.find("output.weight") != std::string::npos) { + type_score = 1.0f; + } else 
if (name.find(".attn_output.weight") != std::string::npos) { + type_score = 0.9f; + } else if (name.find(".ffn_down.weight") != std::string::npos || + name.find(".ffn_down_shexp.weight") != std::string::npos || + name.find(".ffn_down_exps.weight") != std::string::npos) { + type_score = 0.8f; + } else if (name.find(".attn_q.weight") != std::string::npos || + name.find(".attn_k.weight") != std::string::npos || + name.find(".attn_v.weight") != std::string::npos || + name.find(".attn_qkv.weight") != std::string::npos) { + type_score = 0.7f; + } else if (name.find(".ffn_up.weight") != std::string::npos || + name.find(".ffn_gate.weight") != std::string::npos || + name.find(".ffn_up_shexp.weight") != std::string::npos || + name.find(".ffn_gate_shexp.weight") != std::string::npos || + name.find(".ffn_up_exps.weight") != std::string::npos || + name.find(".ffn_gate_exps.weight") != std::string::npos) { + type_score = 0.6f; + } else if (name.find("token_embd.weight") != std::string::npos) { + type_score = 0.5f; + } + + // Weighted combination + total_score = 0.80f * type_score + 0.20f * depth_score; // 80% type + 20% depth + scores[name] = total_score; + } + + return scores; + }; + + auto select_tensors = [&](const std::vector & all_vec) -> std::unordered_set { + const auto scores = tensor_importance(all_vec); + + // Sort by score + std::vector> sorted_scores(scores.begin(), scores.end()); + std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); + + // Select top percentile + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // top 25% + + std::unordered_set important; + for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { + important.insert(sorted_scores[i].first); + //LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); + } + + LLAMA_LOG_INFO("%s: prioritizing %zu out off %zu tensors\n", func, important.size(), sorted_scores.size()); + return important; + }; + + const auto important_set = select_tensors(all); + + auto is_important = [&](const std::string & tensor_name) -> bool { + return important_set.count(tensor_name) > 0; }; // Lagrangian relaxation to minimise error subject to a bpw target constraint From 0b3e930d5204d3c4be96179835f5378811814247 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 11:41:26 +0100 Subject: [PATCH 127/155] Add option to override bpw state file name --- include/llama.h | 1 + src/llama-quant.cpp | 21 +++++++++++++++++++-- tools/quantize/quantize.cpp | 15 +++++++++++---- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/include/llama.h b/include/llama.h index f745e2110b..ce04011e19 100644 --- a/include/llama.h +++ b/include/llama.h @@ -367,6 +367,7 @@ extern "C" { void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file + void * bpw_state; // pointer to bpw state file } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 38d20e3d0f..1dee52d58d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -762,7 +762,23 @@ static std::unordered_map target_bpw_type( char hex[17]; const uint64_t model_id = metadata_id(ml.meta.get()); std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id); - const std::string checkpoint_file = ml.arch_name + "-" + 
std::string(hex) + ".bpw_state"; + std::string checkpoint_file = ml.arch_name + "-" + std::string(hex) + ".bpw_state"; + if (params->keep_bpw_state && params->bpw_state) { + const auto * filename = static_cast(params->bpw_state); + std::ifstream ifs(filename, std::ios::binary); + if (ifs.good()) { + checkpoint_file = std::string(filename); + } else { + std::ofstream ofs(filename, std::ios::binary | std::ios::app); + if (ofs.is_open()) { + checkpoint_file = std::string(filename); + ofs.close(); + std::remove(checkpoint_file.c_str()); + } else { + LLAMA_LOG_WARN("%s: %s is not a valid file name. Using %s instead\n", func, filename, checkpoint_file.c_str()); + } + } + } auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; @@ -2306,7 +2322,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, - /*.keep_bpw_state =*/ false + /*.keep_bpw_state =*/ false, + /*.bpw_state =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index e67649beb9..945acbe288 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,8 +117,8 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); - printf(" [--target-bpw n] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); + printf(" [--bpw-state filename] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); @@ -128,13 +128,14 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n"); + printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. Example: --tensor-type attn_q=q8_0\n"); printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --keep-bpw-state: preserve the bpw computations in a state file\n"); + printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); + printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -562,6 +563,12 @@ int main(int argc, char ** argv) { } } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; + } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { + if (arg_idx < argc-1) { + params.bpw_state = argv[++arg_idx]; + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From a5103933bb4eec23b71bd8ccaae3b80710a1a82a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 15:11:48 +0100 Subject: [PATCH 128/155] Minor refactoring --- src/llama-quant.cpp | 51 +++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1dee52d58d..b8391a4f2c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -647,7 +647,7 @@ static std::unordered_map target_bpw_type( std::signal(SIGINT, prev_int); std::signal(SIGTERM, prev_term); } - } _signal_guard; + } signal_guard; struct candidate_types { ggml_type type; @@ -683,7 +683,11 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, +#ifdef GGML_USE_METAL GGML_TYPE_F16 +#else + GGML_TYPE_BF16 +#endif }; constexpr double epsilon = 1e-12; @@ -1004,17 +1008,30 @@ static std::unordered_map target_bpw_type( // Dequantize into dequantized_buffer { - const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (!traits || !traits->to_float) { - if (out_mse) { *out_mse = infinity; } - if (out_proj) { *out_proj = 0.0; } - return infinity; - } - - for (size_t r = 0; r < sample_rows; ++r) { - const uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - traits->to_float(src, dst, (int)n_per_row); + if (quant_type == GGML_TYPE_F16) { + for (size_t r = 0; r < sample_rows; ++r) { + auto src = (const ggml_fp16_t *)(quantized_buffer.data() + r * row_sz); + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + ggml_fp16_to_fp32_row(src, dst, (int)n_per_row); + } + } else if (quant_type == GGML_TYPE_BF16) { + for (size_t r = 0; r < sample_rows; ++r) { + auto src = (const ggml_bf16_t *)(quantized_buffer.data() + r * row_sz); + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + ggml_bf16_to_fp32_row(src, dst, (int)n_per_row); + } + } else { + const ggml_type_traits * traits = ggml_get_type_traits(quant_type); + if (!traits || !traits->to_float) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + return infinity; + } + for (size_t r = 0; r < sample_rows; ++r) { + const uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + traits->to_float(src, dst, (int)n_per_row); + } } } @@ -1500,13 +1517,11 @@ static 
std::unordered_map target_bpw_type( // Compute total elements across all tensors and bytes for non-quantizable tensors size_t nq_elements = 0; size_t nq_bytes = 0; - for (const auto & it : ml.weights_map) { - const ggml_tensor * tensor = it.second.tensor; - const std::string name = it.first; + for (const auto * it : tensors) { + const ggml_tensor * tensor = it->tensor; + const std::string name = ggml_get_name(tensor); nq_elements += (size_t)ggml_nelements(tensor); - if (!is_quantizable(name, model.arch, params)) { - nq_bytes += ggml_nbytes(tensor); - } + if (!can_quantize(tensor)) { nq_bytes += ggml_nbytes(tensor); } } auto total_bytes = [&]() -> size_t { From fa1df81d49a0512cb4dc6b9b2afc10e7af86bcf2 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 20:52:23 +0100 Subject: [PATCH 129/155] Finetune heuristics --- src/llama-quant.cpp | 51 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 91b127789c..5e3893151c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1577,13 +1577,9 @@ static std::unordered_map target_bpw_type( float depth_score = 0.0f; float type_score = 0.0f; - // Depth component: output, embeddings & early/late layers are important - if (name.find("output.weight") != std::string::npos || - name.find("token_embd.weight") != std::string::npos) { + // Depth component: output & early/late layers are important + if (name == "output.weight") { depth_score = 1.0f; - } - else if (name.find(".attn_output.weight") != std::string::npos) { - depth_score = 0.9f; } else { static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); std::smatch match; @@ -1591,38 +1587,40 @@ static std::unordered_map target_bpw_type( const int layer = std::stoi(match[1]); const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - depth_score = 0.2f + 0.6f * center_dist; + depth_score = 0.9f * center_dist; } } - // Type component: certain tensor types are more important - if (name.find("output.weight") != std::string::npos) { + // Type component: certain tensor types have more impact on model quality + if (name == "output.weight") { type_score = 1.0f; - } else if (name.find(".attn_output.weight") != std::string::npos) { - type_score = 0.9f; } else if (name.find(".ffn_down.weight") != std::string::npos || - name.find(".ffn_down_shexp.weight") != std::string::npos || name.find(".ffn_down_exps.weight") != std::string::npos) { + type_score = 0.9f; + } else if (name.find(".attn_output.weight") != std::string::npos || + name.find(".time_mix_output.weight") != std::string::npos || + name.find(".attn_o.weight") != std::string::npos) { type_score = 0.8f; + } else if (name.find(".ffn_up.weight") != std::string::npos || + name.find(".ffn_gate.weight") != std::string::npos || + name.find(".ffn_up_exps.weight") != std::string::npos || + name.find(".ffn_gate_exps.weight") != std::string::npos) { + type_score = 0.3f; } else if (name.find(".attn_q.weight") != std::string::npos || name.find(".attn_k.weight") != std::string::npos || name.find(".attn_v.weight") != std::string::npos || name.find(".attn_qkv.weight") != std::string::npos) { - type_score = 0.7f; - } else if (name.find(".ffn_up.weight") != std::string::npos || - name.find(".ffn_gate.weight") != std::string::npos || - name.find(".ffn_up_shexp.weight") != std::string::npos || - name.find(".ffn_gate_shexp.weight") != std::string::npos 
|| - name.find(".ffn_up_exps.weight") != std::string::npos || - name.find(".ffn_gate_exps.weight") != std::string::npos) { - type_score = 0.6f; + type_score = 0.2f; } else if (name.find("token_embd.weight") != std::string::npos) { - type_score = 0.5f; + type_score = 0.1f; } // Weighted combination - total_score = 0.80f * type_score + 0.20f * depth_score; // 80% type + 20% depth - scores[name] = total_score; + total_score = 0.8f * type_score + 0.2f * depth_score; // 80% type + 20% depth + if (total_score != 0.0f) { + scores[name] = total_score; + LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); + } } return scores; @@ -1636,15 +1634,16 @@ static std::unordered_map target_bpw_type( std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // top 25% + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // bump top 25% std::unordered_set important; for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { important.insert(sorted_scores[i].first); - //LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); + LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); } - LLAMA_LOG_INFO("%s: prioritizing %zu out off %zu tensors\n", func, important.size(), sorted_scores.size()); + const auto pct = 100.0 * (double)important.size() / (double)sorted_scores.size(); + LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct); return important; }; From 00ddf039b306882a8a15761624bcdd673f666f71 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 21:38:49 +0100 Subject: [PATCH 130/155] Update usage --- tools/quantize/quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 945acbe288..f994999e59 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); - printf(" [--bpw-state filename] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); From 543b5a99db2b74e2b74cb87a222a25586479bd9b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 21:57:03 +0100 Subject: [PATCH 131/155] Fix lambda capture --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5e3893151c..e6c9bfa7f0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1421,7 +1421,7 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - auto pareto_convex = [](std::vector & candidates) { + auto pareto_convex = [epsilon](std::vector & candidates) { if (candidates.empty()) { return; } std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { From 27bf25e93c9309b96a151c1d8c4eef8fdad0cb21 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 22:04:35 +0100 Subject: [PATCH 132/155] Fix lambda capture --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e6c9bfa7f0..08f1b30293 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -690,7 +690,7 @@ static std::unordered_map target_bpw_type( #endif }; - constexpr double epsilon = 1e-12; + const double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; @@ -1118,7 +1118,7 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { const int64_t ns = std::max(1, ne2); std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } @@ -1421,7 +1421,7 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - auto pareto_convex = [epsilon](std::vector & candidates) { + auto pareto_convex = [&](std::vector & candidates) { if (candidates.empty()) { return; } std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { From 04561d5782b930e781627eee5ffcbb6b06e8b558 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 21 Oct 2025 12:53:26 +0100 Subject: [PATCH 133/155] Update epsilon specifier --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 08f1b30293..5280b9a02a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -690,7 +690,7 @@ static std::unordered_map target_bpw_type( #endif }; - const double epsilon = 1e-12; + constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; From d6ccd5649ac6db0ad87156cf92f036737cf82be3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 25 Oct 2025 12:09:20 +0100 Subject: [PATCH 134/155] Finetune heuristics --- src/llama-quant.cpp | 83 ++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5280b9a02a..617c7d9473 
100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -838,7 +838,7 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; } else { - LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); + LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); } uint64_t n = 0; @@ -1569,54 +1569,59 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto tensor_importance = [&](const std::vector & all_vec) -> std::unordered_map { + auto tensor_depth = [&](const std::string & name) -> float { + static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); + std::smatch match; + + // Depth component: output, embeddings & early/late layers are important + if (name == "output.weight" || name == "token_embd.weight") { + return 1.0f; + } + if (std::regex_search(name, match, layer_pattern)) { + const int layer = std::stoi(match[1]); + const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); + const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; + return 0.01f + 0.9f * center_dist; + } + + return 0.0f; + }; + + auto tensor_importance = [&](const std::vector & all_tensors) -> std::unordered_map { std::unordered_map scores; - for (const auto & ti : all_vec) { - const std::string name = ggml_get_name(ti.w->tensor); + for (const auto & t : all_tensors) { + const std::string name = ggml_get_name(t.w->tensor); float total_score = 0.0f; float depth_score = 0.0f; float type_score = 0.0f; - // Depth component: output & early/late layers are important - if (name == "output.weight") { - depth_score = 1.0f; - } else { - static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); - std::smatch match; - if (std::regex_search(name, match, layer_pattern)) { - const int layer = std::stoi(match[1]); - const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); - const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - depth_score = 0.9f * center_dist; - } - } - // Type component: certain tensor types have more impact on model quality + const std::vector>> tensor_scores = { + {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}}, + {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}}, + {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}}, + {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}}, + {0.2f, {"token_embd.weight"}} + }; if (name == "output.weight") { type_score = 1.0f; - } else if (name.find(".ffn_down.weight") != std::string::npos || - name.find(".ffn_down_exps.weight") != std::string::npos) { - type_score = 0.9f; - } else if (name.find(".attn_output.weight") != std::string::npos || - name.find(".time_mix_output.weight") != std::string::npos || - name.find(".attn_o.weight") != std::string::npos) { - type_score = 0.8f; - } else if (name.find(".ffn_up.weight") != std::string::npos || - name.find(".ffn_gate.weight") != std::string::npos || - name.find(".ffn_up_exps.weight") != std::string::npos || - name.find(".ffn_gate_exps.weight") != std::string::npos) { - type_score = 0.3f; - } else if (name.find(".attn_q.weight") != std::string::npos || - name.find(".attn_k.weight") != std::string::npos || - name.find(".attn_v.weight") != std::string::npos || - name.find(".attn_qkv.weight") != std::string::npos) { - type_score = 0.2f; - } else if (name.find("token_embd.weight") != 
std::string::npos) { - type_score = 0.1f; + } else { + for (const auto& ts : tensor_scores) { + const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) { + return name.find(pattern) != std::string::npos; + }); + if (found) { + type_score = ts.first; + break; + } + } + } + if (type_score > 0.0f) { + depth_score = tensor_depth(name); } // Weighted combination - total_score = 0.8f * type_score + 0.2f * depth_score; // 80% type + 20% depth + total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth if (total_score != 0.0f) { scores[name] = total_score; LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); @@ -1634,7 +1639,7 @@ static std::unordered_map target_bpw_type( std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // bump top 25% + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front std::unordered_set important; for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { From 5303212324c90745eb82c3e5f5abb32b184cb7fa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 26 Oct 2025 17:40:52 +0000 Subject: [PATCH 135/155] Simplify tensor selection --- src/llama-quant.cpp | 99 +++++---------------------------------------- 1 file changed, 11 insertions(+), 88 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 617c7d9473..04f4ff341a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -11,11 +11,12 @@ #include #include #include +#include +#include #include #include #include #include -#include #include // Quantization types. 
Changes to this struct must be replicated in quantize.cpp @@ -1151,7 +1152,7 @@ static std::unordered_map target_bpw_type( const auto bpw_data = load_bpw_state(); - // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 + // Parallelize tensor processing - courtesy of https://github.com/ddh0 auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, @@ -1569,93 +1570,15 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto tensor_depth = [&](const std::string & name) -> float { - static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); - std::smatch match; - - // Depth component: output, embeddings & early/late layers are important - if (name == "output.weight" || name == "token_embd.weight") { - return 1.0f; - } - if (std::regex_search(name, match, layer_pattern)) { - const int layer = std::stoi(match[1]); - const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); - const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - return 0.01f + 0.9f * center_dist; - } - - return 0.0f; - }; - - auto tensor_importance = [&](const std::vector & all_tensors) -> std::unordered_map { - std::unordered_map scores; - for (const auto & t : all_tensors) { - const std::string name = ggml_get_name(t.w->tensor); - float total_score = 0.0f; - float depth_score = 0.0f; - float type_score = 0.0f; - - // Type component: certain tensor types have more impact on model quality - const std::vector>> tensor_scores = { - {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}}, - {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}}, - {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}}, - {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}}, - {0.2f, {"token_embd.weight"}} - }; - if (name == "output.weight") { - type_score = 1.0f; - } else { - for (const auto& ts : tensor_scores) { - const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) { - return name.find(pattern) != std::string::npos; - }); - if (found) { - type_score = ts.first; - break; - } - } - } - if (type_score > 0.0f) { - depth_score = tensor_depth(name); - } - - // Weighted combination - total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth - if (total_score != 0.0f) { - scores[name] = total_score; - LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); - } - } - - return scores; - }; - - auto select_tensors = [&](const std::vector & all_vec) -> std::unordered_set { - const auto scores = tensor_importance(all_vec); - - // Sort by score - std::vector> sorted_scores(scores.begin(), scores.end()); - std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); - - // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front - - std::unordered_set important; - for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { - important.insert(sorted_scores[i].first); - LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); - } - - const auto pct = 100.0 * (double)important.size() 
/ (double)sorted_scores.size(); - LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct); - return important; - }; - - const auto important_set = select_tensors(all); - + // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - return important_set.count(tensor_name) > 0; + const auto important = tensor_name == "output.weight" || + tensor_name.find(".ffn_down.weight") != std::string::npos || + tensor_name.find(".ffn_down_exps.weight") != std::string::npos || + tensor_name.find(".attn_output.weight") != std::string::npos || + tensor_name.find(".time_mix_output.weight") != std::string::npos || + tensor_name.find(".attn_o.weight") != std::string::npos; + return important; }; // Lagrangian relaxation to minimise error subject to a bpw target constraint From f8863b9a80822bb58e7406fd35d4452a97c4639a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 28 Oct 2025 15:22:32 +0000 Subject: [PATCH 136/155] Minor refactoring --- src/llama-quant.cpp | 48 ++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 04f4ff341a..fdce1f4285 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -694,6 +694,7 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 + constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d; const char * func = __func__; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { @@ -731,7 +732,7 @@ static std::unordered_map target_bpw_type( auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { if (is_compatible(t, typ)) { return typ; } - ggml_type fb = fallback_type(typ); + const ggml_type fb = fallback_type(typ); return is_compatible(t, fb) ? fb : GGML_TYPE_F16; }; @@ -754,7 +755,7 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < n; ++i) { h = (h << 5) + h + data[i]; } - return h ? h : 0xeabada55cafed00d; + return h ? 
h : arbitrary_magic; }; auto metadata_id = [&](const gguf_context * ctx) -> uint64_t { @@ -795,7 +796,7 @@ static std::unordered_map target_bpw_type( ofs.write((const char *)&n, sizeof(n)); for (const auto & ti : all_vec) { const std::string name = ggml_get_name(ti.w->tensor); - const uint32_t len = (uint32_t)name.size(); + const auto len = (uint32_t)name.size(); ofs.write((const char *)&len, sizeof(len)); ofs.write(name.data(), len); @@ -835,13 +836,14 @@ static std::unordered_map target_bpw_type( if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else if (id != model_id) { + } + if (id != model_id) { LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else { - LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); } + LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); + uint64_t n = 0; ifs.read((char *)&n, sizeof(n)); for (uint64_t i = 0; i < n; ++i) { @@ -862,15 +864,15 @@ static std::unordered_map target_bpw_type( si.n_elements = (size_t)ne; si.candidate.resize(cn); - for (size_t j = 0; j < si.candidate.size(); ++j) { + for (auto & s : si.candidate) { int32_t t = 0; uint64_t b = 0; ifs.read((char *)&t, sizeof(t)); - si.candidate[j].type = (ggml_type)t; - ifs.read((char *)&si.candidate[j].bpw, sizeof(si.candidate[j].bpw)); + s.type = (ggml_type)t; + ifs.read((char *)&s.bpw, sizeof(s.bpw)); ifs.read((char *)&b, sizeof(b)); - si.candidate[j].bytes = (size_t)b; - ifs.read((char *)&si.candidate[j].error, sizeof(si.candidate[j].error)); + s.bytes = (size_t)b; + ifs.read((char *)&s.error, sizeof(s.error)); } out.emplace(std::move(name), std::move(si)); @@ -886,7 +888,6 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); std::remove(checkpoint_file.c_str()); } - }; auto check_signal_handler = [&](const std::vector & all_vec) { @@ -1198,10 +1199,10 @@ static std::unordered_map target_bpw_type( // Compute rows based on tensor shape and slice count auto sample_rows = [](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) -> int64_t { const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024; - const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large nrt + const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large tensors const double slice_budget = tensor_budget * scale_rows / std::max(1, n2); const int64_t min_rows = has_acts ? 
128 : 64; - const int64_t max_rows = 4096; + constexpr int64_t max_rows = 4096; // row limit to avoid excessive memory use int64_t total_rows = std::llround(slice_budget / std::max(1, n)); total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows))); if (rows <= min_rows * 2) { total_rows = rows; } @@ -1246,7 +1247,7 @@ static std::unordered_map target_bpw_type( f32_sample.clear(); std::vector row_buffer(n_per_row); for (int64_t slice = 0; slice < ne2; ++slice) { - std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); + std::mt19937 rng(std::hash{}(name) ^ arbitrary_magic ^ slice); const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); const int64_t stride = std::max(1, nrows_total / rows_sample_max); int64_t offset = 0; @@ -1411,8 +1412,6 @@ static std::unordered_map target_bpw_type( if (c.bytes == 0) { continue; } const double final_err = bias_needed ? c.error : c.mse; info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); - // LLAMA_LOG_INFO("\t%s: %35s \t%10s \t%1.4f bpw \t%10zu bytes \t mse: %1.8e \t err: %1.8e\n", - // func, name.c_str(), ggml_type_name(c.type), c.bpw, c.bytes, c.mse, final_err); } if (info.candidate.empty()) { @@ -1445,16 +1444,15 @@ static std::unordered_map target_bpw_type( if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull // Convex hull (lower envelope) + auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { + const double dx1 = (double)h1.bytes - (double)h0.bytes; + const double dy1 = h1.error - h0.error; + const double dx2 = (double)p.bytes - (double)h0.bytes; + const double dy2 = p.error - h0.error; + return dx1 * dy2 - dx2 * dy1; + }; std::vector hull; hull.reserve(candidates.size()); for (const auto & c : candidates) { - auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { - const double dx1 = (double)h1.bytes - (double)h0.bytes; - const double dy1 = h1.error - h0.error; - const double dx2 = (double)p.bytes - (double)h0.bytes; - const double dy2 = p.error - h0.error; - return dx1 * dy2 - dx2 * dy1; - }; - while (hull.size() >= 2) { if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { hull.pop_back(); From 6e32244a06b1ffe513b1694ee647e92c09904dac Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 30 Oct 2025 21:53:07 +0000 Subject: [PATCH 137/155] Read statistics from imatrix --- include/llama.h | 1 + src/llama-quant.cpp | 28 ++++++++++----- tools/quantize/quantize.cpp | 68 +++++++++++++++++++++++++++++-------- 3 files changed, 75 insertions(+), 22 deletions(-) diff --git a/include/llama.h b/include/llama.h index ce04011e19..517ef5e0fb 100644 --- a/include/llama.h +++ b/include/llama.h @@ -368,6 +368,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file + void * statistics; // pointer to statistics data } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fdce1f4285..a8153494f9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -631,6 +631,7 @@ static std::unordered_map target_bpw_type( const std::map & mapped, const std::unordered_map> * values_data, const std::unordered_map> * activations_data, + const std::unordered_map> * statistics_data, const llama_model_quantize_params * 
params, int nthread ) { @@ -1815,6 +1816,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } const std::unordered_map> * values_data = nullptr; const std::unordered_map> * activations_data = nullptr; + const std::unordered_map> * statistics_data = nullptr; if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { @@ -1845,6 +1847,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } + if (params->statistics) { + statistics_data = static_cast>*>(params->statistics); + if (statistics_data) { + LLAMA_LOG_INFO(" and %d statistics",int(statistics_data->size())); + } + } LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -1999,15 +2007,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - if (params->activations) { - LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n",__func__); - } else { - LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); - } + const char* base_msg = params->activations + ? (params->statistics + ? "imatrix with activations and statistics provided, process will be more accurate\n" + : "imatrix with activations provided, process will be accurate\n") + : "imatrix without activations provided, process will be less accurate\n"; + if (params->activations) { LLAMA_LOG_INFO("%s: %s", __func__, base_msg); } + else { LLAMA_LOG_WARN("%s: %s", __func__, base_msg); } + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); + bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, statistics_data, params, nthread); } else { - LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__); + LLAMA_LOG_WARN("%s: --target-bpw requires an imatrix but none was provided, option will be ignored\n", __func__); } } @@ -2269,7 +2280,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, - /*.bpw_state =*/ nullptr + /*.bpw_state =*/ nullptr, + /*.statistics =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index f994999e59..0b2b05b60a 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -221,7 +221,8 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & values_data, - std::unordered_map> & activations_data) { + std::unordered_map> & activations_data, + std::unordered_map> & statistics_data) { struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -256,24 +257,28 @@ static int load_imatrix(const std::string & imatrix_file, const std::string sums_suffix{ ".in_sum" }; const std::string sums2_suffix{ ".in_sum2" }; const std::string counts_suffix{ ".counts" }; + const std::string stats_suffix{ ".stats" }; // Using an ordered map to get a deterministic iteration order. 
- std::map> sums_counts_for; + std::map> sums_counts_for; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { std::string name = cur->name; if (name.empty()) { continue; } - if (string_remove_suffix(name, sums2_suffix)) { - // in_sum2 + if (string_remove_suffix(name, sums_suffix)) { + // in_sum std::get<0>(sums_counts_for[std::move(name)]) = cur; + } else if (string_remove_suffix(name, sums2_suffix)) { + // in_sum2 + std::get<1>(sums_counts_for[std::move(name)]) = cur; } else if (string_remove_suffix(name, counts_suffix)) { // counts - std::get<1>(sums_counts_for[std::move(name)]) = cur; - } else if (string_remove_suffix(name, sums_suffix)) { - // in_sum std::get<2>(sums_counts_for[std::move(name)]) = cur; + } else if (string_remove_suffix(name, stats_suffix)) { + // stats + std::get<3>(sums_counts_for[std::move(name)]) = cur; } else { // ignore other tensors @@ -282,11 +287,12 @@ static int load_imatrix(const std::string & imatrix_file, for (const auto & sc : sums_counts_for) { const std::string & name = sc.first; - const struct ggml_tensor * sums = std::get<2>(sc.second); - const struct ggml_tensor * sums2 = std::get<0>(sc.second); - const struct ggml_tensor * counts = std::get<1>(sc.second); + const struct ggml_tensor * sums = std::get<0>(sc.second); + const struct ggml_tensor * sums2 = std::get<1>(sc.second); + const struct ggml_tensor * counts = std::get<2>(sc.second); + const struct ggml_tensor * stats = std::get<3>(sc.second); - // check that sums, sums2 and counts have the same shape + // check sums2 and counts are present, and that sums and sums2 have the same shape if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) { fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str()); gguf_free(ctx_gguf); @@ -302,6 +308,19 @@ static int load_imatrix(const std::string & imatrix_file, if (sums) { activations.resize(ggml_nelements(sums)); } + if (stats) { + auto & statistics = statistics_data[name]; + statistics.resize(ggml_nelements(stats)); + if (stats->type == GGML_TYPE_F32) { + std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float)); + } else { + fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n", + __func__, ggml_type_name(stats->type), name.c_str()); + statistics.clear(); + statistics_data.erase(name); + } + + } values.resize(ggml_nelements(sums2)); float max_count = 0.0f; for (int64_t j = 0; j < ne1; ++j) { @@ -354,10 +373,11 @@ static int prepare_imatrix(const std::string & imatrix_file, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & values_data, - std::unordered_map> & activations_data) { + std::unordered_map> & activations_data, + std::unordered_map> & statistics_data) { int m_last_call = -1; if (!imatrix_file.empty()) { - m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data); } if (values_data.empty()) { return m_last_call; @@ -380,11 +400,20 @@ static int prepare_imatrix(const std::string & imatrix_file, ++at; } } + for (auto st = statistics_data.begin(); st != statistics_data.end();) { + auto pos = st->first.find(name); + if (pos != std::string::npos) { + st = activations_data.erase(st); + } else { + ++st; + } + } } } if (!included_weights.empty()) { std::unordered_map> tmp_values; std::unordered_map> 
tmp_activations; + std::unordered_map> tmp_statistics; for (const auto & name : included_weights) { for (auto & e : values_data) { auto pos = e.first.find(name); @@ -398,9 +427,16 @@ static int prepare_imatrix(const std::string & imatrix_file, tmp_activations.emplace(std::move(a)); } } + for (auto & s : statistics_data) { + auto pos = s.first.find(name); + if (pos != std::string::npos) { + tmp_statistics.emplace(std::move(s)); + } + } } values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); + statistics_data = std::move(tmp_statistics); } return m_last_call; @@ -617,7 +653,8 @@ int main(int argc, char ** argv) { std::vector imatrix_datasets; std::unordered_map> values_data; std::unordered_map> activations_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data); + std::unordered_map> statistics_data; + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data); if (!values_data.empty()) { params.imatrix = &values_data; { @@ -657,6 +694,9 @@ int main(int argc, char ** argv) { if (!activations_data.empty()) { params.activations = &activations_data; } + if (!statistics_data.empty()) { + params.statistics = &statistics_data; + } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; From c59bb6d49d025765091d7c83a9b95528395de283 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 30 Oct 2025 22:11:40 +0000 Subject: [PATCH 138/155] Add Euclidean-Cosine score to identify important tensors --- src/llama-quant.cpp | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a8153494f9..957dd5f367 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1571,12 +1571,25 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - const auto important = tensor_name == "output.weight" || - tensor_name.find(".ffn_down.weight") != std::string::npos || - tensor_name.find(".ffn_down_exps.weight") != std::string::npos || - tensor_name.find(".attn_output.weight") != std::string::npos || - tensor_name.find(".time_mix_output.weight") != std::string::npos || - tensor_name.find(".attn_o.weight") != std::string::npos; + bool important = false; + + if (statistics_data) { + float ecs = 0.0f; // Euclidean-Cosine score + const std::string key = remap_imatrix(tensor_name, mapped); + const auto tstats = statistics_data->find(key); + if (tstats != statistics_data->end() && !tstats->second.empty()) { + ecs = tstats->second.front(); + important = ecs == 100.0f; // mark as important if ecs is 100% + } + } else { + important = tensor_name == "output.weight" || + tensor_name.find(".ffn_down.weight") != std::string::npos || + tensor_name.find(".ffn_down_exps.weight") != std::string::npos || + tensor_name.find(".attn_output.weight") != std::string::npos || + tensor_name.find(".time_mix_output.weight") != std::string::npos || + tensor_name.find(".attn_o.weight") != std::string::npos; + } + return important; }; From ac8cfbdd12eb2207098e3bcc4aee9347aa8366bc Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 17 Nov 2025 18:03:09 +0000 Subject: [PATCH 139/155] Improved is_important() logic --- src/llama-quant.cpp | 19 +++++++++++++++---- 1 file changed, 15 
insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 739172c70f..1e8a2cda9c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -635,8 +635,8 @@ static std::unordered_map target_bpw_type( const llama_model_quantize_params * params, int nthread ) { - // RAII guard for signal handlers bpw_stop.store(false, std::memory_order_relaxed); + // Signal handlers struct signal_scope_guard { using handler_t = void (*)(int); handler_t prev_int = SIG_DFL; @@ -1574,12 +1574,23 @@ static std::unordered_map target_bpw_type( bool important = false; if (statistics_data) { - float ecs = 0.0f; // Euclidean-Cosine score const std::string key = remap_imatrix(tensor_name, mapped); const auto tstats = statistics_data->find(key); if (tstats != statistics_data->end() && !tstats->second.empty()) { - ecs = tstats->second.front(); - important = ecs == 100.0f; // mark as important if ecs is 100% + float ecs = 0.0f; // Euclidean-Cosine score + float l2 = 0.0f; // L2 Euclidean Distance + float cs = 0.0f; // Cosine Similarity + try { + // ecs = tstats->second.at(0); + l2 = tstats->second.at(1); + cs = tstats->second.at(2); + } catch (std::out_of_range &) { + LLAMA_LOG_ERROR("\t%s: insufficient statistics for tensor %s\n", func, tensor_name.c_str()); + return false; + } + ecs = 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs)); // ecs = 100 - (100 / (1 + (L2 Dist/p)^2) * |Cos Sim|^q) + // LLAMA_LOG_INFO("\t%s: tensor %s has ECS score %.4f (L2 Distance %.4f and CosSim %.4f\n", func, tensor_name.c_str(), ecs, l2, cs); + important = ecs >= 99.99f; // mark as important if ecs is >= 99.99% } } else { important = tensor_name == "output.weight" || From a0ba913613235c1639f92877f09e82c3db6fef47 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Nov 2025 11:19:44 +0000 Subject: [PATCH 140/155] Fix lambda capture bug in Windows and initialise candidate_types struct --- src/llama-quant.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1e8a2cda9c..86ca165b6c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -652,10 +652,10 @@ static std::unordered_map target_bpw_type( } signal_guard; struct candidate_types { - ggml_type type; - float bpw; - size_t bytes; - double error; + ggml_type type = GGML_TYPE_COUNT; + float bpw = 0.0f; + size_t bytes = 0; + double error = 0.0; double mse = 0.0; double proj = 0.0; }; @@ -751,7 +751,7 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; - auto djb2_hash = [](const uint8_t * data, size_t n) -> uint64_t { + auto djb2_hash = [&](const uint8_t * data, const size_t n) -> uint64_t { uint64_t h = 5381; for (size_t i = 0; i < n; ++i) { h = (h << 5) + h + data[i]; From 9ec3e6e2629d294e7ae95ee58634c360475e67d7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 23 Nov 2025 17:49:53 +0000 Subject: [PATCH 141/155] Remove processing statistics_data --- include/llama.h | 1 - src/llama-quant.cpp | 19 ++---------- tools/quantize/quantize.cpp | 61 ++++++------------------------------- 3 files changed, 12 insertions(+), 69 deletions(-) diff --git a/include/llama.h b/include/llama.h index 3515ee1a13..c82a4147f4 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,7 +369,6 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file - void * statistics; // pointer to statistics data } llama_model_quantize_params; typedef struct llama_logit_bias { 
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 86ca165b6c..99759a27c8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -631,7 +631,6 @@ static std::unordered_map target_bpw_type( const std::map & mapped, const std::unordered_map> * values_data, const std::unordered_map> * activations_data, - const std::unordered_map> * statistics_data, const llama_model_quantize_params * params, int nthread ) { @@ -1840,7 +1839,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } const std::unordered_map> * values_data = nullptr; const std::unordered_map> * activations_data = nullptr; - const std::unordered_map> * statistics_data = nullptr; if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { @@ -1871,12 +1869,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (params->statistics) { - statistics_data = static_cast>*>(params->statistics); - if (statistics_data) { - LLAMA_LOG_INFO(" and %d statistics",int(statistics_data->size())); - } - } LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -2031,16 +2023,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - const char* base_msg = params->activations - ? (params->statistics - ? "imatrix with activations and statistics provided, process will be more accurate\n" - : "imatrix with activations provided, process will be accurate\n") - : "imatrix without activations provided, process will be less accurate\n"; - if (params->activations) { LLAMA_LOG_INFO("%s: %s", __func__, base_msg); } - else { LLAMA_LOG_WARN("%s: %s", __func__, base_msg); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, statistics_data, params, nthread); + + bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); } else { LLAMA_LOG_WARN("%s: --target-bpw requires an imatrix but none was provided, option will be ignored\n", __func__); } @@ -2305,7 +2291,6 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, - /*.statistics =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0b2b05b60a..aabcd73986 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -221,8 +221,7 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & values_data, - std::unordered_map> & activations_data, - std::unordered_map> & statistics_data) { + std::unordered_map> & activations_data) { struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -257,10 +256,9 @@ static int load_imatrix(const std::string & imatrix_file, const std::string sums_suffix{ ".in_sum" }; const std::string sums2_suffix{ ".in_sum2" }; const std::string counts_suffix{ ".counts" }; - const std::string stats_suffix{ ".stats" }; // Using an ordered map to get a deterministic iteration order. 
- std::map> sums_counts_for; + std::map> sums_counts_for; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { std::string name = cur->name; @@ -276,11 +274,7 @@ static int load_imatrix(const std::string & imatrix_file, } else if (string_remove_suffix(name, counts_suffix)) { // counts std::get<2>(sums_counts_for[std::move(name)]) = cur; - } else if (string_remove_suffix(name, stats_suffix)) { - // stats - std::get<3>(sums_counts_for[std::move(name)]) = cur; - } - else { + } else { // ignore other tensors } } @@ -290,7 +284,6 @@ static int load_imatrix(const std::string & imatrix_file, const struct ggml_tensor * sums = std::get<0>(sc.second); const struct ggml_tensor * sums2 = std::get<1>(sc.second); const struct ggml_tensor * counts = std::get<2>(sc.second); - const struct ggml_tensor * stats = std::get<3>(sc.second); // check sums2 and counts are present, and that sums and sums2 have the same shape if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) { @@ -308,19 +301,6 @@ static int load_imatrix(const std::string & imatrix_file, if (sums) { activations.resize(ggml_nelements(sums)); } - if (stats) { - auto & statistics = statistics_data[name]; - statistics.resize(ggml_nelements(stats)); - if (stats->type == GGML_TYPE_F32) { - std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float)); - } else { - fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n", - __func__, ggml_type_name(stats->type), name.c_str()); - statistics.clear(); - statistics_data.erase(name); - } - - } values.resize(ggml_nelements(sums2)); float max_count = 0.0f; for (int64_t j = 0; j < ne1; ++j) { @@ -373,23 +353,22 @@ static int prepare_imatrix(const std::string & imatrix_file, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & values_data, - std::unordered_map> & activations_data, - std::unordered_map> & statistics_data) { + std::unordered_map> & activations_data) { int m_last_call = -1; if (!imatrix_file.empty()) { - m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data); } if (values_data.empty()) { return m_last_call; } if (!excluded_weights.empty()) { for (const auto & name : excluded_weights) { - for (auto it = values_data.begin(); it != values_data.end();) { - auto pos = it->first.find(name); + for (auto vt = values_data.begin(); vt != values_data.end();) { + auto pos = vt->first.find(name); if (pos != std::string::npos) { - it = values_data.erase(it); + vt = values_data.erase(vt); } else { - ++it; + ++vt; } } for (auto at = activations_data.begin(); at != activations_data.end();) { @@ -400,20 +379,11 @@ static int prepare_imatrix(const std::string & imatrix_file, ++at; } } - for (auto st = statistics_data.begin(); st != statistics_data.end();) { - auto pos = st->first.find(name); - if (pos != std::string::npos) { - st = activations_data.erase(st); - } else { - ++st; - } - } } } if (!included_weights.empty()) { std::unordered_map> tmp_values; std::unordered_map> tmp_activations; - std::unordered_map> tmp_statistics; for (const auto & name : included_weights) { for (auto & e : values_data) { auto pos = e.first.find(name); @@ -427,16 +397,9 @@ static int prepare_imatrix(const std::string & imatrix_file, tmp_activations.emplace(std::move(a)); } } - for (auto & s : statistics_data) { - auto 
pos = s.first.find(name); - if (pos != std::string::npos) { - tmp_statistics.emplace(std::move(s)); - } - } } values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); - statistics_data = std::move(tmp_statistics); } return m_last_call; @@ -653,8 +616,7 @@ int main(int argc, char ** argv) { std::vector imatrix_datasets; std::unordered_map> values_data; std::unordered_map> activations_data; - std::unordered_map> statistics_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data); + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data); if (!values_data.empty()) { params.imatrix = &values_data; { @@ -694,9 +656,6 @@ int main(int argc, char ** argv) { if (!activations_data.empty()) { params.activations = &activations_data; } - if (!statistics_data.empty()) { - params.statistics = &statistics_data; - } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; From 1c9993e13198a28db1b5a8e7cd0fcb5d6bcf89eb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 23 Nov 2025 17:51:04 +0000 Subject: [PATCH 142/155] Add --disable-tensor-importance option --- include/llama.h | 1 + src/llama-quant.cpp | 39 ++++++++++++++----------------------- tools/quantize/quantize.cpp | 4 ++++ 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/include/llama.h b/include/llama.h index c82a4147f4..1f5b2e8a2b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,6 +369,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file + bool disable_tensor_importance; // treat all tensors equally during quantization } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 99759a27c8..2b9aba091b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1570,29 +1570,10 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - bool important = false; - - if (statistics_data) { - const std::string key = remap_imatrix(tensor_name, mapped); - const auto tstats = statistics_data->find(key); - if (tstats != statistics_data->end() && !tstats->second.empty()) { - float ecs = 0.0f; // Euclidean-Cosine score - float l2 = 0.0f; // L2 Euclidean Distance - float cs = 0.0f; // Cosine Similarity - try { - // ecs = tstats->second.at(0); - l2 = tstats->second.at(1); - cs = tstats->second.at(2); - } catch (std::out_of_range &) { - LLAMA_LOG_ERROR("\t%s: insufficient statistics for tensor %s\n", func, tensor_name.c_str()); - return false; - } - ecs = 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs)); // ecs = 100 - (100 / (1 + (L2 Dist/p)^2) * |Cos Sim|^q) - // LLAMA_LOG_INFO("\t%s: tensor %s has ECS score %.4f (L2 Distance %.4f and CosSim %.4f\n", func, tensor_name.c_str(), ecs, l2, cs); - important = ecs >= 99.99f; // mark as important if ecs is >= 99.99% - } - } else { - important = tensor_name == "output.weight" || + bool important = tensor_name == "output.weight"; + if (!important && !params->disable_tensor_importance) { + important = tensor_name.find(".attn_v.weight") != std::string::npos || + tensor_name.find(".time_mix_value.weight") != std::string::npos || 
tensor_name.find(".ffn_down.weight") != std::string::npos || tensor_name.find(".ffn_down_exps.weight") != std::string::npos || tensor_name.find(".attn_output.weight") != std::string::npos || @@ -2023,7 +2004,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - + if (params->activations) { + LLAMA_LOG_INFO("%s: imatrix has activations, process will be more accurate\n", __func__); + } else { + LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); + } + if (params->disable_tensor_importance) { + LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__); + } else { + LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__); + } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); @@ -2291,6 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, + /*.disable_tensor_importance =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index aabcd73986..4fee8c91a1 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -134,6 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --disable-tensor-importance: treat all tensors equally during bpw quantization\n"); + printf(" Advanced option to disable allocating more bpw budget to important tensors. 
It may increase quality for some models\n"); printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); @@ -560,6 +562,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) { + params.disable_tensor_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { From 661600842096145db52a4c631bfe0303a5d454ee Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 24 Nov 2025 18:26:45 +0000 Subject: [PATCH 143/155] Use more descriptive option naming --- include/llama.h | 2 +- src/llama-quant.cpp | 10 +++++----- tools/quantize/quantize.cpp | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/llama.h b/include/llama.h index 1f5b2e8a2b..50e61d4976 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,7 +369,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file - bool disable_tensor_importance; // treat all tensors equally during quantization + bool no_importance; // allocate target bpw budget equitably across all tensors } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2b9aba091b..c468a3e4fc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1571,7 +1571,7 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { bool important = tensor_name == "output.weight"; - if (!important && !params->disable_tensor_importance) { + if (!important && !params->no_importance) { important = tensor_name.find(".attn_v.weight") != std::string::npos || tensor_name.find(".time_mix_value.weight") != std::string::npos || tensor_name.find(".ffn_down.weight") != std::string::npos || @@ -2009,10 +2009,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); } - if (params->disable_tensor_importance) { - LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__); + if (params->no_importance) { + LLAMA_LOG_INFO("%s: distributing bpw budget equitably across all tensors\n", __func__); } else { - LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__); + LLAMA_LOG_INFO("%s: assigning more bpw budget to important tensors\n", __func__); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); @@ -2281,7 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, - /*.disable_tensor_importance =*/ false + /*.no_importance =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 4fee8c91a1..dd4b860e1b 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,9 +117,9 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & 
ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); - printf(" [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); - printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); + printf(" [--target-bpw n] [--no-importance] [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type]\n"); + printf(" [--prune-layers] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -134,8 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --disable-tensor-importance: treat all tensors equally during bpw quantization\n"); - printf(" Advanced option to disable allocating more bpw budget to important tensors. It may increase quality for some models\n"); + printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); + printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); @@ -562,8 +562,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) { - params.disable_tensor_importance = true; + } else if (strcmp(argv[arg_idx], "--no-importance") == 0) { + params.no_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { From 69a32b6f508a4d0d38f52cf91cc8cd5b42a4bf62 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:28:43 +0000 Subject: [PATCH 144/155] Relax target bpw range --- tools/quantize/quantize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index dd4b860e1b..ebeea65336 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. 
May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); @@ -485,13 +485,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { try { target_bpw = std::stof(data); - if (target_bpw < 0.0f || target_bpw > 8.0f) { - printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); + if (target_bpw < 0.0f || target_bpw > 16.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__); return false; } } catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); + printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); return false; } From 5b557ca958d3b0cb4293e12aafe21135c0c12142 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:30:20 +0000 Subject: [PATCH 145/155] Minor refactoring --- src/llama-quant.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c468a3e4fc..2cb58d46bd 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -635,7 +635,7 @@ static std::unordered_map target_bpw_type( int nthread ) { bpw_stop.store(false, std::memory_order_relaxed); - // Signal handlers + // SIGINT/SIGTERM signal handlers struct signal_scope_guard { using handler_t = void (*)(int); handler_t prev_int = SIG_DFL; @@ -1361,14 +1361,14 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < compatible_candidates.size(); ++i) { if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } - const ggml_type tensor_types = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(tensor, tensor_types); - const size_t bytes = tensor_bytes(tensor, tensor_types); + const ggml_type tensor_type = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(tensor, tensor_type); + const size_t bytes = tensor_bytes(tensor, tensor_type); double mse = 0.0; double proj = 0.0; - const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, + const auto err = estimate_error(tensor, tensor_type, f32_sample, rows_sample, values, activations, quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); - eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj }; + eval_candidates[i] = candidate_types{ tensor_type, bpw, bytes, err, mse, proj }; } if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } From 229109f329c498078f84da39b2c1ebb807e60646 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:31:39 +0000 Subject: [PATCH 146/155] Increase importance boost for final pass --- src/llama-quant.cpp | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2cb58d46bd..44f84ec949 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1714,7 +1714,7 @@ static std::unordered_map target_bpw_type( if (err_gain < epsilon) { continue; } // no error improvement double ratio = err_gain / (double)delta_bytes; // error reduction per byte - if (is_important(tensor_name)) { ratio *= 2.0; } // important tensors get 2x boost + if (is_important(tensor_name)) { ratio *= 5.0; } // important tensors get 5x boost // For tie-breaking, prioritize the largest absolute error improvement. if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) { From b97cda628960d66a9fcc301062a1dc3925feae9f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 23:52:51 +0000 Subject: [PATCH 147/155] Add B/F16 to get_ftype() --- tools/quantize/quantize.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index ebeea65336..a1426ea4a3 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -512,7 +512,12 @@ static const char * get_ftype(const float bpw) { {4.5000, "Q4_K"}, {5.5000, "Q5_K"}, {6.5625, "Q6_K"}, - {8.5000, "Q8_0"} + {8.5000, "Q8_0"}, +#ifdef GGML_USE_METAL + {16.0000, "F16"} +#else + {16.0000, "BF16"} +#endif }; return quant_bpw.lower_bound(bpw)->second; From 37cf51ebd032e63c7901835cdd85a0e7e9109e25 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 30 Nov 2025 00:29:35 +0000 Subject: [PATCH 148/155] Process bpw targets up to B/F16 --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 44f84ec949..6c6926dee8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -2089,7 +2089,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. - if (!params->pure && ggml_is_quantized(default_type)) { + if (!params->pure && (ggml_is_quantized(default_type) || params->target_bpw != -1.0f)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); From 5f7bba78288c2ef33d45adcd82141d70157eb402 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Dec 2025 15:47:18 +0000 Subject: [PATCH 149/155] Improve state checkpoint filename --- src/llama-quant.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 33b7f7e584..3d4785c1a3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -765,10 +765,18 @@ static std::unordered_map target_bpw_type( return djb2_hash(buf.data(), buf.size()); }; + std::string gen_name; + std::string checkpoint_file; char hex[17]; const uint64_t model_id = metadata_id(ml.meta.get()); + std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id); - std::string checkpoint_file = ml.arch_name + "-" + std::string(hex) + ".bpw_state"; + ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false); + std::replace(gen_name.begin(), gen_name.end(), ' ', '_'); + + gen_name.empty() ? 
checkpoint_file = ml.arch_name : checkpoint_file = gen_name; + checkpoint_file += "-" + std::string(hex) + ".bpw_state"; + if (params->keep_bpw_state && params->bpw_state) { const auto * filename = static_cast(params->bpw_state); std::ifstream ifs(filename, std::ios::binary); From b6d718a4a6b789bf0f944ff5a9a4ff82e985fe38 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Dec 2025 15:47:44 +0000 Subject: [PATCH 150/155] Add code comments --- src/llama-quant.cpp | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3d4785c1a3..cab4ecaeec 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -623,7 +623,7 @@ static void signal_handler(int) { bpw_stop.store(true, std::memory_order_relaxed); } -// Returns tensor type overrides to meet a global bpw target +// Returns tensor type overrides that meet a global bpw target static std::unordered_map target_bpw_type( llama_model_loader & ml, const llama_model & model, @@ -650,6 +650,7 @@ static std::unordered_map target_bpw_type( } } signal_guard; + // Error and bias projection per GGML_TYPE per tensor struct candidate_types { ggml_type type = GGML_TYPE_COUNT; float bpw = 0.0f; @@ -659,6 +660,7 @@ static std::unordered_map target_bpw_type( double proj = 0.0; }; + // Per‑tensor quantization mix that satisfies a global bpw target struct tensor_info { const llama_model_loader::llama_tensor_weight * w = nullptr; std::vector candidate; @@ -697,22 +699,33 @@ static std::unordered_map target_bpw_type( constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d; const char * func = __func__; + // Tensor size in bytes for a given type auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); return (size_t)ggml_nrows(t) * row_sz; }; + // Tensor bpw for a given type auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { const size_t bytes = tensor_bytes(t, typ); return (double)bytes * 8.0 / (double)ggml_nelements(t); }; + // Check if tensor is compatible with quantization type auto is_compatible = [](const ggml_tensor * t, const ggml_type typ) -> bool { const int64_t blck = ggml_blck_size(typ); return blck <= 1 || (t->ne[0] % blck) == 0; }; + // Get suitable fallback for type + auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { + if (is_compatible(t, typ)) { return typ; } + const ggml_type fb = fallback_type(typ); + return is_compatible(t, fb) ? fb : GGML_TYPE_F16; + }; + + // Check if tensor is an IQ type auto is_iq = [](const enum ggml_type t) { switch (t) { case GGML_TYPE_IQ1_S: @@ -730,12 +743,7 @@ static std::unordered_map target_bpw_type( } }; - auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) { return typ; } - const ggml_type fb = fallback_type(typ); - return is_compatible(t, fb) ? 
fb : GGML_TYPE_F16; - }; - + // Check if tensor can be quantized auto can_quantize = [&](const ggml_tensor * t) -> bool { if (ggml_n_dims(t) < 2) { return false; } // skip 1D tensors return is_quantizable(ggml_get_name(t), model.arch, params); @@ -750,6 +758,7 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; + // DJB2 hashing algorithm auto djb2_hash = [&](const uint8_t * data, const size_t n) -> uint64_t { uint64_t h = 5381; for (size_t i = 0; i < n; ++i) { @@ -758,6 +767,7 @@ static std::unordered_map target_bpw_type( return h ? h : arbitrary_magic; }; + // Get model ID from metadata hash auto metadata_id = [&](const gguf_context * ctx) -> uint64_t { const size_t sz = gguf_get_meta_size(ctx); std::vector buf(sz); @@ -794,6 +804,7 @@ static std::unordered_map target_bpw_type( } } + // Serializes vector to disk auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); @@ -832,6 +843,7 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_INFO("%s: saved progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); }; + // Deserializes vector from disk auto load_bpw_state = [&]() -> std::unordered_map { std::unordered_map out; std::ifstream ifs(checkpoint_file, std::ios::binary); @@ -890,6 +902,7 @@ static std::unordered_map target_bpw_type( return out; }; + // Deletes checkpoint file unless --keep-bpw-state is set auto delete_bpw_state = [&] { std::ifstream ifs(checkpoint_file); if (ifs.good() && !params->keep_bpw_state) { @@ -898,6 +911,7 @@ static std::unordered_map target_bpw_type( } }; + // Check for user interrupt and save progress auto check_signal_handler = [&](const std::vector & all_vec) { if (bpw_stop.load(std::memory_order_relaxed)) { LLAMA_LOG_INFO("\n%s: saving progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); @@ -1161,7 +1175,7 @@ static std::unordered_map target_bpw_type( const auto bpw_data = load_bpw_state(); - // Parallelize tensor processing - courtesy of https://github.com/ddh0 + // Parallelize tensor processing (courtesy of https://github.com/ddh0) auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, @@ -1555,6 +1569,7 @@ static std::unordered_map target_bpw_type( size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0); size_t budget_bytes = target_total_bytes >= nq_bytes ? 
target_total_bytes - nq_bytes : min_bytes; + // Get the types' override auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func); @@ -1592,7 +1607,7 @@ static std::unordered_map target_bpw_type( return important; }; - // Lagrangian relaxation to minimise error subject to a bpw target constraint + // Lagrangian relaxation to minimize error subject to a bpw target constraint auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { choice.resize(all.size()); bytes = 0; @@ -1636,7 +1651,7 @@ static std::unordered_map target_bpw_type( lagrange_penalty(mu_lo, choice_lo, bytes_lo, err_lo); - // increase mu until we get under budget or hit a safety cap + // Increase mu until we get under budget or hit a safety cap { int expand = 0; size_t prev_bytes_hi = std::numeric_limits::max(); @@ -1741,7 +1756,7 @@ static std::unordered_map target_bpw_type( } } - delete_bpw_state(); // we're done, clear any checkpoint + delete_bpw_state(); return emit_overrides(); } From 3be3b1ef87f353840de25fd7bffde00330fac7b4 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Dec 2025 17:44:43 +0000 Subject: [PATCH 151/155] Update usage() --- tools/quantize/quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index a1426ea4a3..cbb8655c63 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -136,7 +136,7 @@ static void usage(const char * executable) { printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); - printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); + printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); From 311c2c9f0ebd6e08f5b20e8827c654cebb7a41d6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Dec 2025 17:45:05 +0000 Subject: [PATCH 152/155] Update README.md --- tools/quantize/README.md | 71 +++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/tools/quantize/README.md b/tools/quantize/README.md index 22f0710286..9b93edafec 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -58,6 +58,8 @@ Options: Advanced options: * `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times. * `--prune-layers` prune (remove) the layers in the list +* `--target-bpw` automatically choose quant types so that the overall model size matches a given bits per weight (bpw) average. +* `--no-importance` during bpw computation, treat each tensor equally instead of prioritizing some. It may yield better quality for some models. * `--override-kv` option to override model metadata by key in the quantized model. 
May be specified multiple times Examples: @@ -97,59 +99,54 @@ Examples: ./llama-quantize --imatrix imatrix.gguf --override-kv qwen3moe.expert_used_count=int:16 --prune-layers 20,21,22 input-model-f32.gguf pruned-model-f32.gguf copy 8 ``` +```bash +# quantize model targeting a specific bpw average and save the bpw computations to the default file. Model type is optional and can be omitted +./llama-quantize --target-bpw 4.567 --keep-bpw-state --imatrix imatrix.gguf input-model-f32.gguf 8 +``` + ## Memory/Disk Requirements When running the larger models, make sure you have enough disk space to store all the intermediate files. As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For exmaple (Llama 3.1): | Model | Original size | Quantized size (Q4_K_M) | -| ----: | ------------: | ----------------------: | +|------:|--------------:|------------------------:| | 8B | 32.1 GB | 4.9 GB | | 70B | 280.9 GB | 43.1 GB | | 405B | 1,625.1 GB | 249.1 GB | - ## Quantization Several quantization methods are supported. They differ in the resulting model disk size and inference speed. For example, ### [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) -| Measure | IQ1_S | IQ1_M | IQ2_XXS | IQ2_XS | IQ2_S | IQ2_M | -| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ | -| bits/weight | 2.0042 | 2.1460 | 2.3824 | 2.5882 | 2.7403 | 2.9294 | -| size (GiB) | 1.87 | 2.01 | 2.23 | 2.42 | 2.56 | 2.74 | -| prompt processing t/s @ 512 | 858.88 ±1.22 | 847.99 ±0.47 | 852.39 ±0.85 | 826.99 ±12.51 | 783.55 ±13.73 | 787.68 ±7.00 | -| text generation t/s @ 128 | 79.73 ±0.79 | 72.92 ±0.14 | 79.86 ±0.22 | 78.04 ±0.46 | 77.30 ±2.47 | 74.44 ±0.15 | - -| Measure | IQ3_XXS | IQ3_XS | IQ3_S | IQ3_M | IQ4_XS | IQ4_NL | -| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ | -| bits/weight | 3.2548 | 3.4977 | 3.6606 | 3.7628 | 4.4597 | 4.6818 | -| size (GiB) | 3.04 | 3.27 | 3.42 | 3.52 | 4.17 | 4.38 | -| prompt processing t/s @ 512 | 813.88 ±6.53 | 708.71 ±1.26 | 798.78 ±8.81 | 768.70 ±13.73 | 771.80 ±11.38 | 806.03 ±7.07 | -| text generation t/s @ 128 | 73.95 ±0.20 | 71.67 ±0.54 | 69.31 ±0.63 | 70.15 ±0.33 | 77.51 ±0.20 | 76.63 ±0.28 | - - -| Measure | Q2_K_S | Q2_K | Q3_K_S | Q3_K_M | Q3_K_L | Q4_K_S | -| --------------------------- | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | -| bits/weight | 2.9697 | 3.1593 | 3.6429 | 3.9960 | 4.2979 | 4.6672 | -| size (GiB) | 2.78 | 2.95 | 3.41 | 3.74 | 4.02 | 4.36 | -| prompt processing t/s @ 512 | 798.91 ±6.40 | 784.45 ±7.85 | 752.17 ±7.94 | 783.44 ±9.92 | 761.17 ±7.55 | 818.55 ±9.58 | -| text generation t/s @ 128 | 90.01 ±0.12 | 79.85 ±0.20 | 69.84 ±0.18 | 71.68 ±0.22 | 69.38 ±0.49 | 76.71 ±0.20 | - -| Measure | Q4_K_S | Q4_K_M | Q5_K_S | Q5_K_M | Q6_K | Q8_0 | -| --------------------------- | ------------ | ------------- | ------------ | ------------ | ------------- | ------------ | -| bits/weight | 4.6672 | 4.8944 | 5.5704 | 5.7036 | 6.5633 | 8.5008 | -| size (GiB) | 4.36 | 4.58 | 5.21 | 5.33 | 6.14 | 7.95 | -| prompt processing t/s @ 512 | 818.55 ±9.58 | 821.81 ±21.44 | 752.52 ±0.99 | 758.69 ±7.43 | 812.01 ±10.82 | 865.09 ±8.30 | -| text generation t/s @ 128 | 76.71 ±0.20 | 71.93 ±1.52 | 69.53 ±0.18 | 67.23 ±1.08 | 58.67 ±3.13 | 50.93 ±0.08 | - 
-| Measure | F16 | -| --------------------------- | ------------ | -| bits/weight | 16.0005 | -| size (GiB) | 14.96 | -| prompt processing t/s @ 512 | 923.49 ±0.53 | -| text generation t/s @ 128 | 29.17 ±0.04 | +| Quant Type | bits/weight | size (GiB) | prompt processing t/s @ 512 | text generation t/s @ 128 | +|:----------:|------------:|-----------:|----------------------------:|--------------------------:| +| IQ1_S | 2.0042 | 1.87 | 858.88 ±1.22 | 79.73 ±0.79 | +| IQ1_M | 2.1460 | 2.01 | 847.99 ±0.47 | 72.92 ±0.14 | +| IQ2_XXS | 2.3824 | 2.23 | 852.39 ±0.85 | 79.86 ±0.22 | +| IQ2_XS | 2.5882 | 2.42 | 826.99 ±12.51 | 78.04 ±0.46 | +| IQ2_S | 2.7403 | 2.56 | 783.55 ±13.73 | 77.30 ±2.47 | +| IQ2_M | 2.9294 | 2.74 | 787.68 ±7.00 | 74.44 ±0.15 | +| IQ3_XXS | 3.2548 | 3.04 | 813.88 ±6.53 | 73.95 ±0.20 | +| IQ3_XS | 3.4977 | 3.27 | 708.71 ±1.26 | 71.67 ±0.54 | +| IQ3_S | 3.6606 | 3.42 | 798.78 ±8.81 | 69.31 ±0.63 | +| IQ3_M | 3.7628 | 3.52 | 768.70 ±13.73 | 70.15 ±0.33 | +| IQ4_XS | 4.4597 | 4.17 | 771.80 ±11.38 | 77.51 ±0.20 | +| IQ4_NL | 4.6818 | 4.38 | 818.55 ±9.58 | 76.71 ±0.20 | +| Q2_K_S | 2.9697 | 2.78 | 798.91 ±6.40 | 90.01 ±0.12 | +| Q2_K | 3.1593 | 2.95 | 784.45 ±7.85 | 79.85 ±0.20 | +| Q3_K_S | 3.6429 | 3.41 | 752.17 ±7.94 | 71.68 ±0.22 | +| Q3_K_L | 4.2979 | 4.02 | 761.17 ±7.55 | 69.38 ±0.49 | +| Q4_K_S | 4.6672 | 4.36 | 818.55 ±9.58 | 76.71 ±0.20 | +| Q4_K_S | 4.6672 | 4.36 | 818.55 ±9.58 | 76.71 ±0.20 | +| Q4_K_M | 4.8944 | 4.58 | 821.81 ±21.44 | 71.93 ±1.52 | +| Q5_K_S | 5.5704 | 5.21 | 752.52 ±0.99 | 69.53 ±0.18 | +| Q5_K_M | 5.7036 | 5.33 | 758.69 ±7.43 | 67.23 ±1.08 | +| Q6_K | 6.5633 | 6.14 | 812.01 ±10.82 | 58.67 ±3.13 | +| Q8_0 | 8.5008 | 7.95 | 865.09 ±8.30 | 50.93 ±0.08 | +| F16 | 16.0005 | 14.96 | 923.49 ±0.53 | 29.17 ±0.04 | ## Background information on llama-quantize From 7f886128617334831b6e99dfcdff994a5cf6bf4e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Dec 2025 17:47:38 +0000 Subject: [PATCH 153/155] Update README.md --- tools/quantize/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/quantize/README.md b/tools/quantize/README.md index 9b93edafec..986ba95be5 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -56,10 +56,10 @@ Options: * `--keep-split` will generate the quantized model in the same shards as the input file otherwise it will produce a single quantized file Advanced options: -* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times. +* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times * `--prune-layers` prune (remove) the layers in the list -* `--target-bpw` automatically choose quant types so that the overall model size matches a given bits per weight (bpw) average. -* `--no-importance` during bpw computation, treat each tensor equally instead of prioritizing some. It may yield better quality for some models. +* `--target-bpw` automatically choose quant types so that the overall model size matches a given bits per weight (bpw) average +* `--no-importance` during bpw computation, treat each tensor equally instead of prioritizing some. It may yield better quality for some models * `--override-kv` option to override model metadata by key in the quantized model. 
May be specified multiple times Examples: From 91846ee79b385f88fea67150f1a82a5a9058e406 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 29 Dec 2025 13:02:06 +0000 Subject: [PATCH 154/155] Change checkpoint file magic --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cab4ecaeec..f518c10781 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -695,7 +695,7 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); - constexpr uint32_t file_magic = 0x42505731; // BPW1 + constexpr uint32_t file_magic = 0x4d534531; // MSE1 constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d; const char * func = __func__; From 960ef9614178a825578c32417ed5876c367a506d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 1 Jan 2026 13:44:59 +0000 Subject: [PATCH 155/155] Prepare for future optimization algorithms --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f518c10781..67e5aa9827 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -695,7 +695,7 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); - constexpr uint32_t file_magic = 0x4d534531; // MSE1 + constexpr uint32_t file_magic = 0x4d534531; // MSE1 constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d; const char * func = __func__; @@ -785,7 +785,7 @@ static std::unordered_map target_bpw_type( std::replace(gen_name.begin(), gen_name.end(), ' ', '_'); gen_name.empty() ? checkpoint_file = ml.arch_name : checkpoint_file = gen_name; - checkpoint_file += "-" + std::string(hex) + ".bpw_state"; + checkpoint_file += "-" + std::string(hex) + "-mse.bpw_state"; if (params->keep_bpw_state && params->bpw_state) { const auto * filename = static_cast(params->bpw_state);
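Taken together, the patches in this series implement one core idea: give every quantizable tensor a small table of (bytes, error) candidates, keep only the lower convex envelope of those points, and then spend a byte budget derived from the requested bits per weight where it buys the largest error reduction per byte, with important tensors (output.weight, ffn_down, attn_output, ...) receiving a 5x boost in the final pass. Below is a minimal, standalone C++ sketch of that greedy final pass only. It is the editor's illustration, not code from the PR: the struct names, the toy candidate tables in main(), and the simplified tie-breaking are assumptions, and the real target_bpw_type() in src/llama-quant.cpp additionally uses Lagrangian relaxation, activation/imatrix-based error estimates, checkpointing, and signal handling.

```cpp
// Illustrative sketch of a greedy "error reduction per byte" allocator under a byte budget.
// Assumed simplified data model; not the PR's actual structs.
#include <cstdio>
#include <cstddef>
#include <string>
#include <vector>

struct candidate { size_t bytes; double error; };

struct tensor_entry {
    std::string name;
    bool important;                  // e.g. output.weight, *.ffn_down*, *.attn_output*
    std::vector<candidate> options;  // assumed sorted by ascending bytes (hull points)
    size_t choice;                   // index into options; start at the cheapest type
};

static void greedy_allocate(std::vector<tensor_entry> & tensors, size_t budget_bytes) {
    size_t used = 0;
    for (const auto & t : tensors) { used += t.options[t.choice].bytes; }

    while (true) {
        int    best_i     = -1;
        double best_ratio = 0.0;
        for (int i = 0; i < (int) tensors.size(); ++i) {
            const auto & t = tensors[i];
            if (t.choice + 1 >= t.options.size()) { continue; }   // already at the richest type
            const candidate & cur = t.options[t.choice];
            const candidate & nxt = t.options[t.choice + 1];
            const size_t extra = nxt.bytes - cur.bytes;
            const double gain  = cur.error - nxt.error;           // error reduction from upgrading
            if (gain <= 0.0 || extra == 0 || used + extra > budget_bytes) { continue; }
            double ratio = gain / (double) extra;                 // error reduction per extra byte
            if (t.important) { ratio *= 5.0; }                    // importance boost, cf. PATCH 146
            if (ratio > best_ratio) { best_ratio = ratio; best_i = i; }
        }
        if (best_i < 0) { break; }                                // nothing fits or no gain left
        auto & t = tensors[best_i];
        used += t.options[t.choice + 1].bytes - t.options[t.choice].bytes;
        ++t.choice;
    }
}

int main() {
    // Toy candidate tables standing in for per-tensor (bytes, error) hull points.
    std::vector<tensor_entry> tensors = {
        { "blk.0.ffn_down.weight", true,  { {100, 9.0}, {150, 4.0}, {220, 1.5} }, 0 },
        { "blk.0.attn_q.weight",   false, { { 80, 6.0}, {120, 3.5}, {200, 1.0} }, 0 },
    };
    greedy_allocate(tensors, /*budget_bytes=*/330);
    for (const auto & t : tensors) {
        std::printf("%-24s -> option %zu (%zu bytes)\n", t.name.c_str(), t.choice, t.options[t.choice].bytes);
    }
    return 0;
}
```

In the PR itself the candidates come from estimate_error() over sampled rows, the budget is computed from target_bpw and the model's element count, and ties between equal ratios are broken in favor of the largest absolute error improvement, as noted in the diffs above.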