From ba7335efb363515052a5f8aa755e4a5cd1250150 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 09:54:29 +0100 Subject: [PATCH 001/155] Refactor variable name --- include/llama.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/llama.h b/include/llama.h index 545e957e5f..b17e8f3353 100644 --- a/include/llama.h +++ b/include/llama.h @@ -354,6 +354,7 @@ extern "C" { bool pure; // quantize all tensors to the default type bool keep_split; // quantize to the same number of shards void * imatrix; // pointer to importance matrix data + void * activations; // pointer to activations data void * kv_overrides; // pointer to vector containing overrides void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune From 4d9491141b591d31f7fb91940ef4b1cf41bf94f6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:43:21 +0100 Subject: [PATCH 002/155] Add target_bpw parameter --- include/llama.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/llama.h b/include/llama.h index b17e8f3353..f44e2383d0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -358,6 +358,7 @@ extern "C" { void * kv_overrides; // pointer to vector containing overrides void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune + float target_bpw; // target bits per weight (bpw) } llama_model_quantize_params; typedef struct llama_logit_bias { From cfec4048abc478cd2769d1908e3ecc53ad2f28bd Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:43:51 +0100 Subject: [PATCH 003/155] Update usage --- tools/quantize/quantize.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 470dc3d916..b2d62f1490 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,6 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0 \n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. 
May be specified multiple times.\n"); From 5e85fb3ff34c5253c3dfa51eb5b9b9bfd6aaaaea Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:46:36 +0100 Subject: [PATCH 004/155] Add parse_target_bpw() --- tools/quantize/quantize.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index b2d62f1490..afd2edb156 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -441,6 +441,27 @@ static bool parse_layer_prune(const char * data, std::vector & prune_layers return true; } +static bool parse_target_bpw(const char * data, float & target_bpw) { + if (!data) { + printf("\n%s: no target bits per weight (bpw) provided\n\n", __func__); + return false; + } + + try { + target_bpw = std::stof(data); + if (target_bpw < 0.0f || target_bpw > 8.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); + return false; + } + } + catch (const std::exception & e) { + printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); + return false; + } + + return true; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); From e6d55dc47b42054dcef4a72145cfffb3cb26bd0f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:49:01 +0100 Subject: [PATCH 005/155] Load activations --- tools/quantize/quantize.cpp | 46 ++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index afd2edb156..3d07abd2d0 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -247,56 +247,69 @@ static int load_imatrix(const std::string & imatrix_file, std::vector> sums_counts_for; + std::map> sums_counts_for; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { std::string name = cur->name; if (name.empty()) { continue; } - if (string_remove_suffix(name, sums_suffix)) { + if (string_remove_suffix(name, sums2_suffix)) { // in_sum2 - sums_counts_for[std::move(name)].first = cur; + std::get<0>(sums_counts_for[std::move(name)]) = cur; } else if (string_remove_suffix(name, counts_suffix)) { // counts - sums_counts_for[std::move(name)].second = cur; - } else { + std::get<1>(sums_counts_for[std::move(name)]) = cur; + } else if (string_remove_suffix(name, sums_suffix)) { + // in_sum + std::get<2>(sums_counts_for[std::move(name)]) = cur; + } + else { // ignore other tensors } } for (const auto & sc : sums_counts_for) { const std::string & name = sc.first; - const struct ggml_tensor * sums = sc.second.first; - const struct ggml_tensor * counts = sc.second.second; + const struct ggml_tensor * sums = std::get<2>(sc.second); + const struct ggml_tensor * sums2 = std::get<0>(sc.second); + const struct ggml_tensor * counts = std::get<1>(sc.second); - if (!sums || !counts) { + // check that sums, sums2 and counts have the same shape + if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) { fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str()); gguf_free(ctx_gguf); ggml_free(ctx); exit(1); } - const int64_t ne0 = sums->ne[0]; - const int64_t ne1 = sums->ne[1]; + const int64_t ne0 = sums2->ne[0]; + const int64_t ne1 = sums2->ne[1]; - auto & e = imatrix_data[name]; - e.resize(ggml_nelements(sums)); + auto & activations = activations_data[name]; + auto & 
values = values_data[name]; + if (sums) { + activations.resize(ggml_nelements(sums)); + } + values.resize(ggml_nelements(sums2)); float max_count = 0.0f; for (int64_t j = 0; j < ne1; ++j) { const float count = ((const float *) counts->data)[j]; if (count > 0.0f) { for (int64_t i = 0; i < ne0; ++i) { - e[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; + values[j*ne0 + i] = ((const float *) sums2->data)[j*ne0 + i] / count; + if (sums) { activations[j*ne0 + i] = ((const float *) sums->data)[j*ne0 + i] / count; } } } else { // Partial imatrix data, this tensor never got any input during calibration for (int64_t i = 0; i < ne0; ++i) { - e[j*ne0 + i] = 1; + values[j*ne0 + i] = 1; + if (sums) { activations[j*ne0 + i] = 0; } } } if (count > max_count) { @@ -304,7 +317,8 @@ static int load_imatrix(const std::string & imatrix_file, std::vector Date: Tue, 19 Aug 2025 10:50:37 +0100 Subject: [PATCH 006/155] Populate activations_data with imatrix activations if present --- tools/quantize/quantize.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 3d07abd2d0..c2a4767fc9 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -561,10 +561,11 @@ int main(int argc, char ** argv) { } std::vector imatrix_datasets; - std::unordered_map> imatrix_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data); - if (!imatrix_data.empty()) { - params.imatrix = &imatrix_data; + std::unordered_map> values_data; + std::unordered_map> activations_data; + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data); + if (!values_data.empty()) { + params.imatrix = &values_data; { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE); From 0edbf0c176236b795d8707504388052839556b67 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:51:58 +0100 Subject: [PATCH 007/155] Process activations --- tools/quantize/quantize.cpp | 51 +++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index c2a4767fc9..2c45adab75 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -215,7 +215,10 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & imatrix_data) { +static int load_imatrix(const std::string & imatrix_file, + std::vector & imatrix_datasets, + std::unordered_map> & values_data, + std::unordered_map> & activations_data) { struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -225,7 +228,7 @@ static int load_imatrix(const std::string & imatrix_file, std::vector & imatrix_dataset, const std::vector & included_weights, const std::vector & excluded_weights, - std::unordered_map> & imatrix_data) { + std::unordered_map> & values_data, + std::unordered_map> & activations_data) { int m_last_call = -1; if (!imatrix_file.empty()) { - m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data); } - if (imatrix_data.empty()) { + if (values_data.empty()) { return m_last_call; } if (!excluded_weights.empty()) { for (const auto & name : excluded_weights) { - for (auto it = imatrix_data.begin(); it != imatrix_data.end();) 
{ + for (auto it = values_data.begin(); it != values_data.end();) { auto pos = it->first.find(name); if (pos != std::string::npos) { - it = imatrix_data.erase(it); + it = values_data.erase(it); } else { ++it; } } + for (auto at = activations_data.begin(); at != activations_data.end();) { + auto pos = at->first.find(name); + if (pos != std::string::npos) { + at = activations_data.erase(at); + } else { + ++at; + } + } } } if (!included_weights.empty()) { - std::unordered_map> tmp; + std::unordered_map> tmp_values; + std::unordered_map> tmp_activations; for (const auto & name : included_weights) { - for (auto & e : imatrix_data) { + for (auto & e : values_data) { auto pos = e.first.find(name); if (pos != std::string::npos) { - tmp.emplace(std::move(e)); + tmp_values.emplace(std::move(e)); + } + } + for (auto & a : activations_data) { + auto pos = a.first.find(name); + if (pos != std::string::npos) { + tmp_activations.emplace(std::move(a)); } } } - imatrix_data = std::move(tmp); + values_data = std::move(tmp_values); + activations_data = std::move(tmp_activations); } - if (!imatrix_data.empty()) { - printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size())); + if (!values_data.empty()) { + printf("%s: have %d importance matrix value entries\n", __func__, int(values_data.size())); + } + if (!activations_data.empty()) { + printf("%s: have %d importance matrix activation entries\n", __func__, int(activations_data.size())); } return m_last_call; } From e8774744584689db682866b71121597fe4d35c84 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:54:02 +0100 Subject: [PATCH 008/155] Process target_bpw parameter --- tools/quantize/quantize.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 2c45adab75..5331dec80c 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -512,6 +512,7 @@ int main(int argc, char ** argv) { std::vector kv_overrides; std::vector tensor_types; std::vector prune_layers; + float target_bpw = -1.0f; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -538,6 +539,10 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--target-bpw") == 0) { + if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From 1b3d5b574414ffc03c5d575ef470c74f4e509a80 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:56:02 +0100 Subject: [PATCH 009/155] Populate params --- tools/quantize/quantize.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 5331dec80c..86a96cdfcc 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -616,7 +616,7 @@ int main(int argc, char ** argv) { llama_model_kv_override kvo; std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES); kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.val_i64 = imatrix_data.size(); + kvo.val_i64 = values_data.size(); kv_overrides.emplace_back(std::move(kvo)); } @@ -628,6 +628,9 @@ int main(int argc, char ** argv) { kv_overrides.emplace_back(std::move(kvo)); } } + if (!activations_data.empty()) { + 
params.activations = &activations_data; + } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; @@ -639,6 +642,9 @@ int main(int argc, char ** argv) { if (!prune_layers.empty()) { params.prune_layers = &prune_layers; } + if (target_bpw != -1.0f) { + params.target_bpw = target_bpw; + } llama_backend_init(); @@ -701,7 +707,7 @@ int main(int argc, char ** argv) { params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) { + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && values_data.empty()) { fprintf(stderr, "\n==========================================================================================================\n"); fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); fprintf(stderr, "==========================================================================================================\n\n\n"); From a22a9deeeeb51e6f647bb185301b9874538d0324 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 10:57:44 +0100 Subject: [PATCH 010/155] Refactor variable and add target_bpw --- src/llama-quant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1d0361cc16..2e1ca7216e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1062,9 +1062,11 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.pure =*/ false, /*.keep_split =*/ false, /*.imatrix =*/ nullptr, + /*.activations =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, - /*.prune_layers =*/ nullptr + /*.prune_layers =*/ nullptr, + /*.target_bpw =*/ -1.0f }; return result; From c96b8eef949b479d505b63788d2c214e4221abcb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 11:00:05 +0100 Subject: [PATCH 011/155] Add fallback_type enum --- src/llama-quant.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2e1ca7216e..b2879bc847 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -19,6 +19,32 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; +static enum ggml_type fallback_type(const enum ggml_type new_type) { + switch (new_type) { + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + return GGML_TYPE_Q4_0; // symmetric-ish fallback + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_IQ4_XS: + return GGML_TYPE_IQ4_NL; + case GGML_TYPE_Q4_K: + return GGML_TYPE_Q5_0; + case GGML_TYPE_Q5_K: + return GGML_TYPE_Q5_1; + case GGML_TYPE_Q6_K: + return GGML_TYPE_Q8_0; + default: + return new_type; + } +} static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { From 9adae08789aefeb945b55858afbdf047e818147f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 11:00:50 +0100 Subject: [PATCH 012/155] Add is_iq() --- src/llama-quant.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b2879bc847..1e837a7d41 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -19,6 +19,22 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; +static bool 
is_iq(const enum ggml_type t) { + switch (t) { + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + return true; + default: + return false; + } +} static enum ggml_type fallback_type(const enum ggml_type new_type) { switch (new_type) { case GGML_TYPE_TQ1_0: From 017945a3b20726dc000da1245ecdbf539a7ba0cf Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 11:03:52 +0100 Subject: [PATCH 013/155] Validate if imatrix contains activations --- src/llama-quant.cpp | 48 ++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1e837a7d41..fdda5d35a1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -35,6 +35,7 @@ static bool is_iq(const enum ggml_type t) { return false; } } + static enum ggml_type fallback_type(const enum ggml_type new_type) { switch (new_type) { case GGML_TYPE_TQ1_0: @@ -61,6 +62,7 @@ static enum ggml_type fallback_type(const enum ggml_type new_type) { return new_type; } } + static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -131,10 +133,11 @@ struct quantize_state_impl { int i_ffn_gate = 0; int i_ffn_up = 0; - int n_k_quantized = 0; - int n_fallback = 0; + int n_k_quantized = 0; + int n_fallback = 0; - bool has_imatrix = false; + bool has_imatrix = false; + bool has_activations = false; // used to figure out if a model shares tok_embd with the output weight bool has_output = false; @@ -652,14 +655,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->only_copy) { ftype = ml.ftype; } - const std::unordered_map> * imatrix_data = nullptr; + const std::unordered_map> * values_data = nullptr; + const std::unordered_map> * activations_data = nullptr; if (params->imatrix) { - imatrix_data = static_cast>*>(params->imatrix); - if (imatrix_data) { - LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size())); + values_data = static_cast>*>(params->imatrix); + if (values_data) { + LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(values_data->size())); qs.has_imatrix = true; // check imatrix for nans or infs - for (const auto & kv : *imatrix_data) { + for (const auto & kv : *values_data) { for (float f : kv.second) { if (!std::isfinite(f)) { throw std::runtime_error(format("imatrix contains non-finite value %f\n", f)); @@ -668,8 +672,22 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } + if (params->activations) { + activations_data = static_cast>*>(params->activations); + if (activations_data) { + LLAMA_LOG_INFO("================================ Have activations data with %d entries\n",int(activations_data->size())); + qs.has_activations = true; + // check activations for nans or infs + for (const auto & kv : *activations_data) { + for (float f : kv.second) { + if (!std::isfinite(f)) { + throw std::runtime_error(format("activations contain non-finite value %f\n", f)); + } + } + } + } + } - const size_t align = GGUF_DEFAULT_ALIGNMENT; gguf_context_ptr ctx_out { gguf_init_empty() }; std::vector prune_list = {}; @@ -846,6 +864,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const auto tn = LLM_TN(model.arch); new_ofstream(0); for (const 
auto * it : tensors) { + const size_t align = GGUF_DEFAULT_ALIGNMENT; const auto & weight = *it; ggml_tensor * tensor = weight.tensor; if (weight.idx != cur_split && params->keep_split) { @@ -864,10 +883,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ml.load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", - ++idx, ml.n_tensors, - ggml_get_name(tensor), - llama_format_tensor_shape(tensor).c_str(), - ggml_type_name(tensor->type)); + ++idx, ml.n_tensors, ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type)); // This used to be a regex, but has an extreme cost to compile times. bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? @@ -967,9 +983,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const int64_t nelements = ggml_nelements(tensor); const float * imatrix = nullptr; - if (imatrix_data) { - auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped)); - if (it == imatrix_data->end()) { + if (values_data) { + auto it = values_data->find(remap_imatrix(tensor->name, mapped)); + if (it == values_data->end()) { LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); } else { if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { From 92f49ab39949221ff84b4f70d4528e4f5f43db93 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 11:05:01 +0100 Subject: [PATCH 014/155] Add target_bpw_type() logic --- src/llama-quant.cpp | 482 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 482 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fdda5d35a1..1e24303c52 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -575,6 +575,488 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } +// Returns per-tensor overrides of quantization types to meet target BPW with best expected quality. 
+// imatrix_data: map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a^2] by expert +// activations_data: optional map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a] by expert +// bias_lambda: relative weight on bias term (|sum e_j * E[a_j]|) vs MSE term (sum e_j^2 * E[a_j^2]) +static std::unordered_map target_bpw_type( + llama_model_loader & ml, + std::vector> & read_data, + const llama_model & model, + const std::vector & tensors, + const std::map & mapped, + const std::unordered_map> * values_data, + const std::unordered_map> * activations_data, + float target_bpw, + int nthread, + int sample_rows_per_expert = 128, + float bias_lambda = 1.0 +) { + struct candidate_types { + ggml_type type; + float bpw; + size_t bytes; + float error; // lower is better + }; + + struct tensor_info { + const llama_model_loader::llama_tensor_weight * w; + std::vector candidate; // sorted by bpw ascending + int choice = -1; // index into cand + float min_bpw = 0.0; + float max_bpw = 0.0; + size_t n_elements = 0; + }; + + auto name_tn = LLM_TN(model.arch); + + // The candidate types we consider; adjust as needed + const ggml_type base_candidates[] = { + // Model's + GGML_TYPE_IQ1_S, + GGML_TYPE_IQ1_M, + GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, + GGML_TYPE_IQ2_S, + GGML_TYPE_IQ3_XXS, + GGML_TYPE_IQ3_S, + GGML_TYPE_IQ4_XS, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q2_K, + GGML_TYPE_Q3_K, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, + GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 + }; + + auto can_quantize = [&](const ggml_tensor * t) -> bool { + const std::string name = ggml_get_name(t); + bool q = name.rfind("weight") == name.size() - 6; + q &= (ggml_n_dims(t) >= 2); + q &= name.find("_norm.weight") == std::string::npos; + //q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); + //q &= name != name_tn(LLM_TENSOR_OUTPUT, "weight"); + q &= name.find("ffn_gate_inp.weight") == std::string::npos; + q &= name.find("altup") == std::string::npos; + q &= name.find("laurel") == std::string::npos; + q &= name.find("per_layer_model_proj") == std::string::npos; + q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight"); + q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight"); + q &= name.find("ssm_conv1d.weight") == std::string::npos; + q &= name.find("shortconv.conv.weight") == std::string::npos; + q &= name.find("time_mix_first.weight") == std::string::npos; + q &= name.find("time_mix_w0.weight") == std::string::npos; + q &= name.find("time_mix_w1.weight") == std::string::npos; + q &= name.find("time_mix_w2.weight") == std::string::npos; + q &= name.find("time_mix_v0.weight") == std::string::npos; + q &= name.find("time_mix_v1.weight") == std::string::npos; + q &= name.find("time_mix_v2.weight") == std::string::npos; + q &= name.find("time_mix_a0.weight") == std::string::npos; + q &= name.find("time_mix_a1.weight") == std::string::npos; + q &= name.find("time_mix_a2.weight") == std::string::npos; + q &= name.find("time_mix_g1.weight") == std::string::npos; + q &= name.find("time_mix_g2.weight") == std::string::npos; + q &= name.find("time_mix_decay_w1.weight") == std::string::npos; + q &= name.find("time_mix_decay_w2.weight") == std::string::npos; + q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + q &= name.find("attn_rel_b.weight") == std::string::npos; + return q; + }; + + auto get_values = [&](const std::string & tensor_name) -> const float * { + if (!values_data) { return nullptr; } + const auto it = 
values_data->find(remap_imatrix(tensor_name, mapped)); + if (it == values_data->end()) { return nullptr; } + return it->second.data(); + }; + + auto get_activations = [&](const std::string & tensor_name) -> const float * { + if (!activations_data) { return nullptr; } + const auto it = activations_data->find(remap_imatrix(tensor_name, mapped)); + if (it == activations_data->end()) { return nullptr; } + return it->second.data(); + }; + + auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { + const int64_t n_per_row = t->ne[0]; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const size_t row_sz = ggml_row_size(typ, n_per_row); + return (size_t)ne2 * (size_t)nrows * row_sz; + }; + + auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { + const int64_t nelem = ggml_nelements(t); + const size_t bytes = total_bytes(t, typ); + return bytes * 8.0 / nelem; + }; + + auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { + const int64_t n_per_row = t->ne[0]; + const int64_t blck = ggml_blck_size(typ); + if (blck <= 1) { return true; } // FP16/BF16/Q8_0 etc + return n_per_row % blck == 0; + }; + + auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { + if (is_compatible(t, typ)) { return typ; } + ggml_type fb = fallback_type(typ); + if (is_compatible(t, fb)) { return fb; } + return GGML_TYPE_F16; // final guard + }; + + // Estimate error for a given type using a sampled subset of rows. + // Uses both imatrix (E[a^2]) and activations (E[a]) if available. + auto estimate_error = [&](const ggml_tensor * t, const float * f32_data, const ggml_type typ, const float * values_all, const float * activations_all) -> double { + const int64_t n_per_row = t->ne[0]; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + + const ggml_type_traits * traits = ggml_get_type_traits(typ); + if (!traits || !traits->to_float) { + // cannot dequantize candidate -> assign very high error + return 1e35f; + } + + // Sampling plan: for each expert slice, take up to sample_rows rows spread uniformly + const int64_t rows_per_expert = nrows; + const int64_t sample_rows = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); + const int64_t stride = std::max(1, rows_per_expert / sample_rows); + + const size_t row_sz = ggml_row_size(typ, n_per_row); + std::vector qbuf(row_sz * sample_rows); + std::vector f32_sample(sample_rows * n_per_row); + std::vector deq(sample_rows * n_per_row); + + float total_err = 0.0; + + for (int64_t i03 = 0; i03 < ne2; ++i03) { + const float * value = values_all ? (values_all + i03 * n_per_row) : nullptr; + const float * activation = activations_all ? 
(activations_all + i03 * n_per_row) : nullptr; + + // Assemble sampled rows into contiguous f32_sample + int64_t rs = 0; + for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) { + const float * src = f32_data + i03 * (n_per_row * rows_per_expert) + r * n_per_row; + std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row); + ++rs; + } + if (rs == 0) { continue; } + + // Quantize sampled rows in one chunk; pass the imatrix for this expert slice + const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); + (void)got; // not strictly needed here + + // Dequantize + traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); + + // Compute error proxy per sampled row + for (int64_t s = 0; s < rs; ++s) { + const float * xs = f32_sample.data() + s * n_per_row; + const float * ys = deq.data() + s * n_per_row; + + float mse_w = 0.0; + float bias = 0.0; + float bias_sum = 0.0; + + if (value) { + for (int64_t j = 0; j < n_per_row; ++j) { + const float e = ys[j] - xs[j]; + mse_w += e * e * value[j]; + if (activation) { + bias_sum += e * activation[j]; + } + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const float e = ys[j] - xs[j]; + mse_w += e*e; + if (activation) { + bias_sum += e * activation[j]; + } + } + } + + if (activation) { + bias = std::abs(bias_sum); + } + + // Normalize by n_per_row to get a per-row average scale + float row_err = mse_w / std::max(1, n_per_row); + if (bias_lambda != 0.0) { + row_err += bias_lambda * (bias / std::max(1, n_per_row)); + } + + total_err += row_err; + } + + // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor + const float scale_rows = rows_per_expert / std::max(1, rs); + total_err *= scale_rows; + } + + return total_err; + }; + + // Produce per-tensor candidate lists + std::vector all; + all.reserve(tensors.size()); + + for (const auto * tw : tensors) { + // Temporary workers for dequantization + std::vector workers; + workers.reserve(std::max(1, nthread)); + + ggml_tensor * t = tw->tensor; + const std::string name = ggml_get_name(t); + + if (!can_quantize(t)) { + continue; + } + + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); + if (!ml.use_mmap) { + if (read_data.size() < ggml_nbytes(t)) { + read_data.resize(ggml_nbytes(t)); + } + t->data = read_data.data(); + } + ml.load_data_for(t); + + // Prepare f32 weights for error estimates + const int64_t nelem = ggml_nelements(t); + std::vector> f32_conv_buf; + float * f32_data = nullptr; + + if (t->type == GGML_TYPE_F32) { + f32_data = (float *)t->data; + } else { + llama_tensor_dequantize_impl(t, f32_conv_buf, workers, nelem, nthread); + f32_data = (float *)f32_conv_buf.data(); + } + + const float * values = get_values(name); + const float * activations = get_activations(name); + + tensor_info info; + info.w = tw; + info.n_elements = nelem; + + // Candidate build with compatibility handling and availability checks + for (ggml_type ts_type : base_candidates) { + // Skip IQ* without imatrix + if (is_iq(ts_type) && !values) { continue; } + ggml_type tt = make_compatible(t, ts_type); + // After fallback, if still incompatible, skip + if (!is_compatible(t, tt)) { continue; } + + // Compute bpw and bytes + auto bpw = (float)tensor_bpw(t, tt); + size_t bytes = total_bytes(t, tt); + + // Estimate error + auto err = (float)estimate_error(t, f32_data, tt, values, activations); + + 
info.candidate.push_back(candidate_types{tt, bpw, bytes, err}); + } + + if (info.candidate.empty()) { + // as a last resort, keep original type + float bpw = ggml_nbytes(t) * 8.0f / nelem; + info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0}); + } + + // Sort by bpw ascending + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + if (a.bpw != b.bpw) { return a.bpw < b.bpw; } + if (a.error != b.error) { return a.error < b.error; } + return a.bytes < b.bytes; + }); + + // collapse candidates with identical storage size (bytes) + { + std::vector uniq; + uniq.reserve(info.candidate.size()); + + for (size_t i = 0; i < info.candidate.size(); ) { + size_t j = i + 1; + candidate_types best = info.candidate[i]; + // group same-byte entries, keep the one with the lowest error + while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) { + if (info.candidate[j].error < best.error) { best = info.candidate[j]; } + ++j; + } + uniq.push_back(best); + i = j; + } + info.candidate.swap(uniq); + } + + // Initialize choice at the smallest bpw candidate + info.choice = 0; + info.min_bpw = info.candidate.front().bpw; + info.max_bpw = info.candidate.back().bpw; + + all.push_back(std::move(info)); + } + + if (all.empty()) { return {}; } + + // Greedy allocation from minimum bpw upward to reach target_bpw + // Start with minimal bpw assignment + auto current_total_bytes = [&]() -> size_t { + size_t b = 0; + for (const auto & ti : all) { + b += ti.candidate[ti.choice].bytes; + } + return b; + }; + + auto total_weights = [&]() -> size_t { + size_t w = 0; + for (const auto & ti : all) { + w += ti.n_elements; + } + return w; + }; + + const size_t tw = total_weights(); + auto current_bpw = [&]() -> double { + return (double)current_total_bytes() * 8.0f / (double)tw; + }; + + // Precompute current bpw + double bpw_now = current_bpw(); + + // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw) + if (bpw_now >= target_bpw) { + std::unordered_map overrides; + for (const auto & ti : all) { + overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; + } + return overrides; + } + + struct upgrade { + int idx; // tensor index + int next; // next candidate index (strictly larger bytes) + double err; // error reduction + size_t delta_bytes; // increase in bytes + double ratio; // err per added bit + }; + + // Find next strictly-larger candidate index for a tensor + auto next_distinct_idx = [&](const tensor_info &ti) -> int { + const auto &cand = ti.candidate; + const auto &cur = cand[ti.choice]; + int j = ti.choice + 1; + while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j; + return j < (int)cand.size() ? 
j : -1; + }; + + auto recompute_best_upgrade = [&]() -> upgrade { + const double eps = 1e-12; + upgrade best{-1, -1, 0.0, 0, -1.0}; + for (int i = 0; i < (int)all.size(); ++i) { + const auto &ti = all[i]; + if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } + + int j = next_distinct_idx(ti); + if (j < 0) { continue; } // no larger-size candidate remains + + const auto &cur = ti.candidate[ti.choice]; + const auto &nxt = ti.candidate[j]; + + size_t delta_bytes = nxt.bytes - cur.bytes; + if (delta_bytes == 0) { continue; } // should not happen after dedup, but be safe + + double err = (double)cur.error - (double)nxt.error; + err = std::max(err, 0.0); // do not penalize due to sampling noise + + double ratio = err / (double)(delta_bytes * 8ull); + if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { + best = upgrade{i, j, err, delta_bytes, ratio}; + } + } + return best; + }; + + while (true) { + upgrade up = recompute_best_upgrade(); + if (up.idx < 0) { break; } + + size_t now_bytes = current_total_bytes(); + size_t next_bytes = now_bytes + up.delta_bytes; + double bpw_next = (double)next_bytes * 8.0 / (double)tw; + + if (bpw_next <= (double)target_bpw + 1e-12) { + all[up.idx].choice = up.next; + bpw_now = bpw_next; + } else { + break; + } + } + + // We might still be below target but taking any single upgrade overshoots. + { + double under_gap = (double)target_bpw - bpw_now; + + upgrade best_over{-1, -1, 0.0, 0, -1.0}; + double best_over_gap = 1e300; + + size_t now_bytes = current_total_bytes(); + + for (int i = 0; i < (int)all.size(); ++i) { + const auto &ti = all[i]; + if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } + + int j = next_distinct_idx(ti); + if (j < 0) { continue; } + + const auto &cur = ti.candidate[ti.choice]; + const auto &nxt = ti.candidate[j]; + + size_t delta_bytes = nxt.bytes - cur.bytes; + if (delta_bytes == 0) { continue; } + + size_t over_bytes = now_bytes + delta_bytes; + double bpw_over = (double)over_bytes * 8.0 / (double)tw; + + double over_gap = std::abs(bpw_over - (double)target_bpw); + + double err = (double)cur.error - (double)nxt.error; + if (err < 0.0) { err = 0.0; } + double ratio = err / (double)(delta_bytes * 8ull); + + if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { + best_over_gap = over_gap; + best_over = upgrade{i, j, err, delta_bytes, ratio}; + } + } + + if (best_over.idx >= 0) { + if (best_over_gap < under_gap) { + all[best_over.idx].choice = best_over.next; + } + } + } + + // Build the override map + std::unordered_map overrides; + LLAMA_LOG_INFO("%s: - estimated tensor quantization mix to achieve %.4f bpw at lowest ppl\n", __func__, target_bpw); + for (const auto & ti : all) { + LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", + __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); + overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; + } + return overrides; +} + static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type default_type; llama_ftype ftype = params->ftype; From 1187f6aa9eb4cf7a3bf3945d0ecd292a49c03efa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 11:07:03 +0100 Subject: [PATCH 015/155] Implement bpw_overrides call --- src/llama-quant.cpp | 9 +++++++++ 1 
file changed, 9 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1e24303c52..b0b3be76ca 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1314,6 +1314,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } + std::unordered_map bpw_overrides = {}; + if (params->target_bpw != -1.0f) { + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this opearation may take some time\n", __func__, params->target_bpw); + bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params->target_bpw, nthread); + } + int cur_split = -1; std::ofstream fout; auto close_ofstream = [&]() { @@ -1430,6 +1436,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (!params->pure && ggml_is_quantized(default_type)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); + // get bpw override + const auto override = bpw_overrides.find(name); + if (override != bpw_overrides.end()) { new_type = override->second; } // unless the user specifies a type, and the tensor geometry will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); From 5aceb9e3ae016ed057a0963934c53203b74ad3c5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 22:29:27 +0100 Subject: [PATCH 016/155] Refactor variable names --- src/llama-quant.cpp | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b0b3be76ca..5af70c1c9b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -575,13 +575,13 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor overrides of quantization types to meet target BPW with best expected quality. -// imatrix_data: map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a^2] by expert -// activations_data: optional map from tensor name -> length (ne[0] * ne[2]) containing per-column E[a] by expert -// bias_lambda: relative weight on bias term (|sum e_j * E[a_j]|) vs MSE term (sum e_j^2 * E[a_j^2]) +// Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl +// sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute +// bias_lambda: Affects the weight of the bias term in the MSE error function. 0.0 means no bias, 1.0 means equal weight +// for bias and error, 2.0 means twice as much weight for bias static std::unordered_map target_bpw_type( llama_model_loader & ml, - std::vector> & read_data, + std::vector> & buffer, const llama_model & model, const std::vector & tensors, const std::map & mapped, @@ -735,24 +735,21 @@ static std::unordered_map target_bpw_type( float total_err = 0.0; - for (int64_t i03 = 0; i03 < ne2; ++i03) { - const float * value = values_all ? (values_all + i03 * n_per_row) : nullptr; - const float * activation = activations_all ? (activations_all + i03 * n_per_row) : nullptr; + for (int64_t slice = 0; slice < ne2; ++slice) { + const float * value = values_all ? (values_all + slice * n_per_row) : nullptr; + const float * activation = activations_all ? 
(activations_all + slice * n_per_row) : nullptr; - // Assemble sampled rows into contiguous f32_sample int64_t rs = 0; for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) { - const float * src = f32_data + i03 * (n_per_row * rows_per_expert) + r * n_per_row; + const float * src = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row); ++rs; } if (rs == 0) { continue; } - // Quantize sampled rows in one chunk; pass the imatrix for this expert slice const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); - (void)got; // not strictly needed here + (void)got; - // Dequantize traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); // Compute error proxy per sampled row @@ -821,10 +818,8 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); if (!ml.use_mmap) { - if (read_data.size() < ggml_nbytes(t)) { - read_data.resize(ggml_nbytes(t)); - } - t->data = read_data.data(); + if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); } + t->data = buffer.data(); } ml.load_data_for(t); From ee05d6bc0b250a7c19b9dedf504163509ef736f8 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 22:32:53 +0100 Subject: [PATCH 017/155] Update comments --- src/llama-quant.cpp | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5af70c1c9b..546f6b438c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,13 +596,13 @@ static std::unordered_map target_bpw_type( ggml_type type; float bpw; size_t bytes; - float error; // lower is better + float error; }; struct tensor_info { const llama_model_loader::llama_tensor_weight * w; - std::vector candidate; // sorted by bpw ascending - int choice = -1; // index into cand + std::vector candidate; + int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; size_t n_elements = 0; @@ -610,7 +610,6 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); - // The candidate types we consider; adjust as needed const ggml_type base_candidates[] = { // Model's GGML_TYPE_IQ1_S, @@ -639,8 +638,6 @@ static std::unordered_map target_bpw_type( bool q = name.rfind("weight") == name.size() - 6; q &= (ggml_n_dims(t) >= 2); q &= name.find("_norm.weight") == std::string::npos; - //q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); - //q &= name != name_tn(LLM_TENSOR_OUTPUT, "weight"); q &= name.find("ffn_gate_inp.weight") == std::string::npos; q &= name.find("altup") == std::string::npos; q &= name.find("laurel") == std::string::npos; @@ -719,7 +716,7 @@ static std::unordered_map target_bpw_type( const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { - // cannot dequantize candidate -> assign very high error + // Cannot dequantize candidate -> assign very high error return 1e35f; } @@ -842,12 +839,10 @@ static std::unordered_map target_bpw_type( info.w = tw; info.n_elements = nelem; - // Candidate build with compatibility handling and availability checks + // Build per-tensor candidate list for (ggml_type ts_type : base_candidates) { - // Skip IQ* without imatrix if (is_iq(ts_type) && !values) { continue; } ggml_type tt = make_compatible(t, ts_type); - // After fallback, if still incompatible, skip if (!is_compatible(t, tt)) { 
continue; } // Compute bpw and bytes @@ -861,19 +856,18 @@ static std::unordered_map target_bpw_type( } if (info.candidate.empty()) { - // as a last resort, keep original type + // As a last resort, keep original type float bpw = ggml_nbytes(t) * 8.0f / nelem; info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0}); } - // Sort by bpw ascending std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } return a.bytes < b.bytes; }); - // collapse candidates with identical storage size (bytes) + // Collapse candidates with identical storage size (bytes) { std::vector uniq; uniq.reserve(info.candidate.size()); @@ -903,7 +897,6 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } // Greedy allocation from minimum bpw upward to reach target_bpw - // Start with minimal bpw assignment auto current_total_bytes = [&]() -> size_t { size_t b = 0; for (const auto & ti : all) { @@ -938,11 +931,11 @@ static std::unordered_map target_bpw_type( } struct upgrade { - int idx; // tensor index - int next; // next candidate index (strictly larger bytes) - double err; // error reduction - size_t delta_bytes; // increase in bytes - double ratio; // err per added bit + int idx; + int next; + double err; + size_t delta_bytes; + double ratio; }; // Find next strictly-larger candidate index for a tensor @@ -998,6 +991,7 @@ static std::unordered_map target_bpw_type( } // We might still be below target but taking any single upgrade overshoots. + // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. { double under_gap = (double)target_bpw - bpw_now; From f22b3097eb144a913d02fbb445cbdb9b97e91859 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 22:34:01 +0100 Subject: [PATCH 018/155] Avoid division by zero if truncation occurs --- src/llama-quant.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 546f6b438c..3911eba43b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -790,28 +790,24 @@ static std::unordered_map target_bpw_type( } // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor - const float scale_rows = rows_per_expert / std::max(1, rs); + const float scale_rows = (float)rows_per_expert / std::max(1.0f, (float)rs); total_err *= scale_rows; } return total_err; }; - // Produce per-tensor candidate lists std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { - // Temporary workers for dequantization std::vector workers; workers.reserve(std::max(1, nthread)); ggml_tensor * t = tw->tensor; const std::string name = ggml_get_name(t); - if (!can_quantize(t)) { - continue; - } + if (!can_quantize(t)) { continue; } LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); if (!ml.use_mmap) { @@ -820,7 +816,6 @@ static std::unordered_map target_bpw_type( } ml.load_data_for(t); - // Prepare f32 weights for error estimates const int64_t nelem = ggml_nelements(t); std::vector> f32_conv_buf; float * f32_data = nullptr; @@ -955,13 +950,13 @@ static std::unordered_map target_bpw_type( if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } int j = next_distinct_idx(ti); - if (j < 0) { continue; } // no larger-size candidate remains + if (j < 0) { 
continue; } const auto &cur = ti.candidate[ti.choice]; const auto &nxt = ti.candidate[j]; size_t delta_bytes = nxt.bytes - cur.bytes; - if (delta_bytes == 0) { continue; } // should not happen after dedup, but be safe + if (delta_bytes == 0) { continue; } double err = (double)cur.error - (double)nxt.error; err = std::max(err, 0.0); // do not penalize due to sampling noise From 936294f6afb10aea69ac5ae85fcc29313b49cd9e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 23:31:22 +0100 Subject: [PATCH 019/155] Increase precision for error calculation --- src/llama-quant.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3911eba43b..a4a10da062 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -730,7 +730,7 @@ static std::unordered_map target_bpw_type( std::vector f32_sample(sample_rows * n_per_row); std::vector deq(sample_rows * n_per_row); - float total_err = 0.0; + double total_err = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { const float * value = values_all ? (values_all + slice * n_per_row) : nullptr; @@ -754,9 +754,9 @@ static std::unordered_map target_bpw_type( const float * xs = f32_sample.data() + s * n_per_row; const float * ys = deq.data() + s * n_per_row; - float mse_w = 0.0; - float bias = 0.0; - float bias_sum = 0.0; + double mse_w = 0.0; + double bias = 0.0; + double bias_sum = 0.0; if (value) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -769,19 +769,17 @@ static std::unordered_map target_bpw_type( } else { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; - mse_w += e*e; + mse_w += e * e; if (activation) { bias_sum += e * activation[j]; } } } - if (activation) { - bias = std::abs(bias_sum); - } + if (activation) { bias = std::abs(bias_sum); } // Normalize by n_per_row to get a per-row average scale - float row_err = mse_w / std::max(1, n_per_row); + double row_err = mse_w / std::max(1, n_per_row); if (bias_lambda != 0.0) { row_err += bias_lambda * (bias / std::max(1, n_per_row)); } @@ -790,11 +788,11 @@ static std::unordered_map target_bpw_type( } // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor - const float scale_rows = (float)rows_per_expert / std::max(1.0f, (float)rs); + const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); total_err *= scale_rows; } - return total_err; + return std::isfinite(total_err) ? 
total_err : 1e35; }; std::vector all; From 5cd69a6809c56922e1b973ce900f3680c28a5117 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 09:41:39 +0100 Subject: [PATCH 020/155] Add F16/BF16 type --- src/llama-quant.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a4a10da062..5522fe39d2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -630,7 +630,13 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q8_0, +// TODO: find better way to handle F16/BF16 +#ifdef GGML_USE_METAL + GGML_TYPE_F16 +#else + GGML_TYPE_BF16 +#endif }; auto can_quantize = [&](const ggml_tensor * t) -> bool { From 69586e212e76849fcdff17e68e8023b91025b415 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 13:23:11 +0100 Subject: [PATCH 021/155] Add F16/BF16 type --- tools/quantize/quantize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 86a96cdfcc..b907008cb4 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0 \n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0 \n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -486,13 +486,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { try { target_bpw = std::stof(data); - if (target_bpw < 0.0f || target_bpw > 8.0f) { - printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); + if (target_bpw < 0.0f || target_bpw > 16.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__); return false; } } catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); + printf("\n%s: '%s' is not valid. 
Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); return false; } From 29b2dc3ec0ddefde21394007649df6c268ebca3d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 13:27:01 +0100 Subject: [PATCH 022/155] Do not mix K and IQ quants --- src/llama-quant.cpp | 62 +++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5522fe39d2..9dc903874f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -36,6 +36,26 @@ static bool is_iq(const enum ggml_type t) { } } +static bool is_iq(const enum llama_ftype t) { + switch (t) { + case LLAMA_FTYPE_MOSTLY_IQ1_S: + case LLAMA_FTYPE_MOSTLY_IQ1_M: + case LLAMA_FTYPE_MOSTLY_IQ2_XXS: + case LLAMA_FTYPE_MOSTLY_IQ2_XS: + case LLAMA_FTYPE_MOSTLY_IQ2_S: + case LLAMA_FTYPE_MOSTLY_IQ2_M: + case LLAMA_FTYPE_MOSTLY_IQ3_XXS: + case LLAMA_FTYPE_MOSTLY_IQ3_XS: + case LLAMA_FTYPE_MOSTLY_IQ3_S: + case LLAMA_FTYPE_MOSTLY_IQ3_M: + case LLAMA_FTYPE_MOSTLY_IQ4_XS: + case LLAMA_FTYPE_MOSTLY_IQ4_NL: + return true; + default: + return false; + } +} + static enum ggml_type fallback_type(const enum ggml_type new_type) { switch (new_type) { case GGML_TYPE_TQ1_0: @@ -587,7 +607,7 @@ static std::unordered_map target_bpw_type( const std::map & mapped, const std::unordered_map> * values_data, const std::unordered_map> * activations_data, - float target_bpw, + const llama_model_quantize_params * params, int nthread, int sample_rows_per_expert = 128, float bias_lambda = 1.0 @@ -608,19 +628,7 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; - auto name_tn = LLM_TN(model.arch); - - const ggml_type base_candidates[] = { - // Model's - GGML_TYPE_IQ1_S, - GGML_TYPE_IQ1_M, - GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, - GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, - GGML_TYPE_IQ3_S, - GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, + const ggml_type k_candidates[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, @@ -639,6 +647,21 @@ static std::unordered_map target_bpw_type( #endif }; + const ggml_type iq_candidates[] = { + GGML_TYPE_IQ1_S, + GGML_TYPE_IQ1_M, + GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, + GGML_TYPE_IQ2_S, + GGML_TYPE_IQ3_XXS, + GGML_TYPE_IQ3_S, + GGML_TYPE_IQ4_XS, + GGML_TYPE_IQ4_NL, + }; + + auto name_tn = LLM_TN(model.arch); + float target_bpw = params->target_bpw; + auto can_quantize = [&](const ggml_tensor * t) -> bool { const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; @@ -838,8 +861,15 @@ static std::unordered_map target_bpw_type( info.w = tw; info.n_elements = nelem; + std::vector quant_candidates; + if (is_iq(params->ftype)) { + quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates)); + } else { + quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates)); + } + // Build per-tensor candidate list - for (ggml_type ts_type : base_candidates) { + for (ggml_type ts_type : quant_candidates) { if (is_iq(ts_type) && !values) { continue; } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } @@ -1305,7 +1335,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f) { LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this opearation may take some time\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, 
values_data, activations_data, params->target_bpw, nthread); + bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } int cur_split = -1; From 43caadf783a4bae41011e3b9aca5bbe79185a7a6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:24:48 +0100 Subject: [PATCH 023/155] Add better fallbacks for IQ mixes --- src/llama-quant.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9dc903874f..c412191c8f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -657,6 +657,12 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, + // Add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, + GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 }; auto name_tn = LLM_TN(model.arch); From 52da4a4f8c28d063378d54dd806da03614251e76 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:26:05 +0100 Subject: [PATCH 024/155] Skip if output.weight or type is COPY --- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c412191c8f..786adfe547 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -697,6 +697,9 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_decay_w2.weight") == std::string::npos; q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; + q &= params->quantize_output_tensor || name != "output.weight"; + q &= !params->only_copy; + return q; }; From 3f0118d6029450955c43cd84109bdfc36a8cecd3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 17:26:37 +0100 Subject: [PATCH 025/155] Fix bias lambda bug --- src/llama-quant.cpp | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 786adfe547..44cf9e30e3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -782,52 +782,47 @@ static std::unordered_map target_bpw_type( } if (rs == 0) { continue; } - const size_t got = ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); - (void)got; - + // Quantize sample rows and dequantize back + (void)ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); - // Compute error proxy per sampled row + // Compute error proxy per sampled slice + double slice_err = 0.0; for (int64_t s = 0; s < rs; ++s) { const float * xs = f32_sample.data() + s * n_per_row; const float * ys = deq.data() + s * n_per_row; double mse_w = 0.0; - double bias = 0.0; double bias_sum = 0.0; if (value) { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e * value[j]; - if (activation) { - bias_sum += e * activation[j]; - } + if (activation) { bias_sum += e * activation[j]; } } } else { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e; - if (activation) { - bias_sum += e * activation[j]; - } + if (activation) { bias_sum += e * activation[j]; } } } - if (activation) { bias = std::abs(bias_sum); } - // Normalize by n_per_row to get a per-row average scale double row_err = mse_w / std::max(1, n_per_row); - if (bias_lambda != 0.0) { - row_err += bias_lambda * (bias / std::max(1, n_per_row)); + if (activation && bias_lambda != 
0.0) { + // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) + const double bias = std::abs(bias_sum) / std::max(1, n_per_row); + row_err += bias_lambda * bias; } - total_err += row_err; + slice_err += row_err; } - // Scale for the rows we didn't sample in this expert: multiply by stride-ish factor - const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); - total_err *= scale_rows; + // Scale the slice contribution by the sampling factor + const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; @@ -1002,7 +997,7 @@ static std::unordered_map target_bpw_type( if (delta_bytes == 0) { continue; } double err = (double)cur.error - (double)nxt.error; - err = std::max(err, 0.0); // do not penalize due to sampling noise + err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { From b0b33b7ccbc5880e6ac5206ea309ee328e685c08 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 20:58:26 +0100 Subject: [PATCH 026/155] Optimise tensor sampling --- src/llama-quant.cpp | 197 ++++++++++++++++++++++++++------------------ 1 file changed, 119 insertions(+), 78 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 44cf9e30e3..830bf915cf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -609,7 +609,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 128, + int sample_rows_per_expert = 256, float bias_lambda = 1.0 ) { struct candidate_types { @@ -671,7 +671,7 @@ static std::unordered_map target_bpw_type( auto can_quantize = [&](const ggml_tensor * t) -> bool { const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; - q &= (ggml_n_dims(t) >= 2); + q &= ggml_n_dims(t) >= 2; q &= name.find("_norm.weight") == std::string::npos; q &= name.find("ffn_gate_inp.weight") == std::string::npos; q &= name.find("altup") == std::string::npos; @@ -719,9 +719,9 @@ static std::unordered_map target_bpw_type( auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t row_sz = ggml_row_size(typ, n_per_row); + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const size_t row_sz = ggml_row_size(typ, n_per_row); return (size_t)ne2 * (size_t)nrows * row_sz; }; @@ -734,7 +734,7 @@ static std::unordered_map target_bpw_type( auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { const int64_t n_per_row = t->ne[0]; const int64_t blck = ggml_blck_size(typ); - if (blck <= 1) { return true; } // FP16/BF16/Q8_0 etc + if (blck <= 1) { return true; } return n_per_row % blck == 0; }; @@ -742,15 +742,20 @@ static std::unordered_map target_bpw_type( if (is_compatible(t, typ)) { return typ; } ggml_type fb = fallback_type(typ); if (is_compatible(t, fb)) { return fb; } - return GGML_TYPE_F16; // final guard + return GGML_TYPE_F16; }; - // Estimate error for a given type using a sampled subset of rows. - // Uses both imatrix (E[a^2]) and activations (E[a]) if available. 
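As a reading aid, the per-row error proxy that estimate_error computes at this stage of the series (after the bias-lambda fix above) can be sketched in isolation as follows. This is a minimal sketch, not the patch's API: x is the original row, y the dequantized row, w the optional imatrix weights (E[a^2]) and a the optional mean activations (E[a]), all of length n. Later patches in the series refine this into a normalised weighted MSE.

#include <cmath>
#include <cstddef>
#include <vector>

// Weighted MSE averaged over the row, plus bias_lambda times the magnitude of the
// signed error projected onto the mean activations, also averaged over the row.
static double row_error_proxy(const std::vector<float> & x, const std::vector<float> & y,
                              const std::vector<float> & w, const std::vector<float> & a,
                              double bias_lambda = 1.0) {
    const std::size_t n = x.size();
    double mse_w    = 0.0;
    double bias_sum = 0.0;
    for (std::size_t j = 0; j < n; ++j) {
        const double e = y[j] - x[j];
        mse_w += (w.empty() ? 1.0 : w[j]) * e * e;   // imatrix-weighted squared error
        if (!a.empty()) { bias_sum += e * a[j]; }    // signed error projected onto E[a]
    }
    double err = mse_w / (double) n;
    if (!a.empty() && bias_lambda != 0.0) {
        err += bias_lambda * std::fabs(bias_sum) / (double) n;
    }
    return err;
}
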
- auto estimate_error = [&](const ggml_tensor * t, const float * f32_data, const ggml_type typ, const float * values_all, const float * activations_all) -> double { + // Estimate error for a given type using a sampled subset of rows + auto estimate_error = [&](const ggml_tensor * t, + const ggml_type typ, + const std::vector & f32_sample, + const std::vector & sample_rows_per_slice, + const std::vector & values_sample, + const std::vector & activations_sample) -> double + { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { @@ -758,70 +763,73 @@ static std::unordered_map target_bpw_type( return 1e35f; } - // Sampling plan: for each expert slice, take up to sample_rows rows spread uniformly - const int64_t rows_per_expert = nrows; - const int64_t sample_rows = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); - const int64_t stride = std::max(1, rows_per_expert / sample_rows); + const size_t total_sampled_rows = f32_sample.size() / n_per_row; + if (total_sampled_rows == 0) { return 0.0; } - const size_t row_sz = ggml_row_size(typ, n_per_row); - std::vector qbuf(row_sz * sample_rows); - std::vector f32_sample(sample_rows * n_per_row); - std::vector deq(sample_rows * n_per_row); - - double total_err = 0.0; + const size_t qbuf_size = ggml_row_size(typ, n_per_row) * total_sampled_rows; + std::vector qbuf(qbuf_size); + std::vector deq(f32_sample.size()); + // Quantize all sampled rows at once and dequantize back + size_t qbuf_offset = 0; + size_t f32_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * value = values_all ? (values_all + slice * n_per_row) : nullptr; - const float * activation = activations_all ? (activations_all + slice * n_per_row) : nullptr; - - int64_t rs = 0; - for (int64_t r = 0; r < rows_per_expert && rs < sample_rows; r += stride) { - const float * src = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; - std::memcpy(f32_sample.data() + rs * n_per_row, src, sizeof(float) * n_per_row); - ++rs; - } + const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - // Quantize sample rows and dequantize back - (void)ggml_quantize_chunk(typ, f32_sample.data(), qbuf.data(), 0, rs, n_per_row, value); - traits->to_float(qbuf.data(), deq.data(), rs * n_per_row); + const float * value = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + (void)ggml_quantize_chunk(typ, f32_sample.data() + f32_offset, qbuf.data() + qbuf_offset, 0, rs, n_per_row, value); + qbuf_offset += ggml_row_size(typ, n_per_row) * rs; + f32_offset += rs * n_per_row; + } + + traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + + double total_err = 0.0; + size_t sample_offset = 0; + + for (int64_t slice = 0; slice < ne2; ++slice) { + const float * value_slice = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + const float * activation_slice = activations_sample.empty() ? 
nullptr : activations_sample.data() + slice * n_per_row; + const int64_t rs = sample_rows_per_slice[slice]; - // Compute error proxy per sampled slice double slice_err = 0.0; for (int64_t s = 0; s < rs; ++s) { - const float * xs = f32_sample.data() + s * n_per_row; - const float * ys = deq.data() + s * n_per_row; + const float * xs = f32_sample.data() + sample_offset; + const float * ys = deq.data() + sample_offset; - double mse_w = 0.0; + double mse_w = 0.0; double bias_sum = 0.0; - if (value) { + if (value_slice) { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; - mse_w += e * e * value[j]; - if (activation) { bias_sum += e * activation[j]; } + mse_w += e * e * value_slice[j]; + if (activation_slice) { bias_sum += e * activation_slice[j]; } } } else { for (int64_t j = 0; j < n_per_row; ++j) { const float e = ys[j] - xs[j]; mse_w += e * e; - if (activation) { bias_sum += e * activation[j]; } + if (activation_slice) { bias_sum += e * activation_slice[j]; } } } // Normalize by n_per_row to get a per-row average scale double row_err = mse_w / std::max(1, n_per_row); - if (activation && bias_lambda != 0.0) { + if (activation_slice && bias_lambda != 0.0) { // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) const double bias = std::abs(bias_sum) / std::max(1, n_per_row); row_err += bias_lambda * bias; } slice_err += row_err; + sample_offset += n_per_row; } // Scale the slice contribution by the sampling factor - const auto scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + const double rows_per_expert = (double) nrows; + const auto scale_rows = rows_per_expert / std::max(1.0, (double) rs); total_err += slice_err * scale_rows; } @@ -858,8 +866,40 @@ static std::unordered_map target_bpw_type( f32_data = (float *)f32_conv_buf.data(); } - const float * values = get_values(name); - const float * activations = get_activations(name); + const float * values_all = get_values(name); + const float * activations_all = get_activations(name); + + // Sample the tensor rows once, before looping through quantization candidates. + const int64_t n_per_row = t->ne[0]; + const int64_t nrows_total = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; + const int64_t rows_per_expert = nrows_total; + const int64_t sample_rows_max = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); + const int64_t stride = std::max(1, rows_per_expert / sample_rows_max); + + std::vector f32_sample; + std::vector values_sample; + std::vector activations_sample; + std::vector sample_rows_per_slice(ne2); + + for (int64_t slice = 0; slice < ne2; ++slice) { + int64_t current_sampled_rows = 0; + for (int64_t r = 0; r < rows_per_expert && current_sampled_rows < sample_rows_max; r += stride) { + const float * src_row = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; + f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); + current_sampled_rows++; + } + sample_rows_per_slice[slice] = current_sampled_rows; + } + + if (values_all) { + values_sample.resize(ne2 * n_per_row); + std::memcpy(values_sample.data(), values_all, ne2 * n_per_row * sizeof(float)); + } + if (activations_all) { + activations_sample.resize(ne2 * n_per_row); + std::memcpy(activations_sample.data(), activations_all, ne2 * n_per_row * sizeof(float)); + } tensor_info info; info.w = tw; @@ -874,7 +914,7 @@ static std::unordered_map target_bpw_type( // Build per-tensor candidate list for (ggml_type ts_type : quant_candidates) { - if (is_iq(ts_type) && !values) { continue; } + if (is_iq(ts_type) && !values_all) { continue; } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } @@ -882,19 +922,18 @@ static std::unordered_map target_bpw_type( auto bpw = (float)tensor_bpw(t, tt); size_t bytes = total_bytes(t, tt); - // Estimate error - auto err = (float)estimate_error(t, f32_data, tt, values, activations); - - info.candidate.push_back(candidate_types{tt, bpw, bytes, err}); + // Estimate error using the pre-sampled data + auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values_sample, activations_sample); + info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); } if (info.candidate.empty()) { // As a last resort, keep original type float bpw = ggml_nbytes(t) * 8.0f / nelem; - info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0}); + info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } return a.bytes < b.bytes; @@ -905,7 +944,7 @@ static std::unordered_map target_bpw_type( std::vector uniq; uniq.reserve(info.candidate.size()); - for (size_t i = 0; i < info.candidate.size(); ) { + for (size_t i = 0; i < info.candidate.size();) { size_t j = i + 1; candidate_types best = info.candidate[i]; // group same-byte entries, keep the one with the lowest error @@ -972,36 +1011,39 @@ static std::unordered_map target_bpw_type( }; // Find next strictly-larger candidate index for a tensor - auto next_distinct_idx = [&](const tensor_info &ti) -> int { - const auto &cand = ti.candidate; - const auto &cur = cand[ti.choice]; + auto next_distinct_idx = [&](const tensor_info & ti) -> int { + const auto & cand = ti.candidate; + const auto & cur = cand[ti.choice]; int j = ti.choice + 1; - while (j < (int)cand.size() && cand[j].bytes == cur.bytes) ++j; + while (j < (int)cand.size() && cand[j].bytes == cur.bytes) { + ++j; + } + 
return j < (int)cand.size() ? j : -1; }; auto recompute_best_upgrade = [&]() -> upgrade { const double eps = 1e-12; - upgrade best{-1, -1, 0.0, 0, -1.0}; - for (int i = 0; i < (int)all.size(); ++i) { - const auto &ti = all[i]; + upgrade best{ -1, -1, 0.0, 0, -1.0 }; + for (int i = 0; i < (int) all.size(); ++i) { + const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - int j = next_distinct_idx(ti); + const int j = next_distinct_idx(ti); if (j < 0) { continue; } - const auto &cur = ti.candidate[ti.choice]; - const auto &nxt = ti.candidate[j]; + const auto & cur = ti.candidate[ti.choice]; + const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; + const size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } - double err = (double)cur.error - (double)nxt.error; + double err = cur.error - nxt.error; err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { - best = upgrade{i, j, err, delta_bytes, ratio}; + best = upgrade{ i, j, err, delta_bytes, ratio }; } } return best; @@ -1014,8 +1056,7 @@ static std::unordered_map target_bpw_type( size_t now_bytes = current_total_bytes(); size_t next_bytes = now_bytes + up.delta_bytes; double bpw_next = (double)next_bytes * 8.0 / (double)tw; - - if (bpw_next <= (double)target_bpw + 1e-12) { + if (bpw_next <= target_bpw + 1e-12) { all[up.idx].choice = up.next; bpw_now = bpw_next; } else { @@ -1026,22 +1067,22 @@ static std::unordered_map target_bpw_type( // We might still be below target but taking any single upgrade overshoots. // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. 
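The upgrade search above is, in effect, a greedy knapsack over the per-tensor candidate lists. A compact restatement of the loop, using illustrative types and names rather than the patch's own, might look like this:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct candidate   { std::size_t bytes; double error; };
struct tensor_plan { std::vector<candidate> cand; int choice = 0; }; // cand sorted by bytes ascending

// Repeatedly take the single upgrade with the best error reduction per extra bit,
// as long as the model-wide bits-per-weight stays at or below the target.
static void allocate_budget(std::vector<tensor_plan> & plan, uint64_t total_weights, double target_bpw) {
    auto total_bytes = [&]() {
        std::size_t b = 0;
        for (const auto & t : plan) { b += t.cand[t.choice].bytes; }
        return b;
    };
    for (;;) {
        int best_i = -1, best_j = -1;
        double best_ratio = -1.0;
        for (int i = 0; i < (int) plan.size(); ++i) {
            const auto & t = plan[i];
            const int j = t.choice + 1;                                  // next larger candidate
            if (j >= (int) t.cand.size()) { continue; }
            const std::size_t delta = t.cand[j].bytes - t.cand[t.choice].bytes;
            if (delta == 0) { continue; }
            const double gain  = std::max(0.0, t.cand[t.choice].error - t.cand[j].error);
            const double ratio = gain / (double) (delta * 8);            // error removed per extra bit
            if (ratio > best_ratio) { best_ratio = ratio; best_i = i; best_j = j; }
        }
        if (best_i < 0) { break; }                                       // nothing left to upgrade
        const tensor_plan & t = plan[best_i];
        const std::size_t next_bytes = total_bytes() + (t.cand[best_j].bytes - t.cand[t.choice].bytes);
        if ((double) next_bytes * 8.0 / (double) total_weights > target_bpw) { break; } // budget spent
        plan[best_i].choice = best_j;
    }
}

The actual implementation additionally skips over same-size candidates, breaks ties toward the smaller size increase, and, once no in-budget upgrade remains, considers the single upgrade that overshoots the target by the least, as in the block that follows.
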
{ - double under_gap = (double)target_bpw - bpw_now; + double under_gap = target_bpw - bpw_now; - upgrade best_over{-1, -1, 0.0, 0, -1.0}; - double best_over_gap = 1e300; + upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; + double best_over_gap = 1e300; size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int)all.size(); ++i) { - const auto &ti = all[i]; + for (int i = 0; i < (int) all.size(); ++i) { + const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } int j = next_distinct_idx(ti); if (j < 0) { continue; } - const auto &cur = ti.candidate[ti.choice]; - const auto &nxt = ti.candidate[j]; + const auto & cur = ti.candidate[ti.choice]; + const auto & nxt = ti.candidate[j]; size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } @@ -1051,13 +1092,13 @@ static std::unordered_map target_bpw_type( double over_gap = std::abs(bpw_over - (double)target_bpw); - double err = (double)cur.error - (double)nxt.error; + double err = cur.error - nxt.error; if (err < 0.0) { err = 0.0; } double ratio = err / (double)(delta_bytes * 8ull); if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { best_over_gap = over_gap; - best_over = upgrade{i, j, err, delta_bytes, ratio}; + best_over = upgrade{ i, j, err, delta_bytes, ratio }; } } From 35ad0fc4addf92e9dc0700a88004962731f3c9e0 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 20 Aug 2025 23:27:20 +0100 Subject: [PATCH 027/155] Improve error estimation using weighted MSE --- src/llama-quant.cpp | 60 +++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 830bf915cf..f5fa309c44 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -783,14 +783,26 @@ static std::unordered_map target_bpw_type( f32_offset += rs * n_per_row; } - traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + if (typ == GGML_TYPE_F16) { + const auto *const src = (const ggml_fp16_t *)qbuf.data(); + for (size_t r = 0; r < total_sampled_rows; ++r) { + ggml_fp16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); + } + } else if (typ == GGML_TYPE_BF16) { + const auto *const src = (const ggml_bf16_t *)qbuf.data(); + for (size_t r = 0; r < total_sampled_rows; ++r) { + ggml_bf16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); + } + } else { + traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + } double total_err = 0.0; size_t sample_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * value_slice = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - const float * activation_slice = activations_sample.empty() ? nullptr : activations_sample.data() + slice * n_per_row; + const float * wv = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; + const float * act = activations_sample.empty() ? 
nullptr : activations_sample.data() + slice * n_per_row; const int64_t rs = sample_rows_per_slice[slice]; double slice_err = 0.0; @@ -799,37 +811,37 @@ static std::unordered_map target_bpw_type( const float * ys = deq.data() + sample_offset; double mse_w = 0.0; - double bias_sum = 0.0; + double x2_w = 0.0; + double bias_num = 0.0; + double bias_den = 0.0; - if (value_slice) { - for (int64_t j = 0; j < n_per_row; ++j) { - const float e = ys[j] - xs[j]; - mse_w += e * e * value_slice[j]; - if (activation_slice) { bias_sum += e * activation_slice[j]; } - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const float e = ys[j] - xs[j]; - mse_w += e * e; - if (activation_slice) { bias_sum += e * activation_slice[j]; } + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = ys[j] - xs[j]; + const double w = wv ? wv[j] : 1.0; + mse_w += w * e * e; + x2_w += w * xs[j] * xs[j]; + + if (act) { + const double a = act[j]; + bias_num += e * a; + bias_den += a * a; } } - // Normalize by n_per_row to get a per-row average scale - double row_err = mse_w / std::max(1, n_per_row); - if (activation_slice && bias_lambda != 0.0) { - // bias_sum ~= sum_j ( (w_q - w_fp)[j] * E[a_j] ) - const double bias = std::abs(bias_sum) / std::max(1, n_per_row); - row_err += bias_lambda * bias; + const double eps = 1e-30; + double row_err = mse_w / (x2_w + eps); + + if (act && bias_lambda != 0.0) { + const double bias_norm = bias_num * bias_num / (bias_den + eps); + row_err += bias_lambda * bias_norm; } slice_err += row_err; sample_offset += n_per_row; } - // Scale the slice contribution by the sampling factor - const double rows_per_expert = (double) nrows; - const auto scale_rows = rows_per_expert / std::max(1.0, (double) rs); + const auto rows_per_expert = nrows; + const double scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } From 5ef493ea1a01385c02ef4c56d38dfe5e116c47c6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 09:48:29 +0100 Subject: [PATCH 028/155] Exclude embeddings and output tensor --- src/llama-quant.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f5fa309c44..32013e47ba 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -697,8 +697,10 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_decay_w2.weight") == std::string::npos; q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= params->quantize_output_tensor || name != "output.weight"; q &= !params->only_copy; + // TODO: Exclude embeddings and output tensors? 
+ q &= params->quantize_output_tensor || name != "output.weight"; + q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; }; From 95b2ab2800e26a5bd5b60c61f9593d720a97eb7a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 10:46:37 +0100 Subject: [PATCH 029/155] Change error estimate to use normalised weighted MSE --- src/llama-quant.cpp | 204 +++++++++++++++++++++++++++++--------------- 1 file changed, 134 insertions(+), 70 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 32013e47ba..629056ee06 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -661,8 +662,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q6_K }; auto name_tn = LLM_TN(model.arch); @@ -752,103 +752,125 @@ static std::unordered_map target_bpw_type( const ggml_type typ, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, - const std::vector & values_sample, - const std::vector & activations_sample) -> double + const float * values_sample, + const float * activations_sample, + std::vector & qbuf, + std::vector & deq) -> double { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - - const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (!traits || !traits->to_float) { - // Cannot dequantize candidate -> assign very high error - return 1e35f; - } + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const size_t total_sampled_rows = f32_sample.size() / n_per_row; if (total_sampled_rows == 0) { return 0.0; } - const size_t qbuf_size = ggml_row_size(typ, n_per_row) * total_sampled_rows; - std::vector qbuf(qbuf_size); - std::vector deq(f32_sample.size()); + const size_t row_sz = ggml_row_size(typ, n_per_row); + const size_t need_q = row_sz * total_sampled_rows; + if (qbuf.size() < need_q) { qbuf.resize(need_q); } + if (deq.size() < f32_sample.size()) { deq.resize(f32_sample.size()); } - // Quantize all sampled rows at once and dequantize back - size_t qbuf_offset = 0; - size_t f32_offset = 0; + // Quantize sampled rows slice-by-slice + size_t qoff = 0; + size_t foff = 0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - const float * value = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - (void)ggml_quantize_chunk(typ, f32_sample.data() + f32_offset, qbuf.data() + qbuf_offset, 0, rs, n_per_row, value); - qbuf_offset += ggml_row_size(typ, n_per_row) * rs; - f32_offset += rs * n_per_row; + const float * value = values_sample ? 
values_sample + slice * n_per_row : nullptr; + + (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); + + qoff += row_sz * rs; + foff += (size_t)rs * n_per_row; } + // Dequantize to deq if (typ == GGML_TYPE_F16) { - const auto *const src = (const ggml_fp16_t *)qbuf.data(); - for (size_t r = 0; r < total_sampled_rows; ++r) { - ggml_fp16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); - } + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); } else if (typ == GGML_TYPE_BF16) { - const auto *const src = (const ggml_bf16_t *)qbuf.data(); - for (size_t r = 0; r < total_sampled_rows; ++r) { - ggml_bf16_to_fp32_row(src + r * n_per_row, deq.data() + r * n_per_row, n_per_row); - } + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); } else { - traits->to_float(qbuf.data(), deq.data(), f32_sample.size()); + const ggml_type_traits * traits = ggml_get_type_traits(typ); + if (!traits || !traits->to_float) { + // no dequantizer available + return 1e35; + } + traits->to_float(qbuf.data(), deq.data(), (int) f32_sample.size()); } + // Compute error + size_t off = 0; double total_err = 0.0; - size_t sample_offset = 0; + const double eps = 1e-12; for (int64_t slice = 0; slice < ne2; ++slice) { - const float * wv = values_sample.empty() ? nullptr : values_sample.data() + slice * n_per_row; - const float * act = activations_sample.empty() ? nullptr : activations_sample.data() + slice * n_per_row; const int64_t rs = sample_rows_per_slice[slice]; + if (rs == 0) { continue; } + + const float * wv = values_sample ? values_sample + slice * n_per_row : nullptr; + const float * act = activations_sample ? activations_sample + slice * n_per_row : nullptr; double slice_err = 0.0; - for (int64_t s = 0; s < rs; ++s) { - const float * xs = f32_sample.data() + sample_offset; - const float * ys = deq.data() + sample_offset; + + for (int64_t r = 0; r < rs; ++r) { + const float * x = f32_sample.data() + off; + const float * y = deq.data() + off; double mse_w = 0.0; double x2_w = 0.0; - double bias_num = 0.0; - double bias_den = 0.0; + double bnum = 0.0; + double bden = 0.0; - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = ys[j] - xs[j]; - const double w = wv ? 
wv[j] : 1.0; - mse_w += w * e * e; - x2_w += w * xs[j] * xs[j]; - - if (act) { + if (wv && act) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double e = y[j] - x[j]; const double a = act[j]; - bias_num += e * a; - bias_den += a * a; + mse_w += w * e * e; + x2_w += w * x[j] * x[j]; + bnum += e * a; + bden += a * a; + } + } else if (wv) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double e = y[j] - x[j]; + mse_w += w * e * e; + x2_w += w * x[j] * x[j]; + } + } else if (act) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = y[j] - x[j]; + const double a = act[j]; + mse_w += e * e; + x2_w += x[j] * x[j]; + bnum += e * a; + bden += a * a; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double e = y[j] - x[j]; + mse_w += e * e; + x2_w += x[j] * x[j]; } } - const double eps = 1e-30; double row_err = mse_w / (x2_w + eps); - if (act && bias_lambda != 0.0) { - const double bias_norm = bias_num * bias_num / (bias_den + eps); - row_err += bias_lambda * bias_norm; + row_err += bias_lambda * (bnum * bnum) / (bden + eps); } slice_err += row_err; - sample_offset += n_per_row; + off += (size_t)n_per_row; } - const auto rows_per_expert = nrows; - const double scale_rows = (double)rows_per_expert / std::max(1.0, (double)rs); + // scale back up to the full number of rows in this slice + const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; - }; +}; std::vector all; all.reserve(tensors.size()); @@ -887,38 +909,70 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const int64_t rows_per_expert = nrows_total; - const int64_t sample_rows_max = std::max(1, std::min(rows_per_expert, sample_rows_per_expert)); - const int64_t stride = std::max(1, rows_per_expert / sample_rows_max); + const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); + const int64_t stride = std::max(1, nrows_total / sample_rows_max); std::vector f32_sample; std::vector values_sample; std::vector activations_sample; std::vector sample_rows_per_slice(ne2); + std::mt19937 rng(std::random_device{}()); for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; - for (int64_t r = 0; r < rows_per_expert && current_sampled_rows < sample_rows_max; r += stride) { - const float * src_row = f32_data + slice * (n_per_row * rows_per_expert) + r * n_per_row; + int64_t offset = 0; + if (stride > 1) { + std::uniform_int_distribution dist(0, stride - 1); + offset = dist(rng); + } + for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { + const float * src_row = f32_data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); current_sampled_rows++; } sample_rows_per_slice[slice] = current_sampled_rows; } + auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { + const size_t want = (size_t)ne2 * (size_t)n_per_row; + dst.clear(); + if (!src || src_sz == 0) { return; } + + if (src_sz == want) { + dst.resize(want); + std::memcpy(dst.data(), src, want * sizeof(float)); + } else if (src_sz == (size_t)n_per_row) { + dst.resize(want); + for (int64_t s = 0; s < ne2; ++s) { + std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); + } + } else { + // Mismatch – safer to skip using it for this tensor + LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", + __func__, name.c_str(), src_sz, (size_t)n_per_row, want); + } + }; + if (values_all) { - values_sample.resize(ne2 * n_per_row); - std::memcpy(values_sample.data(), values_all, ne2 * n_per_row * sizeof(float)); + // get size from the map (not just the raw pointer) + auto itv = values_data->find(remap_imatrix(name, mapped)); + const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); + copy_or_broadcast(values_all, sz, values_sample); } if (activations_all) { - activations_sample.resize(ne2 * n_per_row); - std::memcpy(activations_sample.data(), activations_all, ne2 * n_per_row * sizeof(float)); + auto ita = activations_data->find(remap_imatrix(name, mapped)); + const size_t sz = ita == activations_data->end() ? 
0 : ita->second.size(); + copy_or_broadcast(activations_all, sz, activations_sample); } tensor_info info; info.w = tw; info.n_elements = nelem; + // Prepare scratch buffers sized for the largest candidate row size + size_t total_sampled_rows = f32_sample.size() / n_per_row; + + // Build list of candidate types first (compatible ones) std::vector quant_candidates; if (is_iq(params->ftype)) { quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates)); @@ -926,18 +980,28 @@ static std::unordered_map target_bpw_type( quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates)); } - // Build per-tensor candidate list + // Compute maximum row size among compatible candidates (to size qbuf once) + size_t max_row_sz = 0; + std::vector compatible_candidates; + compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { if (is_iq(ts_type) && !values_all) { continue; } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } + compatible_candidates.push_back(tt); + max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); + } - // Compute bpw and bytes + std::vector qbuf(max_row_sz * total_sampled_rows); + std::vector deq(f32_sample.size()); + + // Now evaluate candidates + for (ggml_type tt : compatible_candidates) { auto bpw = (float)tensor_bpw(t, tt); size_t bytes = total_bytes(t, tt); - - // Estimate error using the pre-sampled data - auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values_sample, activations_sample); + const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); + const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); + float err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, qbuf, deq); info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); } From e01dad886bd2314146ce768240fd0c8a2abecabb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 12:47:13 +0100 Subject: [PATCH 030/155] Parallelise candidate evaluation --- src/llama-quant.cpp | 87 ++++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 28 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 629056ee06..3cade0bf6f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -610,7 +610,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 256, + int sample_rows_per_expert = 384, float bias_lambda = 1.0 ) { struct candidate_types { @@ -758,16 +758,17 @@ static std::unordered_map target_bpw_type( std::vector & deq) -> double { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t total_sampled_rows = f32_sample.size() / n_per_row; + const size_t nels = f32_sample.size(); + const size_t total_sampled_rows = nels / (size_t)n_per_row; if (total_sampled_rows == 0) { return 0.0; } const size_t row_sz = ggml_row_size(typ, n_per_row); const size_t need_q = row_sz * total_sampled_rows; if (qbuf.size() < need_q) { qbuf.resize(need_q); } - if (deq.size() < f32_sample.size()) { deq.resize(f32_sample.size()); } + if (deq.size() < nels) { deq.resize(nels); } // Quantize sampled rows slice-by-slice size_t qoff = 0; @@ -777,31 +778,31 @@ static std::unordered_map target_bpw_type( if (rs == 0) { continue; } const float * value = values_sample ? values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); - qoff += row_sz * rs; - foff += (size_t)rs * n_per_row; + qoff += row_sz * (size_t)rs; + foff += (size_t)rs * (size_t)n_per_row; } - // Dequantize to deq + // Dequantize into deq if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)f32_sample.size()); + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); } else { const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { - // no dequantizer available + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); return 1e35; } - traits->to_float(qbuf.data(), deq.data(), (int) f32_sample.size()); + + traits->to_float(qbuf.data(), deq.data(), (int) nels); } // Compute error + const double eps = 1e-12; size_t off = 0; double total_err = 0.0; - const double eps = 1e-12; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; @@ -817,9 +818,9 @@ static std::unordered_map target_bpw_type( const float * y = deq.data() + off; double mse_w = 0.0; - double x2_w = 0.0; - double bnum = 0.0; - double bden = 0.0; + double x2_w = 0.0; + double bnum = 0.0; + double bden = 0.0; if (wv && act) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -828,8 +829,8 @@ static std::unordered_map target_bpw_type( const double a = act[j]; mse_w += w * e * e; x2_w += w * x[j] * x[j]; - bnum += e * a; - bden += a * a; + bnum += w * e * a; // weighted bias + bden += w * a * a; // weighted norm } } else if (wv) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -856,7 +857,9 @@ static std::unordered_map target_bpw_type( } double row_err = mse_w / (x2_w + eps); + if (act && bias_lambda != 0.0) { + // penalize squared projection of error onto activations row_err += bias_lambda * (bnum * bnum) / (bden + eps); } @@ -864,7 +867,7 @@ static std::unordered_map target_bpw_type( off += (size_t)n_per_row; } - // scale back up to the full number of rows in this slice + // scale to full rows in this slice (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } @@ -982,10 +985,14 @@ static std::unordered_map target_bpw_type( // Compute maximum row size among compatible candidates (to size qbuf once) size_t max_row_sz = 0; + const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; std::vector compatible_candidates; 
compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { - if (is_iq(ts_type) && !values_all) { continue; } + if (is_iq(ts_type) && !has_valid_imatrix) { + LLAMA_LOG_WARN("%s: skipping IQ quantization for %s, no or mismatched imatrix provided\n", __func__, name.c_str()); + continue; + } ggml_type tt = make_compatible(t, ts_type); if (!is_compatible(t, tt)) { continue; } compatible_candidates.push_back(tt); @@ -996,13 +1003,37 @@ static std::unordered_map target_bpw_type( std::vector deq(f32_sample.size()); // Now evaluate candidates - for (ggml_type tt : compatible_candidates) { - auto bpw = (float)tensor_bpw(t, tt); - size_t bytes = total_bytes(t, tt); - const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); - const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); - float err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, qbuf, deq); - info.candidate.push_back(candidate_types{ tt, bpw, bytes, err }); + std::vector cand_out(compatible_candidates.size()); + const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); + const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); + + int n_eval_threads = std::max(1, nthread); + std::atomic cidx{0}; + std::vector eval_workers; + eval_workers.reserve(n_eval_threads); + + for (int ti = 0; ti < n_eval_threads; ++ti) { + eval_workers.emplace_back([&] { + // thread-local scratch + std::vector tl_qbuf(qbuf.size()); + std::vector tl_deq(deq.size()); + + for (;;) { + const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); + if (i >= compatible_candidates.size()) { break; } + + const ggml_type tt = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(t, tt); + const size_t bytes = total_bytes(t, tt); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, tl_qbuf, tl_deq); + cand_out[i] = candidate_types{ tt, bpw, bytes, err }; + } + }); + } + for (auto &th : eval_workers) { th.join(); } + + for (auto &c : cand_out) { + if (c.bytes > 0) { info.candidate.push_back(c); } } if (info.candidate.empty()) { From 887490c5ec3c679e8bc0c274b743b483e7c595e3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 15:11:49 +0100 Subject: [PATCH 031/155] Dequantise sampled rows only --- src/llama-quant.cpp | 73 ++++++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3cade0bf6f..547281bd7d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -610,7 +610,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * activations_data, const llama_model_quantize_params * params, int nthread, - int sample_rows_per_expert = 384, + int sample_rows_per_expert = 512, float bias_lambda = 1.0 ) { struct candidate_types { @@ -699,7 +699,7 @@ static std::unordered_map target_bpw_type( q &= name.find("attn_rel_b.weight") == std::string::npos; q &= !params->only_copy; // TODO: Exclude embeddings and output tensors? 
- q &= params->quantize_output_tensor || name != "output.weight"; + // q &= params->quantize_output_tensor || name != "output.weight"; q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; @@ -896,31 +896,35 @@ static std::unordered_map target_bpw_type( const int64_t nelem = ggml_nelements(t); std::vector> f32_conv_buf; - float * f32_data = nullptr; - - if (t->type == GGML_TYPE_F32) { - f32_data = (float *)t->data; - } else { - llama_tensor_dequantize_impl(t, f32_conv_buf, workers, nelem, nthread); - f32_data = (float *)f32_conv_buf.data(); - } - const float * values_all = get_values(name); const float * activations_all = get_activations(name); - // Sample the tensor rows once, before looping through quantization candidates. + // Dequantize only sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + + const ggml_type src_type = t->type; + const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); + const bool src_is_quant = ggml_is_quantized(src_type); + const size_t src_row_sz = ggml_row_size(src_type, n_per_row); + + std::vector f32_sample; + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); + + std::vector values_sample; + std::vector activations_sample; + std::vector sample_rows_per_slice(ne2, 0); + + // deterministic sampling seed based on tensor name + fixed constant + std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); + const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); const int64_t stride = std::max(1, nrows_total / sample_rows_max); - std::vector f32_sample; - std::vector values_sample; - std::vector activations_sample; - std::vector sample_rows_per_slice(ne2); + // Temporary buffer for one dequantized row + std::vector rowbuf((size_t)n_per_row); - std::mt19937 rng(std::random_device{}()); for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; int64_t offset = 0; @@ -928,10 +932,30 @@ static std::unordered_map target_bpw_type( std::uniform_int_distribution dist(0, stride - 1); offset = dist(rng); } + for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { - const float * src_row = f32_data + slice * (n_per_row * nrows_total) + r * n_per_row; - f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); - current_sampled_rows++; + if (src_type == GGML_TYPE_F32) { + const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; + f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); + } else if (src_type == GGML_TYPE_F16) { + const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + ggml_fp16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + } else if (src_type == GGML_TYPE_BF16) { + const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + ggml_bf16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + } else if (src_is_quant) { + const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + if (!src_traits || !src_traits->to_float) { + throw std::runtime_error(format("cannot dequantize type %s for 
sampling", ggml_type_name(src_type))); + } + src_traits->to_float(qrow, rowbuf.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + } else { + throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + } + ++current_sampled_rows; } sample_rows_per_slice[slice] = current_sampled_rows; } @@ -999,15 +1023,16 @@ static std::unordered_map target_bpw_type( max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); } - std::vector qbuf(max_row_sz * total_sampled_rows); - std::vector deq(f32_sample.size()); + std::sort(compatible_candidates.begin(), compatible_candidates.end()); + compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); // Now evaluate candidates std::vector cand_out(compatible_candidates.size()); const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); - - int n_eval_threads = std::max(1, nthread); + std::vector qbuf(max_row_sz * total_sampled_rows); + std::vector deq(f32_sample.size()); + int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; std::vector eval_workers; eval_workers.reserve(n_eval_threads); From 9e11f82e8f5ad29cb62cba0bab7014db17a0b2c2 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 16:25:31 +0100 Subject: [PATCH 032/155] Precompute error denominator in estimate_erro() --- src/llama-quant.cpp | 154 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 121 insertions(+), 33 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 547281bd7d..03f8a4bd11 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -598,8 +598,8 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * // Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl // sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute -// bias_lambda: Affects the weight of the bias term in the MSE error function. 0.0 means no bias, 1.0 means equal weight -// for bias and error, 2.0 means twice as much weight for bias +// bias_lambda: Affects the weight of the bias term in the weigthed MSE error function. 0.0 means no bias (standard MSE), +// 1.0 means equal weight for bias and error, 2.0 means twice as much weight for bias static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, @@ -658,7 +658,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, - // Add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it + // TODO: add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it? 
GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, @@ -770,7 +770,68 @@ static std::unordered_map target_bpw_type( if (qbuf.size() < need_q) { qbuf.resize(need_q); } if (deq.size() < nels) { deq.resize(nels); } - // Quantize sampled rows slice-by-slice + // Precompute denominators: + // - x2_per_row: sum_j w[j]*x[j]^2 if w present else sum_j x[j]^2 + // - bden_per_slice: sum_j w[j]*a[j]^2 if w & a present; sum_j a[j]^2 if only a present; 0 otherwise + std::vector x2_per_row(total_sampled_rows, 0.0); + std::vector bden_per_slice(ne2, 0.0); + + const bool has_w = (values_sample != nullptr); + const bool has_a = (activations_sample != nullptr); + + // Precompute bden per slice (depends only on w,a) + if (has_a) { + for (int64_t s = 0; s < ne2; ++s) { + const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + const float * act = activations_sample + s * n_per_row; + double bden = 0.0; + if (has_w) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = act[j]; + bden += (double) wv[j] * a * a; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = act[j]; + bden += a * a; + } + } + bden_per_slice[s] = bden; + } + } + + // Precompute x2 per sampled row + { + size_t off = 0; + size_t row_idx = 0; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = sample_rows_per_slice[s]; + if (rs == 0) { continue; } + + const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + + for (int64_t r = 0; r < rs; ++r, ++row_idx) { + const float * x = f32_sample.data() + off; + double x2 = 0.0; + if (has_w) { + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = wv[j]; + const double xx = x[j]; + x2 += w * xx * xx; + } + } else { + for (int64_t j = 0; j < n_per_row; ++j) { + const double xx = x[j]; + x2 += xx * xx; + } + } + x2_per_row[row_idx] = x2; + off += (size_t)n_per_row; + } + } + } + + // Quantize sampled rows slice-by-slice into qbuf size_t qoff = 0; size_t foff = 0; for (int64_t slice = 0; slice < ne2; ++slice) { @@ -784,43 +845,50 @@ static std::unordered_map target_bpw_type( foff += (size_t)rs * (size_t)n_per_row; } - // Dequantize into deq - if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); - } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); - } else { + // Dequantize into deq (row-wise if needed to avoid int overflow) + { const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); - return 1e35; - } + if (typ == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); + } else if (typ == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); + } else { + if (!traits || !traits->to_float) { + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); + return 1e35; + } - traits->to_float(qbuf.data(), deq.data(), (int) nels); + size_t done = 0; + while (done < nels) { + const size_t chunk = std::min((size_t)n_per_row, nels - done); + traits->to_float(qbuf.data() + done / n_per_row * row_sz, deq.data() + done, (int)chunk); + done += chunk; + } + } } // Compute error const double eps = 1e-12; size_t off = 0; + size_t row_idx = 0; double total_err = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { 
continue; } - const float * wv = values_sample ? values_sample + slice * n_per_row : nullptr; - const float * act = activations_sample ? activations_sample + slice * n_per_row : nullptr; + const float * wv = has_w ? values_sample + slice * n_per_row : nullptr; + const float * act = has_a ? activations_sample + slice * n_per_row : nullptr; + const double bden = has_a ? bden_per_slice[slice] : 0.0; double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r) { + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + off; const float * y = deq.data() + off; double mse_w = 0.0; - double x2_w = 0.0; double bnum = 0.0; - double bden = 0.0; if (wv && act) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -828,52 +896,49 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; const double a = act[j]; mse_w += w * e * e; - x2_w += w * x[j] * x[j]; - bnum += w * e * a; // weighted bias - bden += w * a * a; // weighted norm + bnum += w * e * a; } } else if (wv) { for (int64_t j = 0; j < n_per_row; ++j) { const double w = wv[j]; const double e = y[j] - x[j]; mse_w += w * e * e; - x2_w += w * x[j] * x[j]; } } else if (act) { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; const double a = act[j]; mse_w += e * e; - x2_w += x[j] * x[j]; bnum += e * a; - bden += a * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; mse_w += e * e; - x2_w += x[j] * x[j]; } } - double row_err = mse_w / (x2_w + eps); - + // corrected normalization: divide the full numerator by x2 + double numer = mse_w; if (act && bias_lambda != 0.0) { - // penalize squared projection of error onto activations - row_err += bias_lambda * (bnum * bnum) / (bden + eps); + const double proj = bnum * bnum / (bden + eps); + numer += bias_lambda * proj; } + const double denom = x2_per_row[row_idx] + eps; + const double row_err = numer / denom; + slice_err += row_err; off += (size_t)n_per_row; } - // scale to full rows in this slice (nrows) + // scale to full rows (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } return std::isfinite(total_err) ? total_err : 1e35; -}; + }; std::vector all; all.reserve(tensors.size()); @@ -1067,6 +1132,29 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } + // Remove dominated candidates: if A has >= bytes and >= error than B (and > in at least one), drop A. 
+ { + std::vector pruned; + pruned.reserve(info.candidate.size()); + // Sort by bytes asc, error asc + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + return a.error < b.error; + }); + + double best_err = std::numeric_limits::infinity(); + size_t last_bytes = std::numeric_limits::max(); + + for (const auto &c : info.candidate) { + if (c.error < best_err || c.bytes > last_bytes) { + pruned.push_back(c); + best_err = std::min(best_err, (double)c.error); + last_bytes = c.bytes; + } + } + info.candidate.swap(pruned); + } + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } From 5b6f1e9fde8dc6fd3456358c5b5c758b1f10b11c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 21 Aug 2025 19:18:54 +0100 Subject: [PATCH 033/155] General code refactor --- src/llama-quant.cpp | 415 +++++++++++++++++++++----------------------- 1 file changed, 196 insertions(+), 219 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 03f8a4bd11..85191a66ae 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,10 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor overrides of quantization types to meet target BPW with the lowest ppl -// sample_rows_per_expert: Larger values will result in more accurate error estimates, but will take longer to compute -// bias_lambda: Affects the weight of the bias term in the weigthed MSE error function. 0.0 means no bias (standard MSE), -// 1.0 means equal weight for bias and error, 2.0 means twice as much weight for bias +// Returns per-tensor type overrides to meet target BPW at lowest ppl static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, @@ -609,9 +606,7 @@ static std::unordered_map target_bpw_type( const std::unordered_map> * values_data, const std::unordered_map> * activations_data, const llama_model_quantize_params * params, - int nthread, - int sample_rows_per_expert = 512, - float bias_lambda = 1.0 + int nthread ) { struct candidate_types { ggml_type type; @@ -621,15 +616,15 @@ static std::unordered_map target_bpw_type( }; struct tensor_info { - const llama_model_loader::llama_tensor_weight * w; - std::vector candidate; + const llama_model_loader::llama_tensor_weight * w = nullptr; + std::vector candidate = {}; int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; size_t n_elements = 0; }; - const ggml_type k_candidates[] = { + constexpr ggml_type k_quants[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, GGML_TYPE_Q4_0, @@ -648,7 +643,7 @@ static std::unordered_map target_bpw_type( #endif }; - const ggml_type iq_candidates[] = { + constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, @@ -665,9 +660,49 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q6_K }; - auto name_tn = LLM_TN(model.arch); - float target_bpw = params->target_bpw; + auto get_values = [&](const std::string & tensor_name) -> const float * { + if (!values_data) { return nullptr; } + const auto it = values_data->find(remap_imatrix(tensor_name, mapped)); + if (it == values_data->end()) { return nullptr; } + return it->second.data(); + }; + auto get_activations = [&](const std::string & tensor_name) -> const float * { + if (!activations_data) { 
return nullptr; } + const auto it = activations_data->find(remap_imatrix(tensor_name, mapped)); + if (it == activations_data->end()) { return nullptr; } + return it->second.data(); + }; + + auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { + const int64_t n_per_row = t->ne[0]; + const int64_t nrows = t->ne[1]; + const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const size_t row_sz = ggml_row_size(typ, n_per_row); + return (size_t)ne2 * (size_t)nrows * row_sz; + }; + + auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { + const int64_t nelem = ggml_nelements(t); + const size_t bytes = tensor_bytes(t, typ); + return (double)bytes * 8.0 / (double)nelem; + }; + + auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { + const int64_t n_per_row = t->ne[0]; + const int64_t blck = ggml_blck_size(typ); + if (blck <= 1) { return true; } + return n_per_row % blck == 0; + }; + + auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { + if (is_compatible(t, typ)) { return typ; } + ggml_type fb = fallback_type(typ); + if (is_compatible(t, fb)) { return fb; } + return GGML_TYPE_F16; + }; + + auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; @@ -705,231 +740,182 @@ static std::unordered_map target_bpw_type( return q; }; - auto get_values = [&](const std::string & tensor_name) -> const float * { - if (!values_data) { return nullptr; } - const auto it = values_data->find(remap_imatrix(tensor_name, mapped)); - if (it == values_data->end()) { return nullptr; } - return it->second.data(); - }; - - auto get_activations = [&](const std::string & tensor_name) -> const float * { - if (!activations_data) { return nullptr; } - const auto it = activations_data->find(remap_imatrix(tensor_name, mapped)); - if (it == activations_data->end()) { return nullptr; } - return it->second.data(); - }; - - auto total_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { - const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t row_sz = ggml_row_size(typ, n_per_row); - return (size_t)ne2 * (size_t)nrows * row_sz; - }; - - auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { - const int64_t nelem = ggml_nelements(t); - const size_t bytes = total_bytes(t, typ); - return bytes * 8.0 / nelem; - }; - - auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { - const int64_t n_per_row = t->ne[0]; - const int64_t blck = ggml_blck_size(typ); - if (blck <= 1) { return true; } - return n_per_row % blck == 0; - }; - - auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) { return typ; } - ggml_type fb = fallback_type(typ); - if (is_compatible(t, fb)) { return fb; } - return GGML_TYPE_F16; - }; - // Estimate error for a given type using a sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, - const ggml_type typ, + const ggml_type quant_type, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, const float * values_sample, const float * activations_sample, - std::vector & qbuf, - std::vector & deq) -> double + std::vector & quantized_buffer, + std::vector & dequantized_buffer) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t nels = f32_sample.size(); - const size_t total_sampled_rows = nels / (size_t)n_per_row; - if (total_sampled_rows == 0) { return 0.0; } + const size_t sample_element_count = f32_sample.size(); + const size_t sample_row_count = sample_element_count / (size_t)n_per_row; + if (sample_row_count == 0) { return 0.0; } - const size_t row_sz = ggml_row_size(typ, n_per_row); - const size_t need_q = row_sz * total_sampled_rows; - if (qbuf.size() < need_q) { qbuf.resize(need_q); } - if (deq.size() < nels) { deq.resize(nels); } + const size_t row_size = ggml_row_size(quant_type, n_per_row); + const size_t buffer_size = row_size * sample_row_count; + if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } + if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - // Precompute denominators: - // - x2_per_row: sum_j w[j]*x[j]^2 if w present else sum_j x[j]^2 - // - bden_per_slice: sum_j w[j]*a[j]^2 if w & a present; sum_j a[j]^2 if only a present; 0 otherwise - std::vector x2_per_row(total_sampled_rows, 0.0); - std::vector bden_per_slice(ne2, 0.0); + std::vector row_sq_norm(sample_row_count, 0.0); + std::vector bias_denominator_per_slice(ne2, 0.0); - const bool has_w = (values_sample != nullptr); - const bool has_a = (activations_sample != nullptr); - - // Precompute bden per slice (depends only on w,a) - if (has_a) { + // Precompute bias denominator per slice + const bool has_values = (values_sample != nullptr); + const bool has_activations = (activations_sample != nullptr); + if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { - const float * wv = has_w ? values_sample + s * n_per_row : nullptr; - const float * act = activations_sample + s * n_per_row; - double bden = 0.0; - if (has_w) { + const float * values = has_values ? 
values_sample + s * n_per_row : nullptr; + const float * activations = activations_sample + s * n_per_row; + double bias_denominator = 0.0; + if (has_values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double a = act[j]; - bden += (double) wv[j] * a * a; + const double a = activations[j]; + bias_denominator += values[j] * a * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double a = act[j]; - bden += a * a; + const double a = activations[j]; + bias_denominator += a * a; } } - bden_per_slice[s] = bden; + bias_denominator_per_slice[s] = bias_denominator; } } - // Precompute x2 per sampled row + // Compute squared norms of sampled rows { - size_t off = 0; + size_t offset = 0; size_t row_idx = 0; for (int64_t s = 0; s < ne2; ++s) { const int64_t rs = sample_rows_per_slice[s]; if (rs == 0) { continue; } - const float * wv = has_w ? values_sample + s * n_per_row : nullptr; + const float * values = has_values ? values_sample + s * n_per_row : nullptr; for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + off; - double x2 = 0.0; - if (has_w) { + const float * row = f32_sample.data() + offset; + double rsn = 0.0; + if (has_values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; - const double xx = x[j]; - x2 += w * xx * xx; + const double v = values[j]; + const double x = row[j]; + rsn += v * x * x; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double xx = x[j]; - x2 += xx * xx; + const double x = row[j]; + rsn += x * x; } } - x2_per_row[row_idx] = x2; - off += (size_t)n_per_row; + row_sq_norm[row_idx] = rsn; + offset += (size_t)n_per_row; } } } - // Quantize sampled rows slice-by-slice into qbuf - size_t qoff = 0; - size_t foff = 0; + // Quantize sampled rows slice-by-slice into quantized_buffer + size_t quantised_offset = 0; + size_t floats_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } const float * value = values_sample ? 
values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(typ, f32_sample.data() + foff, qbuf.data() + qoff, 0, rs, n_per_row, value); + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + floats_offset, quantized_buffer.data() + quantised_offset, 0, rs, n_per_row, value); - qoff += row_sz * (size_t)rs; - foff += (size_t)rs * (size_t)n_per_row; + quantised_offset += row_size * (size_t)rs; + floats_offset += (size_t)rs * (size_t)n_per_row; } - // Dequantize into deq (row-wise if needed to avoid int overflow) + // Dequantize into dequantized_buffer { - const ggml_type_traits * traits = ggml_get_type_traits(typ); - if (typ == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)qbuf.data(), deq.data(), (int)nels); - } else if (typ == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)qbuf.data(), deq.data(), (int)nels); + const ggml_type_traits * traits = ggml_get_type_traits(quant_type); + if (quant_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); + } else if (quant_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); } else { if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(typ)); + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); return 1e35; } size_t done = 0; - while (done < nels) { - const size_t chunk = std::min((size_t)n_per_row, nels - done); - traits->to_float(qbuf.data() + done / n_per_row * row_sz, deq.data() + done, (int)chunk); + while (done < sample_element_count) { + const size_t chunk = std::min((size_t)n_per_row, sample_element_count - done); + traits->to_float(quantized_buffer.data() + done / n_per_row * row_size, dequantized_buffer.data() + done, (int)chunk); done += chunk; } } } // Compute error - const double eps = 1e-12; - size_t off = 0; + size_t offset = 0; size_t row_idx = 0; double total_err = 0.0; - for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } - const float * wv = has_w ? values_sample + slice * n_per_row : nullptr; - const float * act = has_a ? activations_sample + slice * n_per_row : nullptr; - const double bden = has_a ? bden_per_slice[slice] : 0.0; - + const float * values = has_values ? values_sample + slice * n_per_row : nullptr; + const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; + const double bias_denominator = has_activations ? 
bias_denominator_per_slice[slice] : 0.0; double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + off; - const float * y = deq.data() + off; - - double mse_w = 0.0; - double bnum = 0.0; - - if (wv && act) { + const float * x = f32_sample.data() + offset; + const float * y = dequantized_buffer.data() + offset; + double weighted_mse = 0.0; + double bias_numerator = 0.0; + if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; + const double v = values[j]; const double e = y[j] - x[j]; - const double a = act[j]; - mse_w += w * e * e; - bnum += w * e * a; + const double a = activations[j]; + weighted_mse += v * e * e; + bias_numerator += v * e * a; } - } else if (wv) { + } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = wv[j]; + const double v = values[j]; const double e = y[j] - x[j]; - mse_w += w * e * e; + weighted_mse += v * e * e; } - } else if (act) { + } else if (activations) { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; - const double a = act[j]; - mse_w += e * e; - bnum += e * a; + const double a = activations[j]; + weighted_mse += e * e; + bias_numerator += e * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; - mse_w += e * e; + weighted_mse += e * e; } } - // corrected normalization: divide the full numerator by x2 - double numer = mse_w; - if (act && bias_lambda != 0.0) { - const double proj = bnum * bnum / (bden + eps); - numer += bias_lambda * proj; + double err_numerator = weighted_mse; + constexpr double epsilon = 1e-12; + constexpr float bias_lambda = 1.0; + //bias_lambda defines the weight of the bias term in the weigthed MSE error function + // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, + // 2.0 means twice as much weight for bias, etc + if (activations && bias_lambda != 0.0) { + const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); + err_numerator += bias_lambda * proj; } - const double denom = x2_per_row[row_idx] + eps; - const double row_err = numer / denom; - + const double err_denominator = row_sq_norm[row_idx] + epsilon; + const double row_err = err_numerator / err_denominator; slice_err += row_err; - off += (size_t)n_per_row; + offset += (size_t)n_per_row; } // scale to full rows (nrows) @@ -942,14 +928,11 @@ static std::unordered_map target_bpw_type( std::vector all; all.reserve(tensors.size()); - for (const auto * tw : tensors) { std::vector workers; workers.reserve(std::max(1, nthread)); - ggml_tensor * t = tw->tensor; const std::string name = ggml_get_name(t); - if (!can_quantize(t)) { continue; } LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); @@ -959,37 +942,26 @@ static std::unordered_map target_bpw_type( } ml.load_data_for(t); - const int64_t nelem = ggml_nelements(t); - std::vector> f32_conv_buf; - const float * values_all = get_values(name); - const float * activations_all = get_activations(name); - // Dequantize only sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; + // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute + int sample_rows_per_expert = 512; + std::vector f32_sample; + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); + + // deterministic sampling seed based on tensor name + fixed constant + std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); + std::vector sample_rows_per_slice(ne2, 0); + const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); + const int64_t stride = std::max(1, nrows_total / sample_rows_max); + std::vector row_buffer(n_per_row); const ggml_type src_type = t->type; const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); - - std::vector f32_sample; - f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); - - std::vector values_sample; - std::vector activations_sample; - std::vector sample_rows_per_slice(ne2, 0); - - // deterministic sampling seed based on tensor name + fixed constant - std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); - - const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); - const int64_t stride = std::max(1, nrows_total / sample_rows_max); - - // Temporary buffer for one dequantized row - std::vector rowbuf((size_t)n_per_row); - for (int64_t slice = 0; slice < ne2; ++slice) { int64_t current_sampled_rows = 0; int64_t offset = 0; @@ -1004,19 +976,19 @@ static std::unordered_map target_bpw_type( f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_fp16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_bf16_to_fp32_row(src_row, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (!src_traits || !src_traits->to_float) { throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); } - src_traits->to_float(qrow, rowbuf.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), rowbuf.begin(), rowbuf.end()); + src_traits->to_float(qrow, row_buffer.data(), (int)n_per_row); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else { throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); } @@ -1045,6 +1017,10 @@ static std::unordered_map target_bpw_type( } }; + const float * values_all = get_values(name); + const float * activations_all = get_activations(name); + std::vector values_sample; + 
std::vector activations_sample; if (values_all) { // get size from the map (not just the raw pointer) auto itv = values_data->find(remap_imatrix(name, mapped)); @@ -1057,6 +1033,7 @@ static std::unordered_map target_bpw_type( copy_or_broadcast(activations_all, sz, activations_sample); } + const int64_t nelem = ggml_nelements(t); tensor_info info; info.w = tw; info.n_elements = nelem; @@ -1067,12 +1044,12 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) std::vector quant_candidates; if (is_iq(params->ftype)) { - quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates)); + quant_candidates.assign(std::begin(iq_quants), std::end(iq_quants)); } else { - quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates)); + quant_candidates.assign(std::begin(k_quants), std::end(k_quants)); } - // Compute maximum row size among compatible candidates (to size qbuf once) + // Compute maximum row size among compatible candidates (to size quantized_buffer once) size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; std::vector compatible_candidates; @@ -1092,21 +1069,20 @@ static std::unordered_map target_bpw_type( compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); // Now evaluate candidates - std::vector cand_out(compatible_candidates.size()); - const float *vals_ptr = values_sample.empty() ? nullptr : values_sample.data(); - const float *acts_ptr = activations_sample.empty() ? nullptr : activations_sample.data(); - std::vector qbuf(max_row_sz * total_sampled_rows); - std::vector deq(f32_sample.size()); + std::vector eval_candidates(compatible_candidates.size()); + const float *values = values_sample.empty() ? nullptr : values_sample.data(); + const float *activations = activations_sample.empty() ? 
nullptr : activations_sample.data(); + std::vector quantized_buffer(max_row_sz * total_sampled_rows); + std::vector dequantised_buffer(f32_sample.size()); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; std::vector eval_workers; eval_workers.reserve(n_eval_threads); - for (int ti = 0; ti < n_eval_threads; ++ti) { eval_workers.emplace_back([&] { // thread-local scratch - std::vector tl_qbuf(qbuf.size()); - std::vector tl_deq(deq.size()); + std::vector tl_quantized_buffer(quantized_buffer.size()); + std::vector tl_dequantised_buffer(dequantised_buffer.size()); for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); @@ -1114,15 +1090,16 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); - const size_t bytes = total_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, vals_ptr, acts_ptr, tl_qbuf, tl_deq); - cand_out[i] = candidate_types{ tt, bpw, bytes, err }; + const size_t bytes = tensor_bytes(t, tt); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer); + eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); } + for (auto &th : eval_workers) { th.join(); } - for (auto &c : cand_out) { + for (auto &c : eval_candidates) { if (c.bytes > 0) { info.candidate.push_back(c); } } @@ -1132,7 +1109,7 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - // Remove dominated candidates: if A has >= bytes and >= error than B (and > in at least one), drop A. + // Keep only the Pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. 
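+        // A dominated candidate offers no lower error for the same or larger size, so keeping it would only
+        // add useless steps to the greedy upgrade search further below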
{ std::vector pruned; pruned.reserve(info.candidate.size()); @@ -1155,36 +1132,37 @@ static std::unordered_map target_bpw_type( info.candidate.swap(pruned); } - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bpw != b.bpw) { return a.bpw < b.bpw; } - if (a.error != b.error) { return a.error < b.error; } - return a.bytes < b.bytes; - }); - // Collapse candidates with identical storage size (bytes) { - std::vector uniq; - uniq.reserve(info.candidate.size()); + std::vector unique; + unique.reserve(info.candidate.size()); + // Sort by bpw asc, error asc, bytes asc + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bpw != b.bpw) { return a.bpw < b.bpw; } + if (a.error != b.error) { return a.error < b.error; } + return a.bytes < b.bytes; + }); for (size_t i = 0; i < info.candidate.size();) { - size_t j = i + 1; + size_t j = i + 1; candidate_types best = info.candidate[i]; // group same-byte entries, keep the one with the lowest error while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) { - if (info.candidate[j].error < best.error) { best = info.candidate[j]; } + if (info.candidate[j].error < best.error) { + best = info.candidate[j]; + } ++j; } - uniq.push_back(best); + unique.push_back(best); i = j; } - info.candidate.swap(uniq); + info.candidate.swap(unique); } // Initialize choice at the smallest bpw candidate info.choice = 0; info.min_bpw = info.candidate.front().bpw; info.max_bpw = info.candidate.back().bpw; - all.push_back(std::move(info)); } @@ -1196,6 +1174,7 @@ static std::unordered_map target_bpw_type( for (const auto & ti : all) { b += ti.candidate[ti.choice].bytes; } + return b; }; @@ -1204,6 +1183,7 @@ static std::unordered_map target_bpw_type( for (const auto & ti : all) { w += ti.n_elements; } + return w; }; @@ -1215,12 +1195,14 @@ static std::unordered_map target_bpw_type( // Precompute current bpw double bpw_now = current_bpw(); + float target_bpw = params->target_bpw; // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw) if (bpw_now >= target_bpw) { std::unordered_map overrides; for (const auto & ti : all) { overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } + return overrides; } @@ -1268,6 +1250,7 @@ static std::unordered_map target_bpw_type( best = upgrade{ i, j, err, delta_bytes, ratio }; } } + return best; }; @@ -1286,16 +1269,12 @@ static std::unordered_map target_bpw_type( } } - // We might still be below target but taking any single upgrade overshoots. - // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. 
+ // We might still be below target so we try to find the best upgrade one last time { - double under_gap = target_bpw - bpw_now; - upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; double best_over_gap = 1e300; - + double under_gap = target_bpw - bpw_now; size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int) all.size(); ++i) { const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } @@ -1305,19 +1284,16 @@ static std::unordered_map target_bpw_type( const auto & cur = ti.candidate[ti.choice]; const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } size_t over_bytes = now_bytes + delta_bytes; double bpw_over = (double)over_bytes * 8.0 / (double)tw; - - double over_gap = std::abs(bpw_over - (double)target_bpw); - double err = cur.error - nxt.error; if (err < 0.0) { err = 0.0; } double ratio = err / (double)(delta_bytes * 8ull); + double over_gap = std::abs(bpw_over - (double)target_bpw); if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { best_over_gap = over_gap; best_over = upgrade{ i, j, err, delta_bytes, ratio }; @@ -1339,6 +1315,7 @@ static std::unordered_map target_bpw_type( __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } + return overrides; } From ec0afbe79ff001af56846365f91f97240bd2dbf4 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 01:46:09 +0100 Subject: [PATCH 034/155] Include embeddings and output tensors --- src/llama-quant.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 85191a66ae..b9e3c19a89 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -733,9 +733,6 @@ static std::unordered_map target_bpw_type( q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; q &= name.find("attn_rel_b.weight") == std::string::npos; q &= !params->only_copy; - // TODO: Exclude embeddings and output tensors? - // q &= params->quantize_output_tensor || name != "output.weight"; - q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); return q; }; From 35c1504441eb03b126b15a6ddd4625f094dc7dfe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:01:57 +0100 Subject: [PATCH 035/155] Fix byte count for 3d or higher tensors --- src/llama-quant.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b9e3c19a89..8cc5f221ea 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -676,10 +676,9 @@ static std::unordered_map target_bpw_type( auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; - const int64_t nrows = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - const size_t row_sz = ggml_row_size(typ, n_per_row); - return (size_t)ne2 * (size_t)nrows * row_sz; + const size_t row_sz = ggml_row_size(typ, n_per_row); + const int64_t nrows = ggml_nrows(t); + return (size_t)nrows * row_sz; }; auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { From bb0d912c1f93de2ef1af4ef9fb467c4862012898 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:02:56 +0100 Subject: [PATCH 036/155] Update comments --- src/llama-quant.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8cc5f221ea..4b846c7d0c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -703,6 +703,7 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { + // This list should be kept in sync with llama_tensor_quantize_impl() const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; q &= ggml_n_dims(t) >= 2; @@ -902,7 +903,7 @@ static std::unordered_map target_bpw_type( constexpr float bias_lambda = 1.0; //bias_lambda defines the weight of the bias term in the weigthed MSE error function // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, - // 2.0 means twice as much weight for bias, etc + // 2.0 means twice as much weight for bias, etc. Default is 1.0. if (activations && bias_lambda != 0.0) { const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); err_numerator += bias_lambda * proj; @@ -1192,7 +1193,7 @@ static std::unordered_map target_bpw_type( double bpw_now = current_bpw(); float target_bpw = params->target_bpw; - // If minimal bpw is already above the target, we're constrained by geometry; return closest (min bpw) + // If minimal bpw is already above the target, we're constrained by the tensor's shape; return closest (min bpw) if (bpw_now >= target_bpw) { std::unordered_map overrides; for (const auto & ti : all) { From 2f13fee795639841de46b8f415a233062aa5d2b8 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:05:55 +0100 Subject: [PATCH 037/155] Parameterise type --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4b846c7d0c..e5e27da509 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -760,8 +760,8 @@ static std::unordered_map target_bpw_type( if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - std::vector row_sq_norm(sample_row_count, 0.0); - std::vector bias_denominator_per_slice(ne2, 0.0); + std::vector row_sq_norm(sample_row_count, 0.0); + std::vector bias_denominator_per_slice(ne2, 0.0); // Precompute bias denominator per slice const bool has_values = (values_sample != nullptr); From 47cdbe21552324cd79b9243485eeb455cab4673a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:11:11 +0100 Subject: [PATCH 038/155] Reduce sampling window to speedup process --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e5e27da509..5460669e7c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -945,7 +945,7 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute - int sample_rows_per_expert = 512; + constexpr int sample_rows_per_expert = 384; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); From 01c927fb94163ddb36365323683274071c034690 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:14:14 +0100 Subject: [PATCH 039/155] Improve pareto efficient candidate selection --- src/llama-quant.cpp | 49 +++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5460669e7c..14d9087f53 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1106,56 +1106,35 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); } - // Keep only the Pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. + // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. { std::vector pruned; pruned.reserve(info.candidate.size()); - // Sort by bytes asc, error asc - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { + + // Sort by bytes ascending, error ascending + std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bytes != b.bytes) { return a.bytes < b.bytes; } return a.error < b.error; }); double best_err = std::numeric_limits::infinity(); size_t last_bytes = std::numeric_limits::max(); - - for (const auto &c : info.candidate) { - if (c.error < best_err || c.bytes > last_bytes) { - pruned.push_back(c); - best_err = std::min(best_err, (double)c.error); + for (const auto & c : info.candidate) { + // Only keep the best error seen so far at strictly larger byte sizes + if (c.bytes != last_bytes) { + // first time we see this byte size last_bytes = c.bytes; + if (c.error < best_err) { + pruned.push_back(c); + best_err = c.error; + } + } else { + // same bytes: we already sorted by error; skip } } info.candidate.swap(pruned); } - // Collapse candidates with identical storage size (bytes) - { - std::vector unique; - unique.reserve(info.candidate.size()); - // Sort by bpw asc, error asc, bytes asc - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bpw != b.bpw) { return a.bpw < b.bpw; } - if (a.error != b.error) { return a.error < b.error; } - return a.bytes < b.bytes; - }); - - for (size_t i = 0; i < info.candidate.size();) { - size_t j = i + 1; - candidate_types best = info.candidate[i]; - // group same-byte entries, keep the one with the lowest error - while (j < info.candidate.size() && info.candidate[j].bytes == info.candidate[i].bytes) { - if (info.candidate[j].error < best.error) { - best = info.candidate[j]; - } - ++j; - } - unique.push_back(best); - i = j; - } - info.candidate.swap(unique); - } - // Initialize choice at the smallest bpw candidate info.choice = 0; info.min_bpw = info.candidate.front().bpw; From 897decbe8a062ded079f1f1a866392571ed7f95f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:15:11 +0100 Subject: [PATCH 040/155] Show skipped IQ tensors --- src/llama-quant.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 
14d9087f53..c5c19f3c5f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1019,7 +1019,6 @@ static std::unordered_map target_bpw_type( std::vector values_sample; std::vector activations_sample; if (values_all) { - // get size from the map (not just the raw pointer) auto itv = values_data->find(remap_imatrix(name, mapped)); const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); copy_or_broadcast(values_all, sz, values_sample); @@ -1053,7 +1052,7 @@ static std::unordered_map target_bpw_type( compatible_candidates.reserve(quant_candidates.size()); for (ggml_type ts_type : quant_candidates) { if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping IQ quantization for %s, no or mismatched imatrix provided\n", __func__, name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type) , name.c_str()); continue; } ggml_type tt = make_compatible(t, ts_type); @@ -1214,13 +1213,11 @@ static std::unordered_map target_bpw_type( const auto & cur = ti.candidate[ti.choice]; const auto & nxt = ti.candidate[j]; - const size_t delta_bytes = nxt.bytes - cur.bytes; if (delta_bytes == 0) { continue; } double err = cur.error - nxt.error; err = std::max(err, 0.0); - double ratio = err / (double)(delta_bytes * 8ull); if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { best = upgrade{ i, j, err, delta_bytes, ratio }; From f05c8483d8b138c58a41ecdf32f95947bb130be5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 09:17:58 +0100 Subject: [PATCH 041/155] Improve dequantized_buffer fill --- src/llama-quant.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c5c19f3c5f..db4a0e1a20 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -843,12 +843,9 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); return 1e35; } - - size_t done = 0; - while (done < sample_element_count) { - const size_t chunk = std::min((size_t)n_per_row, sample_element_count - done); - traits->to_float(quantized_buffer.data() + done / n_per_row * row_size, dequantized_buffer.data() + done, (int)chunk); - done += chunk; + const size_t row_size = ggml_row_size(quant_type, n_per_row); + for (size_t r = 0; r < sample_row_count; ++r) { + traits->to_float(quantized_buffer.data() + r * row_size, dequantized_buffer.data() + r * n_per_row, (int)n_per_row); } } } From fea99d051ad3a9f3cce3cdf084074e0655f47e14 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 16:57:58 +0100 Subject: [PATCH 042/155] Refactor and combine lambdas --- src/llama-quant.cpp | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index db4a0e1a20..10993e89c6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -660,20 +660,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q6_K }; - auto get_values = [&](const std::string & tensor_name) -> const float * { - if (!values_data) { return nullptr; } - const auto it = values_data->find(remap_imatrix(tensor_name, mapped)); - if (it == values_data->end()) { return nullptr; } - return it->second.data(); - }; - - auto get_activations = [&](const std::string & tensor_name) -> const float * { - if (!activations_data) { return nullptr; } - const auto it = 
activations_data->find(remap_imatrix(tensor_name, mapped)); - if (it == activations_data->end()) { return nullptr; } - return it->second.data(); - }; - auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); @@ -991,6 +977,15 @@ static std::unordered_map target_bpw_type( sample_rows_per_slice[slice] = current_sampled_rows; } + auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { + if (!m) { return {nullptr, 0}; } + const std::string key = remap_imatrix(tensor_name, mapped); + const auto it = m->find(key); + if (it == m->end()) { return {nullptr, 0}; } + return { it->second.data(), it->second.size() }; + }; + + // Copy this row's side data (values and activations), or broadcasts to all slices auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { const size_t want = (size_t)ne2 * (size_t)n_per_row; dst.clear(); @@ -1005,26 +1000,17 @@ static std::unordered_map target_bpw_type( std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } } else { - // Mismatch – safer to skip using it for this tensor LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", __func__, name.c_str(), src_sz, (size_t)n_per_row, want); } }; - const float * values_all = get_values(name); - const float * activations_all = get_activations(name); + const auto [values_all, values_sz] = side_data(values_data, name); + const auto [activations_all, activations_sz] = side_data(activations_data, name); std::vector values_sample; std::vector activations_sample; - if (values_all) { - auto itv = values_data->find(remap_imatrix(name, mapped)); - const size_t sz = itv == values_data->end() ? 0 : itv->second.size(); - copy_or_broadcast(values_all, sz, values_sample); - } - if (activations_all) { - auto ita = activations_data->find(remap_imatrix(name, mapped)); - const size_t sz = ita == activations_data->end() ? 0 : ita->second.size(); - copy_or_broadcast(activations_all, sz, activations_sample); - } + if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } + if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } const int64_t nelem = ggml_nelements(t); tensor_info info; From 6d17889addf3aa18000334e1dd958111104cdf3e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 22 Aug 2025 16:58:46 +0100 Subject: [PATCH 043/155] Log if override is from tensor-type or from bpw-target --- src/llama-quant.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 10993e89c6..721deaddad 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1049,8 +1049,8 @@ static std::unordered_map target_bpw_type( // Now evaluate candidates std::vector eval_candidates(compatible_candidates.size()); - const float *values = values_sample.empty() ? nullptr : values_sample.data(); - const float *activations = activations_sample.empty() ? nullptr : activations_sample.data(); + const float * values = values_sample.empty() ? nullptr : values_sample.data(); + const float * activations = activations_sample.empty() ? 
nullptr : activations_sample.data(); std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantised_buffer(f32_sample.size()); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); @@ -1656,15 +1656,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); // get bpw override const auto override = bpw_overrides.find(name); - if (override != bpw_overrides.end()) { new_type = override->second; } - // unless the user specifies a type, and the tensor geometry will not require fallback quantisation + if (override != bpw_overrides.end() && override->second != new_type) { + LLAMA_LOG_DEBUG("(bpw overriding %s) ", ggml_type_name(new_type)); + new_type = override->second; + } + // unless the user specifies a type, and the tensor shape will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); const std::string tensor_name(tensor->name); for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (qtype != new_type) { - LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type)); + LLAMA_LOG_DEBUG("(type overriding %s) ", ggml_type_name(new_type)); new_type = qtype; // if two or more types are specified for the same tensor, the last match wins } } From 9a4b1154974d5ddbfb9d9d3f785f5a29bb202fac Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 01:08:01 +0100 Subject: [PATCH 044/155] Explicitly adding include --- src/llama-quant.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 721deaddad..d17b21d008 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -4,6 +4,7 @@ #include "llama-model-loader.h" #include +#include #include #include #include From f75265f55bb1d4470dea57f4c9e3ad108cc343a1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 01:08:37 +0100 Subject: [PATCH 045/155] Fix typo --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d17b21d008..6e3aa3f83d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1535,7 +1535,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f) { - LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this opearation may take some time\n", __func__, params->target_bpw); + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } From 73124a9921b967fe9e5afbb9f48924a3d48983a6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 02:17:22 +0100 Subject: [PATCH 046/155] Refactor estimate_error() --- src/llama-quant.cpp | 131 ++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 65 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6e3aa3f83d..3c358fb67e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -742,38 +742,33 @@ static std::unordered_map target_bpw_type( const size_t sample_row_count = sample_element_count / (size_t)n_per_row; if 
(sample_row_count == 0) { return 0.0; } - const size_t row_size = ggml_row_size(quant_type, n_per_row); - const size_t buffer_size = row_size * sample_row_count; - if (quantized_buffer.size() < buffer_size) { quantized_buffer.resize(buffer_size); } + const size_t row_sz = ggml_row_size(quant_type, n_per_row); + const size_t buffer_sz = row_sz * sample_row_count; + + if (quantized_buffer.size() < buffer_sz) { quantized_buffer.resize(buffer_sz); } if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } - std::vector row_sq_norm(sample_row_count, 0.0); - std::vector bias_denominator_per_slice(ne2, 0.0); + const bool has_values = values_sample != nullptr; + const bool has_activations = activations_sample != nullptr; - // Precompute bias denominator per slice - const bool has_values = (values_sample != nullptr); - const bool has_activations = (activations_sample != nullptr); + // Bias denominators per slice (only needed if we have activations) + std::vector bias_denominator_per_slice(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { const float * values = has_values ? values_sample + s * n_per_row : nullptr; const float * activations = activations_sample + s * n_per_row; - double bias_denominator = 0.0; - if (has_values) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double a = activations[j]; - bias_denominator += values[j] * a * a; - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const double a = activations[j]; - bias_denominator += a * a; - } + double denom = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double a = activations[j]; + const double w = values ? values[j] : 1.0; + denom += w * a * a; } - bias_denominator_per_slice[s] = bias_denominator; + bias_denominator_per_slice[s] = denom; } } - // Compute squared norms of sampled rows + // Compute per-row squared norms with weighting (if values are provided) + std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; size_t row_idx = 0; @@ -784,18 +779,18 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + s * n_per_row : nullptr; for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * row = f32_sample.data() + offset; + const float * x = f32_sample.data() + offset; double rsn = 0.0; - if (has_values) { + if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; - const double x = row[j]; - rsn += v * x * x; + const double v = values[j]; + const double xx = x[j]; + rsn += v * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { - const double x = row[j]; - rsn += x * x; + const double xx = x[j]; + rsn += xx * xx; } } row_sq_norm[row_idx] = rsn; @@ -805,35 +800,44 @@ static std::unordered_map target_bpw_type( } // Quantize sampled rows slice-by-slice into quantized_buffer - size_t quantised_offset = 0; - size_t floats_offset = 0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; - if (rs == 0) { continue; } + { + size_t q_offset = 0; + size_t f_offset = 0; + for (int64_t slice = 0; slice < ne2; ++slice) { + const int64_t rs = sample_rows_per_slice[slice]; + if (rs == 0) { continue; } - const float * value = values_sample ? values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(quant_type, f32_sample.data() + floats_offset, quantized_buffer.data() + quantised_offset, 0, rs, n_per_row, value); + const float * value = has_values ? 
values_sample + slice * n_per_row : nullptr; + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - quantised_offset += row_size * (size_t)rs; - floats_offset += (size_t)rs * (size_t)n_per_row; + q_offset += row_sz * (size_t)rs; + f_offset += (size_t)rs * (size_t)n_per_row; + } } // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (quant_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); - } else if (quant_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)quantized_buffer.data(), dequantized_buffer.data(), (int)sample_element_count); - } else { - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); - return 1e35; - } - const size_t row_size = ggml_row_size(quant_type, n_per_row); - for (size_t r = 0; r < sample_row_count; ++r) { - traits->to_float(quantized_buffer.data() + r * row_size, dequantized_buffer.data() + r * n_per_row, (int)n_per_row); + auto row_to_float = [&](size_t r) { + uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + if (quant_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (quant_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else { + if (!traits || !traits->to_float) { + LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); + return false; + } + traits->to_float(src, dst, (int)n_per_row); } + + return true; + }; + + for (size_t r = 0; r < sample_row_count; ++r) { + if (!row_to_float(r)) { return 1e35; } } } @@ -847,20 +851,22 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; - const double bias_denominator = has_activations ? bias_denominator_per_slice[slice] : 0.0; + const double bias_denom = has_activations ? 
bias_denominator_per_slice[slice] : 0.0; + double slice_err = 0.0; + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; double weighted_mse = 0.0; - double bias_numerator = 0.0; + double bias_num = 0.0; if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { const double v = values[j]; const double e = y[j] - x[j]; const double a = activations[j]; weighted_mse += v * e * e; - bias_numerator += v * e * a; + bias_num += v * e * a; } } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -873,7 +879,7 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; const double a = activations[j]; weighted_mse += e * e; - bias_numerator += e * a; + bias_num += e * a; } } else { for (int64_t j = 0; j < n_per_row; ++j) { @@ -882,24 +888,19 @@ static std::unordered_map target_bpw_type( } } - double err_numerator = weighted_mse; + constexpr float bias_lambda = 1.75f; constexpr double epsilon = 1e-12; - constexpr float bias_lambda = 1.0; - //bias_lambda defines the weight of the bias term in the weigthed MSE error function - // 0.0 means no bias (standard MSE) 1.0 means equal weight for bias and error, - // 2.0 means twice as much weight for bias, etc. Default is 1.0. - if (activations && bias_lambda != 0.0) { - const double proj = bias_numerator * bias_numerator / (bias_denominator + epsilon); - err_numerator += bias_lambda * proj; + double err_num = weighted_mse; + if (activations && bias_lambda != 0.0f) { + const double proj = bias_num * bias_num / (bias_denom + epsilon); + err_num += (double)bias_lambda * proj; } - const double err_denominator = row_sq_norm[row_idx] + epsilon; - const double row_err = err_numerator / err_denominator; - slice_err += row_err; + const double err_den = row_sq_norm[row_idx] + epsilon; + slice_err += err_num / err_den; offset += (size_t)n_per_row; } - // scale to full rows (nrows) const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; } From 68ae5e66cea41457a3ed11018374b64e2f94d3d3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 02:50:55 +0100 Subject: [PATCH 047/155] Improve list of candidate types --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3c358fb67e..392a23b5ca 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1023,21 +1023,20 @@ static std::unordered_map target_bpw_type( size_t total_sampled_rows = f32_sample.size() / n_per_row; // Build list of candidate types first (compatible ones) - std::vector quant_candidates; - if (is_iq(params->ftype)) { - quant_candidates.assign(std::begin(iq_quants), std::end(iq_quants)); - } else { - quant_candidates.assign(std::begin(k_quants), std::end(k_quants)); - } + const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; + const size_t base_sz = is_iq(params->ftype) ? 
sizeof(iq_quants) / sizeof(iq_quants[0]) : sizeof(k_quants) / sizeof(k_quants[0]); - // Compute maximum row size among compatible candidates (to size quantized_buffer once) size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; + std::vector compatible_candidates; - compatible_candidates.reserve(quant_candidates.size()); - for (ggml_type ts_type : quant_candidates) { + compatible_candidates.reserve(base_sz); + + for (size_t i = 0; i < base_sz; ++i) { + ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type) , name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", + __func__, ggml_type_name(ts_type), name.c_str()); continue; } ggml_type tt = make_compatible(t, ts_type); From decafae27060ed923c69ce3b89db505538a9b230 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 11:30:11 +0100 Subject: [PATCH 048/155] Adjust bias_lambda --- src/llama-quant.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 392a23b5ca..4ce651723f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -888,7 +888,9 @@ static std::unordered_map target_bpw_type( } } - constexpr float bias_lambda = 1.75f; + // abias_lambda djusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce a smaller bias even if the MSE is slightly larger + constexpr float bias_lambda = 1.5f; constexpr double epsilon = 1e-12; double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { @@ -1024,7 +1026,7 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; - const size_t base_sz = is_iq(params->ftype) ? sizeof(iq_quants) / sizeof(iq_quants[0]) : sizeof(k_quants) / sizeof(k_quants[0]); + const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); size_t max_row_sz = 0; const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; From 3856d60328349c5b2a4e381d6fdff20d272415ab Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 14:45:07 +0100 Subject: [PATCH 049/155] Restrict quant types per family --- src/llama-quant.cpp | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ce651723f..7615376e31 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -628,11 +628,7 @@ static std::unordered_map target_bpw_type( constexpr ggml_type k_quants[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, - GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, @@ -646,19 +642,12 @@ static std::unordered_map target_bpw_type( constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, - GGML_TYPE_IQ1_M, - GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, - // TODO: add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it? 
- GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 }; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { @@ -888,8 +877,8 @@ static std::unordered_map target_bpw_type( } } - // abias_lambda djusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce a smaller bias even if the MSE is slightly larger + // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger constexpr float bias_lambda = 1.5f; constexpr double epsilon = 1e-12; double err_num = weighted_mse; From 61c0e01f500ef2610904045c6a7852956c7ba6ba Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 24 Aug 2025 13:36:03 +0100 Subject: [PATCH 050/155] Execute bpw_overrides() only if an imatrix file is provided --- src/llama-quant.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7615376e31..4ed9454068 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1525,9 +1525,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::unordered_map bpw_overrides = {}; - if (params->target_bpw != -1.0f) { - LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); + if (params->target_bpw != -1.0f && !params->only_copy) { + if (params->imatrix) { + if (params->activations) { + LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n", __func__); + } else { + LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); + } + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); + bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); + } else { + LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__); + } } int cur_split = -1; From d4ac2106fb5b9e1a98d6aef8a0931e73e46f324e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 24 Aug 2025 13:39:10 +0100 Subject: [PATCH 051/155] Improve logging and some minor code refactoring --- src/llama-quant.cpp | 26 +++++++++++++++----------- tools/quantize/quantize.cpp | 7 +------ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ed9454068..407a63d887 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -132,7 +132,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map< for (const auto & p : mapped) { if (p.second == blk) { - LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first); return new_name.replace(match.position(1), match.length(1), std::to_string(p.first)); } } @@ -1257,7 +1256,7 @@ static std::unordered_map target_bpw_type( // Build the override map std::unordered_map overrides; - LLAMA_LOG_INFO("%s: - estimated tensor quantization mix to achieve %.4f bpw at lowest ppl\n", __func__, target_bpw); + LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", __func__); for (const auto & ti : all) { 
LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); @@ -1352,7 +1351,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { - LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(values_data->size())); + LLAMA_LOG_INFO("================================ Have weights data with %d entries",int(values_data->size())); qs.has_imatrix = true; // check imatrix for nans or infs for (const auto & kv : *values_data) { @@ -1367,7 +1366,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->activations) { activations_data = static_cast>*>(params->activations); if (activations_data) { - LLAMA_LOG_INFO("================================ Have activations data with %d entries\n",int(activations_data->size())); + LLAMA_LOG_INFO(" and %d activations",int(activations_data->size())); qs.has_activations = true; // check activations for nans or infs for (const auto & kv : *activations_data) { @@ -1379,6 +1378,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } + LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -1655,12 +1655,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (!params->pure && ggml_is_quantized(default_type)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - // get bpw override - const auto override = bpw_overrides.find(name); - if (override != bpw_overrides.end() && override->second != new_type) { - LLAMA_LOG_DEBUG("(bpw overriding %s) ", ggml_type_name(new_type)); - new_type = override->second; + + // get quantization type overrides targeting a given bits per weight budget + if (params->target_bpw != -1.0f && !bpw_overrides.empty()) { + const auto override = bpw_overrides.find(name); + if (override != bpw_overrides.end() && override->second != new_type) { + LLAMA_LOG_DEBUG("(bpw override %s) ", ggml_type_name(new_type)); + new_type = override->second; + } } + // unless the user specifies a type, and the tensor shape will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector & tensor_types = *static_cast *>(params->tensor_types); @@ -1668,7 +1672,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: for (const auto & [tname, qtype] : tensor_types) { if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { if (qtype != new_type) { - LLAMA_LOG_DEBUG("(type overriding %s) ", ggml_type_name(new_type)); + LLAMA_LOG_DEBUG("(type override %s) ", ggml_type_name(new_type)); new_type = qtype; // if two or more types are specified for the same tensor, the last match wins } } @@ -1699,7 +1703,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (values_data) { auto it = values_data->find(remap_imatrix(tensor->name, mapped)); if (it == values_data->end()) { - LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); + LLAMA_LOG_INFO("\n====== %s: did not find weights for %s, ", __func__, tensor->name); } else { if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { imatrix = it->second.data(); diff --git 
a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index b907008cb4..77fa6b90ce 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -399,12 +399,7 @@ static int prepare_imatrix(const std::string & imatrix_file, values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); } - if (!values_data.empty()) { - printf("%s: have %d importance matrix value entries\n", __func__, int(values_data.size())); - } - if (!activations_data.empty()) { - printf("%s: have %d importance matrix activation entries\n", __func__, int(activations_data.size())); - } + return m_last_call; } From 4286690019f21cae3abb92a7903c6675a3367e5e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 26 Aug 2025 21:39:40 +0100 Subject: [PATCH 052/155] Minor comment update --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 407a63d887..cbbfdedfbd 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor type overrides to meet target BPW at lowest ppl +// Returns per-tensor type overrides to meet target BPW at lowest error static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, From 04946114c9009cd04f665ed98b55304e376e19d3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:01:03 +0100 Subject: [PATCH 053/155] Refactor epsilon into a function-wide variable --- src/llama-quant.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cbbfdedfbd..da1267ddbc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -649,6 +649,8 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q8_0 }; + constexpr double epsilon = 1e-12; + auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); @@ -1193,7 +1195,7 @@ static std::unordered_map target_bpw_type( double err = cur.error - nxt.error; err = std::max(err, 0.0); double ratio = err / (double)(delta_bytes * 8ull); - if (ratio > best.ratio + eps || (std::abs(ratio - best.ratio) <= eps && delta_bytes < best.delta_bytes)) { + if (ratio > best.ratio + epsilon || (std::abs(ratio - best.ratio) <= epsilon && delta_bytes < best.delta_bytes)) { best = upgrade{ i, j, err, delta_bytes, ratio }; } } @@ -1208,7 +1210,7 @@ static std::unordered_map target_bpw_type( size_t now_bytes = current_total_bytes(); size_t next_bytes = now_bytes + up.delta_bytes; double bpw_next = (double)next_bytes * 8.0 / (double)tw; - if (bpw_next <= target_bpw + 1e-12) { + if (bpw_next <= target_bpw + epsilon) { all[up.idx].choice = up.next; bpw_now = bpw_next; } else { @@ -1241,7 +1243,7 @@ static std::unordered_map target_bpw_type( double ratio = err / (double)(delta_bytes * 8ull); double over_gap = std::abs(bpw_over - (double)target_bpw); - if (over_gap < best_over_gap - 1e-12 || (std::abs(over_gap - best_over_gap) <= 1e-12 && ratio > best_over.ratio)) { + if (over_gap < best_over_gap - epsilon || (std::abs(over_gap - best_over_gap) <= epsilon && ratio > best_over.ratio)) { best_over_gap = over_gap; best_over = upgrade{ i, j, err, delta_bytes, ratio }; } From 8df1d00ae4042a1eee38c1fc9ac06137d5ce5078 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:04:28 +0100 Subject: [PATCH 054/155] Add 
directional scaling --- src/llama-quant.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index da1267ddbc..a9621eab8e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -900,6 +900,27 @@ static std::unordered_map target_bpw_type( return std::isfinite(total_err) ? total_err : 1e35; }; + auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { + if (!activations) { return 1.0f; } + // Compute dominance = ||sqrt(v).*a||_2 / (RMS(a)*sqrt(sum(v))) + // If no values, use v=1 + double sum_v = 0.0; + double sum_aw2 = 0.0; + double sum_a2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double v = values ? std::max(0.0f, values[j]) : 1.0; + const double a = activations[j]; + sum_v += v; + sum_aw2 += v * a * a; + sum_a2 += a * a; + } + const double rms_a = std::sqrt(sum_a2 / std::max(1.0, (double)n_per_row)); + const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); + const double scale = denom > 0.0 ? std::sqrt(sum_aw2) / denom : 1.0; + + // Clamp to a reasonable range + return (float)std::clamp(scale, 0.5, 2.0); + }; std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { From 66aff8fa1ee1d34c7faaa0ff658a730a9554ef36 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:06:42 +0100 Subject: [PATCH 055/155] Add precise_lambda() --- src/llama-quant.cpp | 102 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a9621eab8e..662760fbe9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -921,6 +921,108 @@ static std::unordered_map target_bpw_type( // Clamp to a reasonable range return (float)std::clamp(scale, 0.5, 2.0); }; + + // Returns an adaptive lambda for this tensor using a small probe set + // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger + auto precise_lambda = [&](const ggml_tensor * t, + const std::vector & f32_sample, + const std::vector & sample_rows_per_slice, + const float * values, + const float * activations, + const std::vector & compatible_candidates) -> float + { + // No activations => no projection term + if (!activations) { return 0.0f; } + + // pick a tiny probe set: try to spread around mid-range types + std::vector probes; + probes.reserve(3); + auto push_if = [&](const ggml_type tiny) { + if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) { + probes.push_back(tiny); + } + }; + + // Prefer family-consistent probes; fall back to whatever exists + push_if(GGML_TYPE_Q4_K); + push_if(GGML_TYPE_Q3_K); + push_if(GGML_TYPE_Q5_K); + if (probes.empty() && !compatible_candidates.empty()) { + probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); + } + if (probes.size() == 1 && compatible_candidates.size() >= 2) { + probes.push_back(compatible_candidates.front()); + } + if (probes.empty()) { return 0.0f; } + + // Scratch buffers (reused) + const int64_t n_per_row = t->ne[0]; + const size_t total_sampled_rows = f32_sample.size() / n_per_row; + size_t max_row_sz = 0; + for (auto pt : probes) { + max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); + } + std::vector quantized_buffer(max_row_sz * total_sampled_rows); + std::vector 
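The dominance formula in directional_scale reads most naturally as a correlation check between importance and activation energy. With v the imatrix values and a the mean activations, it is equivalent to

    \text{scale} \;=\; \operatorname{clamp}\!\left(\frac{\lVert \sqrt{v}\odot a\rVert_2}{\sqrt{\sum_j v_j}\,\cdot\,\operatorname{RMS}(a)},\; 0.5,\; 2\right) \;=\; \operatorname{clamp}\!\left(\sqrt{\frac{\operatorname{mean}(v\,a^2)}{\operatorname{mean}(v)\cdot\operatorname{mean}(a^2)}},\; 0.5,\; 2\right)

so it sits at 1 whenever either v or a is flat and only rises above 1 when activation energy concentrates on high-importance channels (the epsilon guards in the code are dropped here for clarity).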
dequantized_buffer(f32_sample.size()); + + std::vector ratios; + ratios.reserve(probes.size()); + + for (const auto pt : probes) { + // err at lambda=0 => pure weighted MSE part + double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); + // err at lambda=1 => weighted MSE + projection penalty + const double err1 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 1.0f); + + const double p = std::max(0.0, err1 - err0); // projection term contribution + const double m = std::max(0.0, err0); // MSE term contribution + if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { + ratios.push_back(m / p); + } + } + + if (ratios.empty()) { return 0.0f; } + + std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); + double lambda = ratios[ratios.size() / 2]; + + // activations directional scale + const float scale = directional_scale(values, activations, n_per_row); + lambda *= scale; + + // clamp to safe range + lambda = std::clamp(lambda, 0.0, 8.0); + return (float)lambda; + }; + + auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { + if (!activations) { return 0.0f; } + double s = 0.0; + double s2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = values ? std::max(0.0f, values[j]) : 1.0; + const double aw = std::sqrt(w) * activations[j]; + const double aw2 = aw * aw; + s += aw2; + s2 += aw2 * aw2; + } + if (s2 <= 0.0) { return 0.0f; } + const auto d = (double)n_per_row; + //const double p = s * s / (d * s2 + epsilon); + //const double lambda = 8.0 * std::clamp(1.0 - p, 0.0, 1.0); + // Map p in (0,1] to lambda in [0,8] decreasing + double base = 1.0 - s * s / (d * s2 + epsilon); + base = std::clamp(base, 0.0, 1.0); + + // activations directional scale + const double scale = directional_scale(values, activations, n_per_row); + // clamp to safe range + const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; + + return (float)lambda; + }; + std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { From 556f6b04fed2092568e31948708af8102c9e5433 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 28 Aug 2025 16:08:08 +0100 Subject: [PATCH 056/155] Add --precise-lambda option --- include/llama.h | 1 + src/llama-quant.cpp | 27 +++++++++++++++++---------- tools/quantize/quantize.cpp | 6 +++++- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/llama.h b/include/llama.h index 01c5b67c75..3a5bda32ea 100644 --- a/include/llama.h +++ b/include/llama.h @@ -357,6 +357,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) + bool precise_lambda; // use precise_lambda calculation - slow computation but very accurate } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 662760fbe9..98fc11d840 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -722,7 +722,8 @@ static std::unordered_map target_bpw_type( const float * values_sample, const float * activations_sample, std::vector & quantized_buffer, - std::vector & dequantized_buffer) -> double + std::vector & dequantized_buffer, + float bias_lambda) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; @@ -878,10 +879,6 
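fast_lambda avoids probe quantization entirely and looks only at how concentrated the activation energy is; the directional_scale factor folded in at this stage is removed again in a later patch, so it is left out below. A toy check of the concentration heuristic (simplified names, not the patch code):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // s^2 / (d * s2) is a participation ratio of the per-channel energy w_j * a_j^2:
    // it is 1.0 when that energy is spread evenly and about 1/d when one channel dominates,
    // so lambda = 8 * (1 - ratio) stays near 0 for flat activations and approaches 8 for spiky ones.
    static float fast_lambda_sketch(const std::vector<float> & w, const std::vector<float> & a) {
        double s = 0.0, s2 = 0.0;
        const size_t d = a.size();
        for (size_t j = 0; j < d; ++j) {
            const double aw2 = std::max(0.0f, w[j]) * (double) a[j] * (double) a[j];
            s += aw2; s2 += aw2 * aw2;
        }
        if (s2 <= 0.0) return 0.0f;
        const double base = std::clamp(1.0 - s * s / ((double) d * s2 + 1e-12), 0.0, 1.0);
        return (float) (8.0 * base);
    }

    int main() {
        const std::vector<float> w(256, 1.0f);
        std::vector<float> flat(256, 1.0f), spiky(256, 0.01f);
        spiky[0] = 10.0f;                                            // one dominant channel
        std::printf("flat:  %.2f\n", fast_lambda_sketch(w, flat));   // ~0.00
        std::printf("spiky: %.2f\n", fast_lambda_sketch(w, spiky));  // ~7.97
        return 0;
    }

precise_lambda, by contrast, quantizes a few mid-range probe types and takes the median of the MSE-to-projection ratios, which picks a lambda that makes the two error terms comparable in size.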
@@ static std::unordered_map target_bpw_type( } } - // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger - constexpr float bias_lambda = 1.5f; - constexpr double epsilon = 1e-12; double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { const double proj = bias_num * bias_num / (bias_denom + epsilon); @@ -1163,6 +1160,15 @@ static std::unordered_map target_bpw_type( std::sort(compatible_candidates.begin(), compatible_candidates.end()); compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); + // Compute adaptive bias_lambda for this tensor + float bias_lambda = 0.0f; + { + const float * values = values_sample.empty() ? nullptr : values_sample.data(); + const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); + bias_lambda = params->precise_lambda ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates) : + fast_lambda(values, activations, n_per_row); + } + // Now evaluate candidates std::vector eval_candidates(compatible_candidates.size()); const float * values = values_sample.empty() ? nullptr : values_sample.data(); @@ -1186,7 +1192,7 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); const size_t bytes = tensor_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer); + const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); @@ -1301,7 +1307,6 @@ static std::unordered_map target_bpw_type( }; auto recompute_best_upgrade = [&]() -> upgrade { - const double eps = 1e-12; upgrade best{ -1, -1, 0.0, 0, -1.0 }; for (int i = 0; i < (int) all.size(); ++i) { const auto & ti = all[i]; @@ -1653,10 +1658,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { if (params->activations) { - LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n", __func__); + LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ",__func__); } else { - LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); + LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } + LLAMA_LOG_INFO("using %s\n", params->precise_lambda ? 
"precise lambda (slow)" : "fast lambda"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { @@ -1966,7 +1972,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, - /*.target_bpw =*/ -1.0f + /*.target_bpw =*/ -1.0f, + /*.precise_lambda =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 77fa6b90ce..0c9460513c 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,9 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0 \n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); + printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --precise-lambda: given a target bpw, use a high-precision error computation at the expense of longer processing times\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -538,6 +540,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--precise-lambda") == 0) { + params.precise_lambda = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From eab8708244db703c5c7219261b0c875c4b57825f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 30 Aug 2025 10:14:46 +0100 Subject: [PATCH 057/155] Minor factoring for efficiency and correctness --- src/llama-quant.cpp | 126 +++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 66 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 98fc11d840..db688fdf02 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -// Returns per-tensor type overrides to meet target BPW at lowest error +// Returns tensor type overrides to meet a global bpw target static std::unordered_map target_bpw_type( llama_model_loader & ml, std::vector> & buffer, @@ -650,6 +650,7 @@ static std::unordered_map target_bpw_type( }; constexpr double epsilon = 1e-12; + constexpr double infinity = std::numeric_limits::infinity(); auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; @@ -680,7 +681,7 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { - // This list should be kept in sync with llama_tensor_quantize_impl() + // This list should be kept in sync with 
llama_tensor_quantize_impl() to avoid drift const std::string name = ggml_get_name(t); bool q = name.rfind("weight") == name.size() - 6; q &= ggml_n_dims(t) >= 2; @@ -730,9 +731,15 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; const size_t sample_element_count = f32_sample.size(); - const size_t sample_row_count = sample_element_count / (size_t)n_per_row; + const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; if (sample_row_count == 0) { return 0.0; } + size_t expected_rows = 0; + for (int64_t s = 0; s < ne2; ++s) { + expected_rows += (size_t)sample_rows_per_slice[s]; + } + if (expected_rows != sample_row_count) { return infinity; } + const size_t row_sz = ggml_row_size(quant_type, n_per_row); const size_t buffer_sz = row_sz * sample_row_count; @@ -750,15 +757,15 @@ static std::unordered_map target_bpw_type( const float * activations = activations_sample + s * n_per_row; double denom = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { + const double w = values ? std::max(0.0f, values[j]) : 1.0; const double a = activations[j]; - const double w = values ? values[j] : 1.0; denom += w * a * a; } bias_denominator_per_slice[s] = denom; } } - // Compute per-row squared norms with weighting (if values are provided) + // Per-row squared norms with weighting std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; @@ -768,15 +775,14 @@ static std::unordered_map target_bpw_type( if (rs == 0) { continue; } const float * values = has_values ? values_sample + s * n_per_row : nullptr; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; double rsn = 0.0; if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double xx = x[j]; - rsn += v * xx * xx; + rsn += w * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { @@ -790,7 +796,7 @@ static std::unordered_map target_bpw_type( } } - // Quantize sampled rows slice-by-slice into quantized_buffer + // Quantize sampled rows per slice -> quantized_buffer { size_t q_offset = 0; size_t f_offset = 0; @@ -800,35 +806,32 @@ static std::unordered_map target_bpw_type( const float * value = has_values ? 
values_sample + slice * n_per_row : nullptr; (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - q_offset += row_sz * (size_t)rs; f_offset += (size_t)rs * (size_t)n_per_row; } } - // Dequantize into dequantized_buffer + // quantized_buffer -> dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - auto row_to_float = [&](size_t r) { - uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - if (quant_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (quant_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - } else { - if (!traits || !traits->to_float) { - LLAMA_LOG_WARN("%s: unsupported quantization type %s\n", __func__, ggml_type_name(quant_type)); - return false; + + const bool is_fp16 = quant_type == GGML_TYPE_F16; + const bool is_bf16 = quant_type == GGML_TYPE_BF16; + if (!is_fp16 && !is_bf16 && traits && traits->to_float) { + traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_row_count * (size_t)n_per_row)); + } else { + for (size_t r = 0; r < sample_row_count; ++r) { + uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; + if (is_fp16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); + } else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); + } else { + if (!traits || !traits->to_float) { return infinity; } + traits->to_float(src, dst, (int)n_per_row); } - traits->to_float(src, dst, (int)n_per_row); } - - return true; - }; - - for (size_t r = 0; r < sample_row_count; ++r) { - if (!row_to_float(r)) { return 1e35; } } } @@ -836,6 +839,7 @@ static std::unordered_map target_bpw_type( size_t offset = 0; size_t row_idx = 0; double total_err = 0.0; + for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } @@ -843,9 +847,7 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; const double bias_denom = has_activations ? 
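The quantize/dequantize round trip at the heart of estimate_error can be exercised on its own. A minimal per-row probe along those lines, using the same ggml entry points the patch calls; this is a sketch rather than code from the patch, and the block-size and trait assumptions are stated in the comment:

    #include "ggml.h"

    #include <cstdint>
    #include <vector>

    // Assumes n is a multiple of the block size of `type` (256 for the k-quants) and that the
    // type has a to_float trait; imatrix may be nullptr for types that do not require one.
    static double row_rel_error(enum ggml_type type, const float * x, int64_t n, const float * imatrix) {
        std::vector<uint8_t> q(ggml_row_size(type, n));
        std::vector<float>   y((size_t) n);

        ggml_quantize_chunk(type, x, q.data(), /*start =*/ 0, /*nrows =*/ 1, /*n_per_row =*/ n, imatrix);
        ggml_get_type_traits(type)->to_float(q.data(), y.data(), n);

        double num = 0.0, den = 0.0;
        for (int64_t j = 0; j < n; ++j) {
            const double e = (double) y[j] - (double) x[j];
            num += e * e;
            den += (double) x[j] * x[j];
        }
        return num / (den + 1e-12);   // relative squared error for this row
    }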
bias_denominator_per_slice[slice] : 0.0; - double slice_err = 0.0; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; @@ -853,17 +855,17 @@ static std::unordered_map target_bpw_type( double bias_num = 0.0; if (values && activations) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double e = y[j] - x[j]; const double a = activations[j]; - weighted_mse += v * e * e; - bias_num += v * e * a; + weighted_mse += w * e * e; + bias_num += w * e * a; } } else if (values) { for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values[j]; + const double w = std::max(0.0f, values[j]); const double e = y[j] - x[j]; - weighted_mse += v * e * e; + weighted_mse += w * e * e; } } else if (activations) { for (int64_t j = 0; j < n_per_row; ++j) { @@ -881,26 +883,28 @@ static std::unordered_map target_bpw_type( double err_num = weighted_mse; if (activations && bias_lambda != 0.0f) { - const double proj = bias_num * bias_num / (bias_denom + epsilon); - err_num += (double)bias_lambda * proj; + if (bias_denom > 0.0) { + const double proj = bias_num * bias_num / (bias_denom + epsilon); + err_num += bias_lambda * proj; + } } - const double err_den = row_sq_norm[row_idx] + epsilon; - slice_err += err_num / err_den; + const double denom = row_sq_norm[row_idx] + epsilon; + slice_err += err_num / denom; offset += (size_t)n_per_row; } const double scale_rows = (double)nrows / std::max(1.0, (double)rs); total_err += slice_err * scale_rows; + if (!std::isfinite(total_err)) { return infinity; } } - return std::isfinite(total_err) ? total_err : 1e35; + return std::isfinite(total_err) ? total_err : infinity; }; + // Scaling factor to increase lambda when activations are concentrated auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { if (!activations) { return 1.0f; } - // Compute dominance = ||sqrt(v).*a||_2 / (RMS(a)*sqrt(sum(v))) - // If no values, use v=1 double sum_v = 0.0; double sum_aw2 = 0.0; double sum_a2 = 0.0; @@ -915,13 +919,10 @@ static std::unordered_map target_bpw_type( const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); const double scale = denom > 0.0 ? 
std::sqrt(sum_aw2) / denom : 1.0; - // Clamp to a reasonable range return (float)std::clamp(scale, 0.5, 2.0); }; - // Returns an adaptive lambda for this tensor using a small probe set - // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger + // Higher precision but much longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, @@ -929,10 +930,8 @@ static std::unordered_map target_bpw_type( const float * activations, const std::vector & compatible_candidates) -> float { - // No activations => no projection term if (!activations) { return 0.0f; } - // pick a tiny probe set: try to spread around mid-range types std::vector probes; probes.reserve(3); auto push_if = [&](const ggml_type tiny) { @@ -941,7 +940,6 @@ static std::unordered_map target_bpw_type( } }; - // Prefer family-consistent probes; fall back to whatever exists push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q3_K); push_if(GGML_TYPE_Q5_K); @@ -953,19 +951,18 @@ static std::unordered_map target_bpw_type( } if (probes.empty()) { return 0.0f; } - // Scratch buffers (reused) + // Scratch buffers const int64_t n_per_row = t->ne[0]; const size_t total_sampled_rows = f32_sample.size() / n_per_row; size_t max_row_sz = 0; for (auto pt : probes) { max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); } + std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantized_buffer(f32_sample.size()); - std::vector ratios; ratios.reserve(probes.size()); - for (const auto pt : probes) { // err at lambda=0 => pure weighted MSE part double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); @@ -984,17 +981,17 @@ static std::unordered_map target_bpw_type( std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); double lambda = ratios[ratios.size() / 2]; - // activations directional scale const float scale = directional_scale(values, activations, n_per_row); lambda *= scale; - - // clamp to safe range lambda = std::clamp(lambda, 0.0, 8.0); + return (float)lambda; }; + // Faster to compute but lower precision. Best option for the vast majority of models auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { if (!activations) { return 0.0f; } + double s = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { @@ -1004,17 +1001,13 @@ static std::unordered_map target_bpw_type( s += aw2; s2 += aw2 * aw2; } + if (s2 <= 0.0) { return 0.0f; } const auto d = (double)n_per_row; - //const double p = s * s / (d * s2 + epsilon); - //const double lambda = 8.0 * std::clamp(1.0 - p, 0.0, 1.0); - // Map p in (0,1] to lambda in [0,8] decreasing double base = 1.0 - s * s / (d * s2 + epsilon); base = std::clamp(base, 0.0, 1.0); - // activations directional scale const double scale = directional_scale(values, activations, n_per_row); - // clamp to safe range const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; return (float)lambda; @@ -1036,13 +1029,13 @@ static std::unordered_map target_bpw_type( } ml.load_data_for(t); - // Dequantize only sampled rows into f32_sample + // Dequantize sampled rows into f32_sample const int64_t n_per_row = t->ne[0]; const int64_t nrows_total = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? 
t->ne[2] : 1; - // Larger sample_rows_per_expert values may result in more accurate error estimates, but will take longer to compute - constexpr int sample_rows_per_expert = 384; + // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute + constexpr int sample_rows_per_expert = 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); @@ -1096,6 +1089,7 @@ static std::unordered_map target_bpw_type( const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); if (it == m->end()) { return {nullptr, 0}; } + return { it->second.data(), it->second.size() }; }; @@ -1104,7 +1098,6 @@ static std::unordered_map target_bpw_type( const size_t want = (size_t)ne2 * (size_t)n_per_row; dst.clear(); if (!src || src_sz == 0) { return; } - if (src_sz == want) { dst.resize(want); std::memcpy(dst.data(), src, want * sizeof(float)); @@ -1160,7 +1153,8 @@ static std::unordered_map target_bpw_type( std::sort(compatible_candidates.begin(), compatible_candidates.end()); compatible_candidates.erase(std::unique(compatible_candidates.begin(), compatible_candidates.end()), compatible_candidates.end()); - // Compute adaptive bias_lambda for this tensor + // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. + // Larger values favours quantisation types that produce smaller bias even if the MSE is slightly bigger float bias_lambda = 0.0f; { const float * values = values_sample.empty() ? nullptr : values_sample.data(); From 04c07b3272f067ba30d32fb82d693fb0013cc47d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 10 Sep 2025 18:00:56 +0100 Subject: [PATCH 058/155] Add better control over MSE and directional bias computation --- include/llama.h | 2 +- src/llama-quant.cpp | 41 +++++++++---------------------------- tools/quantize/quantize.cpp | 31 +++++++++++++++++++++++++--- 3 files changed, 39 insertions(+), 35 deletions(-) diff --git a/include/llama.h b/include/llama.h index d0ca37dc65..ba6c185346 100644 --- a/include/llama.h +++ b/include/llama.h @@ -365,7 +365,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) - bool precise_lambda; // use precise_lambda calculation - slow computation but very accurate + int32_t bpw_bias; // type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow) } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index db688fdf02..74ceb3de9c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -902,26 +902,6 @@ static std::unordered_map target_bpw_type( return std::isfinite(total_err) ? total_err : infinity; }; - // Scaling factor to increase lambda when activations are concentrated - auto directional_scale = [&](const float * values, const float * activations, int64_t n_per_row) { - if (!activations) { return 1.0f; } - double sum_v = 0.0; - double sum_aw2 = 0.0; - double sum_a2 = 0.0; - for (int64_t j = 0; j < n_per_row; ++j) { - const double v = values ? 
std::max(0.0f, values[j]) : 1.0; - const double a = activations[j]; - sum_v += v; - sum_aw2 += v * a * a; - sum_a2 += a * a; - } - const double rms_a = std::sqrt(sum_a2 / std::max(1.0, (double)n_per_row)); - const double denom = std::sqrt(std::max(epsilon, sum_v)) * std::max(epsilon, rms_a); - const double scale = denom > 0.0 ? std::sqrt(sum_aw2) / denom : 1.0; - - return (float)std::clamp(scale, 0.5, 2.0); - }; - // Higher precision but much longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, @@ -979,11 +959,7 @@ static std::unordered_map target_bpw_type( if (ratios.empty()) { return 0.0f; } std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); - double lambda = ratios[ratios.size() / 2]; - - const float scale = directional_scale(values, activations, n_per_row); - lambda *= scale; - lambda = std::clamp(lambda, 0.0, 8.0); + const double lambda = std::clamp(ratios[ratios.size() / 2], 0.0, 8.0); return (float)lambda; }; @@ -1007,8 +983,7 @@ static std::unordered_map target_bpw_type( double base = 1.0 - s * s / (d * s2 + epsilon); base = std::clamp(base, 0.0, 1.0); - const double scale = directional_scale(values, activations, n_per_row); - const double lambda = std::clamp(base * scale, 0.0, 1.0) * 8.0; + const double lambda = std::clamp(base, 0.0, 1.0) * 8.0; return (float)lambda; }; @@ -1159,8 +1134,11 @@ static std::unordered_map target_bpw_type( { const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - bias_lambda = params->precise_lambda ? precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates) : - fast_lambda(values, activations, n_per_row); + if (params->bpw_bias == 1) { + bias_lambda = fast_lambda(values, activations, n_per_row); + } else if (params->bpw_bias == 2) { + bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates); + } } // Now evaluate candidates @@ -1656,7 +1634,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } - LLAMA_LOG_INFO("using %s\n", params->precise_lambda ? "precise lambda (slow)" : "fast lambda"); + const char* msg[] = {"no bias (MSE only)", "fast (default)", "precise (slow)"}; + LLAMA_LOG_INFO("using %s error estimation\n", msg[params->bpw_bias]); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { @@ -1967,7 +1946,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, - /*.precise_lambda =*/ false + /*.bpw_bias =*/ 1 }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0c9460513c..0fe65daea0 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -134,7 +134,7 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 16.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --precise-lambda: given a target bpw, use a high-precision error computation at the expense of longer processing times\n"); + printf(" --bpw_bias: type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow)\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -496,6 +496,27 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { return true; } +static bool parse_bpw_bias(const char * data, int & bpw_bias) { + if (!data) { + printf("\n%s: error bias type not provided\n\n", __func__); + return false; + } + + try { + bpw_bias = std::stoi(data); + if (bpw_bias < 0 || bpw_bias > 2) { + printf("\n%s: error bias type must be one of 0 (no bias, MSE only), 1 (fast), or 2 (precise, but slow)\n\n", __func__); + return false; + } + } + catch (const std::exception & e) { + printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); + return false; + } + + return true; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -510,6 +531,7 @@ int main(int argc, char ** argv) { std::vector tensor_types; std::vector prune_layers; float target_bpw = -1.0f; + int bpw_bias = 1; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -540,8 +562,11 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--precise-lambda") == 0) { - params.precise_lambda = true; + } else if (strcmp(argv[arg_idx], "--bpw-bias") == 0) { + if (arg_idx == argc-1 || !parse_bpw_bias(argv[++arg_idx], bpw_bias)) { + usage(argv[0]); + } + params.bpw_bias = bpw_bias; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From 886536d80ab5c227cd6c3f8813b8b5fbf5bea41d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:27:23 +0100 Subject: [PATCH 059/155] Increase error type precision --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 74ceb3de9c..c4c525c68e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -612,7 +612,7 @@ static std::unordered_map target_bpw_type( ggml_type type; float bpw; size_t bytes; - float error; + double error; }; struct tensor_info { From bc8762f27f185c5db1cbd0d8ec3bcc8e1771856d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:33:22 +0100 Subject: [PATCH 060/155] Capture surrounding function name --- src/llama-quant.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c4c525c68e..cae908803b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -651,6 +651,7 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); + const char * func = __func__; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { 
const int64_t n_per_row = t->ne[0]; @@ -1083,7 +1084,7 @@ static std::unordered_map target_bpw_type( } } else { LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", - __func__, name.c_str(), src_sz, (size_t)n_per_row, want); + func, name.c_str(), src_sz, (size_t)n_per_row, want); } }; From 4dff85fbe54336130155a8e4fa5e7f4db48f4451 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:41:37 +0100 Subject: [PATCH 061/155] Improve precise_lambda() efficiency --- src/llama-quant.cpp | 126 ++++++++++++++++++++++++++++++-------------- 1 file changed, 86 insertions(+), 40 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cae908803b..1677b242d9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -725,7 +725,9 @@ static std::unordered_map target_bpw_type( const float * activations_sample, std::vector & quantized_buffer, std::vector & dequantized_buffer, - float bias_lambda) -> double + float bias_lambda, + double * out_mse = nullptr, + double * out_proj = nullptr) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; @@ -733,13 +735,23 @@ static std::unordered_map target_bpw_type( const size_t sample_element_count = f32_sample.size(); const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; - if (sample_row_count == 0) { return 0.0; } + if (sample_row_count == 0) { + if (out_mse) { *out_mse = 0.0; } + if (out_proj) { *out_proj = 0.0; } + + return 0.0; + } size_t expected_rows = 0; for (int64_t s = 0; s < ne2; ++s) { expected_rows += (size_t)sample_rows_per_slice[s]; } - if (expected_rows != sample_row_count) { return infinity; } + if (expected_rows != sample_row_count) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } const size_t row_sz = ggml_row_size(quant_type, n_per_row); const size_t buffer_sz = row_sz * sample_row_count; @@ -750,7 +762,7 @@ static std::unordered_map target_bpw_type( const bool has_values = values_sample != nullptr; const bool has_activations = activations_sample != nullptr; - // Bias denominators per slice (only needed if we have activations) + // Bias denominators per slice std::vector bias_denominator_per_slice(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { @@ -815,7 +827,6 @@ static std::unordered_map target_bpw_type( // quantized_buffer -> dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - const bool is_fp16 = quant_type == GGML_TYPE_F16; const bool is_bf16 = quant_type == GGML_TYPE_BF16; if (!is_fp16 && !is_bf16 && traits && traits->to_float) { @@ -825,12 +836,19 @@ static std::unordered_map target_bpw_type( uint8_t * src = quantized_buffer.data() + r * row_sz; float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; if (is_fp16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); - } else if (is_bf16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); - } else { - if (!traits || !traits->to_float) { return infinity; } - traits->to_float(src, dst, (int)n_per_row); + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int) n_per_row); + } + else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int) n_per_row); + } + else { + if (!traits || !traits->to_float) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } + traits->to_float(src, dst, (int) n_per_row); } } } @@ -839,8 
+857,8 @@ static std::unordered_map target_bpw_type( // Compute error size_t offset = 0; size_t row_idx = 0; - double total_err = 0.0; - + double total_mse = 0.0; + double total_proj = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { const int64_t rs = sample_rows_per_slice[slice]; if (rs == 0) { continue; } @@ -848,7 +866,11 @@ static std::unordered_map target_bpw_type( const float * values = has_values ? values_sample + slice * n_per_row : nullptr; const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; const double bias_denom = has_activations ? bias_denominator_per_slice[slice] : 0.0; - double slice_err = 0.0; + std::vector row_mse_norm; + std::vector row_proj_norm; + row_mse_norm.reserve(rs); + if (activations) { row_proj_norm.reserve(rs); } + for (int64_t r = 0; r < rs; ++r, ++row_idx) { const float * x = f32_sample.data() + offset; const float * y = dequantized_buffer.data() + offset; @@ -868,13 +890,6 @@ static std::unordered_map target_bpw_type( const double e = y[j] - x[j]; weighted_mse += w * e * e; } - } else if (activations) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = y[j] - x[j]; - const double a = activations[j]; - weighted_mse += e * e; - bias_num += e * a; - } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double e = y[j] - x[j]; @@ -882,28 +897,64 @@ static std::unordered_map target_bpw_type( } } - double err_num = weighted_mse; - if (activations && bias_lambda != 0.0f) { + const double denom_x = row_sq_norm[row_idx]; + double m_norm = weighted_mse / (denom_x + epsilon); + row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); + + if (activations) { + double p_norm = 0.0; if (bias_denom > 0.0) { const double proj = bias_num * bias_num / (bias_denom + epsilon); - err_num += bias_lambda * proj; + p_norm = std::isfinite(proj) ? proj : 0.0; } + row_proj_norm.push_back(p_norm); } - - const double denom = row_sq_norm[row_idx] + epsilon; - slice_err += err_num / denom; offset += (size_t)n_per_row; } + // Trimmed sum to avoid outlier rows dominating the results + auto trimmed_sum = [&](std::vector & v) -> double { + if (v.empty()) { return 0.0; } + const int64_t n = (int64_t)v.size(); + if (n < 50) { + double s = 0.0; + for (const double z : v) { s += z; } + return s; + } + + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side + k = std::max(0, std::min(k, n / 32)); // but not more than 3.125% + std::nth_element(v.begin(), v.begin() + k, v.end()); + std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + double s = 0.0; + for (int64_t i = k; i < n - k; ++i) { + s += v[i]; + } + + return s; + }; + const double scale_rows = (double)nrows / std::max(1.0, (double)rs); - total_err += slice_err * scale_rows; - if (!std::isfinite(total_err)) { return infinity; } + + total_mse += trimmed_sum(row_mse_norm) * scale_rows; + if (activations) { total_proj += trimmed_sum(row_proj_norm) * scale_rows; } + + if (!std::isfinite(total_mse) || !std::isfinite(total_proj)) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + + return infinity; + } } + if (out_mse) { *out_mse = total_mse; } + if (out_proj) { *out_proj = total_proj; } + + const double total_err = total_mse + bias_lambda * total_proj; return std::isfinite(total_err) ? 
total_err : infinity; }; - // Higher precision but much longer to compute + // Higher precision but longer to compute auto precise_lambda = [&](const ggml_tensor * t, const std::vector & f32_sample, const std::vector & sample_rows_per_slice, @@ -936,22 +987,17 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const size_t total_sampled_rows = f32_sample.size() / n_per_row; size_t max_row_sz = 0; - for (auto pt : probes) { - max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); - } + for (auto pt : probes) max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantized_buffer(f32_sample.size()); + std::vector ratios; ratios.reserve(probes.size()); for (const auto pt : probes) { - // err at lambda=0 => pure weighted MSE part - double err0 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f); - // err at lambda=1 => weighted MSE + projection penalty - const double err1 = estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 1.0f); - - const double p = std::max(0.0, err1 - err0); // projection term contribution - const double m = std::max(0.0, err0); // MSE term contribution + double m = 0.0; + double p = 0.0; + (void)estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f, &m, &p); if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { ratios.push_back(m / p); } From 7d85993f268d9fa35bea9178f6acf2d72833dffa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 08:44:41 +0100 Subject: [PATCH 062/155] Minor refactoring --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1677b242d9..15ea36721e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -617,7 +617,7 @@ static std::unordered_map target_bpw_type( struct tensor_info { const llama_model_loader::llama_tensor_weight * w = nullptr; - std::vector candidate = {}; + std::vector candidate; int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; @@ -972,8 +972,8 @@ static std::unordered_map target_bpw_type( } }; - push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q3_K); + push_if(GGML_TYPE_Q4_K); push_if(GGML_TYPE_Q5_K); if (probes.empty() && !compatible_candidates.empty()) { probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); @@ -1011,7 +1011,7 @@ static std::unordered_map target_bpw_type( return (float)lambda; }; - // Faster to compute but lower precision. Best option for the vast majority of models + // Faster to compute but may yield lower precision. Best option for the vast majority of cases auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { if (!activations) { return 0.0f; } @@ -1057,12 +1057,10 @@ static std::unordered_map target_bpw_type( const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute - constexpr int sample_rows_per_expert = 256; + const int sample_rows_per_expert = activations_data ? 
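The trimmed sum keeps a handful of pathological rows from dominating a tensor's error estimate: with 1,000 sampled rows, for example, k = min(floor(0.02*1000), 1000/32) = 20 rows are dropped from each end. A standalone illustration of the effect (a full sort stands in for the nth_element partitioning used in the patch):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    static double trimmed_sum(std::vector<double> v) {
        const long long n = (long long) v.size();
        if (n < 50) { double s = 0.0; for (double z : v) s += z; return s; }
        const long long k = std::min((long long) (0.02 * (double) n), n / 32);  // at most ~3.1% per side
        std::sort(v.begin(), v.end());
        double s = 0.0;
        for (long long i = k; i < n - k; ++i) s += v[i];
        return s;
    }

    int main() {
        std::vector<double> rows(1000, 1e-3);
        rows[500] = 50.0;                                 // one pathological row
        double plain = 0.0;
        for (double r : rows) plain += r;
        std::printf("plain: %.3f  trimmed: %.3f\n", plain, trimmed_sum(rows));  // 50.999 vs 0.960
        return 0;
    }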
512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); - // deterministic sampling seed based on tensor name + fixed constant - std::mt19937 rng(std::hash{}(name) ^0xeabada55cafed00d); std::vector sample_rows_per_slice(ne2, 0); const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); const int64_t stride = std::max(1, nrows_total / sample_rows_max); @@ -1072,6 +1070,7 @@ static std::unordered_map target_bpw_type( const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); for (int64_t slice = 0; slice < ne2; ++slice) { + std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); int64_t current_sampled_rows = 0; int64_t offset = 0; if (stride > 1) { @@ -1084,11 +1083,11 @@ static std::unordered_map target_bpw_type( const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { - const ggml_fp16_t * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { - const ggml_bf16_t * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { @@ -1211,7 +1210,7 @@ static std::unordered_map target_bpw_type( const ggml_type tt = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(t, tt); const size_t bytes = tensor_bytes(t, tt); - const auto err = (float)estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); + const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; } }); @@ -1240,7 +1239,7 @@ static std::unordered_map target_bpw_type( return a.error < b.error; }); - double best_err = std::numeric_limits::infinity(); + double best_err = infinity; size_t last_bytes = std::numeric_limits::max(); for (const auto & c : info.candidate) { // Only keep the best error seen so far at strictly larger byte sizes From 12e816b51199b38a6571141d5f1e5f1039ebe706 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 09:24:23 +0100 Subject: [PATCH 063/155] Replace greedy allocator with lagrangian relaxation --- src/llama-quant.cpp | 278 ++++++++++++++++++++++++++------------------ 1 file changed, 162 insertions(+), 116 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 15ea36721e..a369d50ffe 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1266,152 +1266,198 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } - // Greedy allocation from minimum bpw upward to reach 
target_bpw - auto current_total_bytes = [&]() -> size_t { - size_t b = 0; + // Lagrangian relaxation to minimise error subject to a bpw target constraint + auto total_bytes = [&]() -> size_t { + size_t tb = 0; for (const auto & ti : all) { - b += ti.candidate[ti.choice].bytes; + tb += ti.candidate[ti.choice].bytes; } - return b; + return tb; }; - auto total_weights = [&]() -> size_t { - size_t w = 0; - for (const auto & ti : all) { - w += ti.n_elements; - } + size_t total_elems = 0; + size_t min_bytes = 0; + size_t max_bytes = 0; + for (const auto & ti : all) { + total_elems += (size_t)ti.n_elements; + min_bytes += ti.candidate.front().bytes; // smallest candidate per tensor + max_bytes += ti.candidate.back().bytes; // largest candidate per tensor + } - return w; - }; + if (total_elems == 0) { return {}; } - const size_t tw = total_weights(); - auto current_bpw = [&]() -> double { - return (double)current_total_bytes() * 8.0f / (double)tw; - }; + const double target_bpw = params->target_bpw; + size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); - // Precompute current bpw - double bpw_now = current_bpw(); - - float target_bpw = params->target_bpw; - // If minimal bpw is already above the target, we're constrained by the tensor's shape; return closest (min bpw) - if (bpw_now >= target_bpw) { + auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; + LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func); for (const auto & ti : all) { + LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", + func, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; } return overrides; + }; + + if (budget_bytes <= min_bytes) { + for (auto & ti : all) { ti.choice = 0; } + + return emit_overrides(); + } + if (budget_bytes >= max_bytes) { + for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; } + + return emit_overrides(); } - struct upgrade { - int idx; - int next; - double err; - size_t delta_bytes; - double ratio; - }; - - // Find next strictly-larger candidate index for a tensor - auto next_distinct_idx = [&](const tensor_info & ti) -> int { - const auto & cand = ti.candidate; - const auto & cur = cand[ti.choice]; - int j = ti.choice + 1; - while (j < (int)cand.size() && cand[j].bytes == cur.bytes) { - ++j; - } - - return j < (int)cand.size() ? 
j : -1; - }; - - auto recompute_best_upgrade = [&]() -> upgrade { - upgrade best{ -1, -1, 0.0, 0, -1.0 }; - for (int i = 0; i < (int) all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - - const int j = next_distinct_idx(ti); - if (j < 0) { continue; } - - const auto & cur = ti.candidate[ti.choice]; - const auto & nxt = ti.candidate[j]; - const size_t delta_bytes = nxt.bytes - cur.bytes; - if (delta_bytes == 0) { continue; } - - double err = cur.error - nxt.error; - err = std::max(err, 0.0); - double ratio = err / (double)(delta_bytes * 8ull); - if (ratio > best.ratio + epsilon || (std::abs(ratio - best.ratio) <= epsilon && delta_bytes < best.delta_bytes)) { - best = upgrade{ i, j, err, delta_bytes, ratio }; + auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { + choice.resize(all.size()); + bytes = 0; + err = 0.0; + for (size_t i = 0; i < all.size(); ++i) { + const auto & cand = all[i].candidate; + int best_j = 0; + double best_val = infinity; + for (int j = 0; j < (int)cand.size(); ++j) { + const double bits = (double)cand[j].bytes * 8.0; + const double val = cand[j].error + mu * bits; + if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && cand[j].bytes < cand[best_j].bytes)) { + best_val = val; + best_j = j; + } } - } - return best; + choice[i] = best_j; + bytes += cand[best_j].bytes; + err += cand[best_j].error; + } }; - while (true) { - upgrade up = recompute_best_upgrade(); - if (up.idx < 0) { break; } + size_t bytes_lo = 0; + size_t bytes_hi = 0; + size_t bytes_mid = 0; + double mu_lo = 0.0; + double mu_hi = 1.0; + double err_lo = 0.0; + double err_hi = 0.0; + double err_mid = 0.0; + std::vector choice_lo; + std::vector choice_hi; + std::vector choice_mid; + std::vector best_under_choice; + std::vector best_over_choice; - size_t now_bytes = current_total_bytes(); - size_t next_bytes = now_bytes + up.delta_bytes; - double bpw_next = (double)next_bytes * 8.0 / (double)tw; - if (bpw_next <= target_bpw + epsilon) { - all[up.idx].choice = up.next; - bpw_now = bpw_next; - } else { - break; - } - } + lagrange_penalty(mu_lo, choice_lo, bytes_lo, err_lo); - // We might still be below target so we try to find the best upgrade one last time + // increase mu until we get under budget or hit a safety cap { - upgrade best_over{ -1, -1, 0.0, 0, -1.0 }; - double best_over_gap = 1e300; - double under_gap = target_bpw - bpw_now; - size_t now_bytes = current_total_bytes(); - for (int i = 0; i < (int) all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { continue; } - - int j = next_distinct_idx(ti); - if (j < 0) { continue; } - - const auto & cur = ti.candidate[ti.choice]; - const auto & nxt = ti.candidate[j]; - size_t delta_bytes = nxt.bytes - cur.bytes; - if (delta_bytes == 0) { continue; } - - size_t over_bytes = now_bytes + delta_bytes; - double bpw_over = (double)over_bytes * 8.0 / (double)tw; - double err = cur.error - nxt.error; - if (err < 0.0) { err = 0.0; } - double ratio = err / (double)(delta_bytes * 8ull); - - double over_gap = std::abs(bpw_over - (double)target_bpw); - if (over_gap < best_over_gap - epsilon || (std::abs(over_gap - best_over_gap) <= epsilon && ratio > best_over.ratio)) { - best_over_gap = over_gap; - best_over = upgrade{ i, j, err, delta_bytes, ratio }; + int expand = 0; + while (true) { + lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); + if (bytes_hi <= budget_bytes) { + break; } - } - - if 
(best_over.idx >= 0) { - if (best_over_gap < under_gap) { - all[best_over.idx].choice = best_over.next; + mu_hi *= 2.0; + if (++expand > 60) { + break; } } } - // Build the override map - std::unordered_map overrides; - LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", __func__); - for (const auto & ti : all) { - LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", - __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error); - overrides[ggml_get_name(ti.w->tensor)] = ti.candidate[ti.choice].type; + double best_under_gap = infinity; + double best_over_gap = infinity; + double best_under_err = infinity; + double best_over_err = infinity; + for (int it = 0; it < 40; ++it) { + double mu = 0.5 * (mu_lo + mu_hi); + lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); + + const double gap = std::abs((double)bytes_mid - (double)budget_bytes); + + if (bytes_mid > budget_bytes) { + // Too big, need stronger penalty + mu_lo = mu; + + if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) { + best_over_gap = gap; + best_over_err = err_mid; + best_over_choice = choice_mid; + } + } else { + // Under budget, good candidate + mu_hi = mu; + + if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) { + best_under_gap = gap; + best_under_err = err_mid; + best_under_choice = choice_mid; + } + } } - return overrides; + if (!best_under_choice.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = best_under_choice[i]; + } + } else if (!best_over_choice.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = best_over_choice[i]; + } + } else { + // Pick whichever side we already have, or keep minimal + if (bytes_hi <= budget_bytes && !choice_hi.empty()) { + for (size_t i = 0; i < all.size(); ++i) { + all[i].choice = choice_hi[i]; + } + } else { + for (auto & ti : all) { + ti.choice = 0; + } + } + } + + // Spend any remaining budget with best upgrades that still fit (one pass) + { + auto cur_bytes = total_bytes(); + while (true) { + int best_i = -1; + int best_j = -1; + double best_ratio = -1.0; + size_t best_delta = 0; + + for (int i = 0; i < (int)all.size(); ++i) { + const auto & ti = all[i]; + if (ti.choice >= (int)ti.candidate.size() - 1) { + continue; + } + + int j = ti.choice + 1; + while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } + if (j >= (int)ti.candidate.size()) { continue; } + + size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; + if (cur_bytes + delta > budget_bytes) { continue; } + + double err_gain = std::max(0.0, (double)ti.candidate[ti.choice].error - (double)ti.candidate[j].error); + double ratio = err_gain / (double)(delta * 8); + if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { + best_ratio = ratio; + best_delta = delta; + best_i = i; + best_j = j; + } + } + + if (best_i < 0) { break; } + all[best_i].choice = best_j; + cur_bytes += best_delta; + } + } + + return emit_overrides(); } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { From 2b516068e2ef0e51373be32b1917eb7295bcfc54 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 09:41:52 +0100 Subject: [PATCH 064/155] "Convexify" candidate list --- 
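PATCH 063 above replaces the greedy upgrade loop with a Lagrangian relaxation: for a fixed multiplier mu every tensor independently picks the candidate that minimises error + mu * bits, and mu is then bisected until the selected mix fits the byte budget. PATCH 064 below trims each candidate list so the relaxation works on a well-behaved curve. A rough, self-contained sketch of the allocation idea only, using hypothetical cand_t/tensor_t structures in place of the patch's candidate_types/tensor_info (not the patch's actual code):

    #include <vector>

    struct cand_t   { double bits;  double error; };               // hypothetical: bits = bytes * 8 for the whole tensor
    struct tensor_t { std::vector<cand_t> cand; int choice = 0; };  // candidates sorted by size, assumed non-empty

    // For a fixed multiplier mu, every tensor independently minimises error + mu * bits.
    static double sweep(std::vector<tensor_t> & all, double mu) {
        double total_bits = 0.0;
        for (auto & t : all) {
            int best = 0;
            double best_val = t.cand[0].error + mu * t.cand[0].bits;
            for (int j = 1; j < (int) t.cand.size(); ++j) {
                const double val = t.cand[j].error + mu * t.cand[j].bits;
                if (val < best_val) { best_val = val; best = j; }
            }
            t.choice = best;
            total_bits += t.cand[best].bits;
        }
        return total_bits;
    }

    // Bisect mu so the chosen mix fits the budget: a larger mu penalises size more.
    static void allocate(std::vector<tensor_t> & all, double budget_bits) {
        double lo = 0.0, hi = 1.0;
        while (sweep(all, hi) > budget_bits && hi < 1e12) { hi *= 2.0; }   // expand until under budget
        for (int it = 0; it < 40; ++it) {
            const double mid = 0.5 * (lo + hi);
            if (sweep(all, mid) > budget_bits) { lo = mid; } else { hi = mid; }
        }
        sweep(all, hi);   // re-apply the last under-budget assignment into t.choice
    }

The real patch additionally remembers the best under- and over-budget assignments seen during the bisection and then spends any leftover bytes greedily, but the core selection rule is the same.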
src/llama-quant.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a369d50ffe..955e6c12fe 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1257,6 +1257,32 @@ static std::unordered_map target_bpw_type( info.candidate.swap(pruned); } + // Enforce convexity in (bytes, error) curve + { + const auto & c = info.candidate; + if (c.size() >= 3) { + std::vector convex; + convex.reserve(c.size()); + auto slope = [](const candidate_types & a, const candidate_types & b) -> double { + const double dx = (double)b.bytes - (double)a.bytes; + if (dx <= 0.0) { return infinity; } + + return ((double)b.error - (double)a.error) / dx; + }; + + for (const auto & p : c) { + while (convex.size() >= 2) { + double s1 = slope(convex[convex.size() - 2], convex[convex.size() - 1]); + double s2 = slope(convex[convex.size() - 1], p); + if (s2 + epsilon < s1) { convex.pop_back(); } + else { break; } + } + convex.push_back(p); + } + info.candidate.swap(convex); + } + } + // Initialize choice at the smallest bpw candidate info.choice = 0; info.min_bpw = info.candidate.front().bpw; From 8503d59ee44bc30b0d030cceb5e17590b334730d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 13 Sep 2025 11:49:18 +0100 Subject: [PATCH 065/155] Increase IQ options --- src/llama-quant.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 955e6c12fe..41fd819f86 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -641,12 +641,21 @@ static std::unordered_map target_bpw_type( constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, + GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, + GGML_TYPE_IQ4_NL, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q8_0, + // TODO: find better way to handle F16/BF16 +#ifdef GGML_USE_METAL + GGML_TYPE_F16 +#else + GGML_TYPE_BF16 +#endif }; constexpr double epsilon = 1e-12; From c709e1a3353cbefbe58320c2eae1a1edafc0f618 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 14 Sep 2025 22:38:27 +0100 Subject: [PATCH 066/155] Fix MoE tensor estimation --- src/llama-quant.cpp | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 41fd819f86..1efb1c5eee 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1021,27 +1021,38 @@ static std::unordered_map target_bpw_type( }; // Faster to compute but may yield lower precision. Best option for the vast majority of cases - auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row) { + auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) { if (!activations) { return 0.0f; } - double s = 0.0; - double s2 = 0.0; - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = values ? std::max(0.0f, values[j]) : 1.0; - const double aw = std::sqrt(w) * activations[j]; - const double aw2 = aw * aw; - s += aw2; - s2 += aw2 * aw2; + double accum = 0.0; + int ns = 0; + + for (int64_t s = 0; s < std::max(1, ne2); ++s) { + const float * v = values ? values + s * n_per_row : nullptr; + const float * a = activations + s * n_per_row; + + double s1 = 0.0; + double s2 = 0.0; + for (int64_t j = 0; j < n_per_row; ++j) { + const double w = v ? 
std::max(0.0f, v[j]) : 1.0; + const double aw = std::sqrt(w) * a[j]; + const double aw2 = aw * aw; + s1 += aw2; + s2 += aw2 * aw2; + } + + if (s1 > 0.0) { + const double n = (double)n_per_row; + double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); + double lambda = 8.0 * (c / (c + 1.0)); + accum += std::clamp(lambda, 0.0, 8.0); + ++ns; + } } - if (s2 <= 0.0) { return 0.0f; } - const auto d = (double)n_per_row; - double base = 1.0 - s * s / (d * s2 + epsilon); - base = std::clamp(base, 0.0, 1.0); + if (ns == 0) { return 0.0f; } - const double lambda = std::clamp(base, 0.0, 1.0) * 8.0; - - return (float)lambda; + return (float)(accum / ns); }; std::vector all; @@ -1190,7 +1201,7 @@ static std::unordered_map target_bpw_type( const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); if (params->bpw_bias == 1) { - bias_lambda = fast_lambda(values, activations, n_per_row); + bias_lambda = fast_lambda(values, activations, n_per_row, ne2); } else if (params->bpw_bias == 2) { bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates); } From 14fae69a7bb932fadbc5dd62072a254866512650 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 20 Sep 2025 21:31:31 +0100 Subject: [PATCH 067/155] General refactoring --- src/llama-quant.cpp | 75 +++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c6051a480c..6e5562379c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -729,19 +729,19 @@ static std::unordered_map target_bpw_type( auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, const std::vector & f32_sample, - const std::vector & sample_rows_per_slice, + const std::vector & rows_sample, const float * values_sample, const float * activations_sample, std::vector & quantized_buffer, std::vector & dequantized_buffer, - float bias_lambda, + float tensor_bias_lambda, + const float * slice_bias_lambda, double * out_mse = nullptr, double * out_proj = nullptr) -> double { const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t sample_element_count = f32_sample.size(); const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; if (sample_row_count == 0) { @@ -753,8 +753,9 @@ static std::unordered_map target_bpw_type( size_t expected_rows = 0; for (int64_t s = 0; s < ne2; ++s) { - expected_rows += (size_t)sample_rows_per_slice[s]; + expected_rows += (size_t)rows_sample[s]; } + if (expected_rows != sample_row_count) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } @@ -783,17 +784,18 @@ static std::unordered_map target_bpw_type( const double a = activations[j]; denom += w * a * a; } + bias_denominator_per_slice[s] = denom; } } - // Per-row squared norms with weighting + // Weighted per-row squared norms std::vector row_sq_norm(sample_row_count, 0.0); { size_t offset = 0; size_t row_idx = 0; for (int64_t s = 0; s < ne2; ++s) { - const int64_t rs = sample_rows_per_slice[s]; + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } const float * values = has_values ? 
values_sample + s * n_per_row : nullptr; @@ -823,7 +825,7 @@ static std::unordered_map target_bpw_type( size_t q_offset = 0; size_t f_offset = 0; for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; + const int64_t rs = rows_sample[slice]; if (rs == 0) { continue; } const float * value = has_values ? values_sample + slice * n_per_row : nullptr; @@ -843,21 +845,19 @@ static std::unordered_map target_bpw_type( } else { for (size_t r = 0; r < sample_row_count; ++r) { uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t) n_per_row; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; if (is_fp16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int) n_per_row); - } - else if (is_bf16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int) n_per_row); - } - else { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (is_bf16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else { if (!traits || !traits->to_float) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } return infinity; } - traits->to_float(src, dst, (int) n_per_row); + traits->to_float(src, dst, (int)n_per_row); } } } @@ -1098,20 +1098,20 @@ static std::unordered_map target_bpw_type( offset = dist(rng); } - for (int64_t r = offset; r < nrows_total && current_sampled_rows < sample_rows_max; r += stride) { + for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { if (src_type == GGML_TYPE_F32) { - const float * src_row = (const float *)t->data + slice * (n_per_row * nrows_total) + r * n_per_row; + const float * src_row = (const float *)tensor->data + slice * (n_per_row * nrows_total) + r * n_per_row; f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); } else if (src_type == GGML_TYPE_F16) { - const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_type == GGML_TYPE_BF16) { - const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); + const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); } else if (src_is_quant) { - const uint8_t * qrow = (const uint8_t *)t->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + const uint8_t * qrow = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (!src_traits || !src_traits->to_float) { throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); } @@ -1120,9 +1120,11 @@ static std::unordered_map target_bpw_type( } else { throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); } + ++current_sampled_rows; } - sample_rows_per_slice[slice] = current_sampled_rows; + + rows_sample[slice] = current_sampled_rows; } auto side_data = [&](const std::unordered_map> * m, 
const std::string & tensor_name) -> std::pair { @@ -1160,7 +1162,7 @@ static std::unordered_map target_bpw_type( if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } - const int64_t nelem = ggml_nelements(t); + const int64_t nelem = ggml_nelements(tensor); tensor_info info; info.w = tw; info.n_elements = nelem; @@ -1185,8 +1187,9 @@ static std::unordered_map target_bpw_type( __func__, ggml_type_name(ts_type), name.c_str()); continue; } - ggml_type tt = make_compatible(t, ts_type); - if (!is_compatible(t, tt)) { continue; } + + ggml_type tt = make_compatible(tensor, ts_type); + if (!is_compatible(tensor, tt)) { continue; } compatible_candidates.push_back(tt); max_row_sz = std::max(max_row_sz, ggml_row_size(tt, n_per_row)); } @@ -1222,16 +1225,16 @@ static std::unordered_map target_bpw_type( // thread-local scratch std::vector tl_quantized_buffer(quantized_buffer.size()); std::vector tl_dequantised_buffer(dequantised_buffer.size()); - for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); if (i >= compatible_candidates.size()) { break; } - const ggml_type tt = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(t, tt); - const size_t bytes = tensor_bytes(t, tt); - const auto err = estimate_error(t, tt, f32_sample, sample_rows_per_slice, values, activations, tl_quantized_buffer, tl_dequantised_buffer, bias_lambda); - eval_candidates[i] = candidate_types{ tt, bpw, bytes, err }; + const ggml_type tensor_types = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(tensor, tensor_types); + const size_t bytes = tensor_bytes(tensor, tensor_types); + const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, + tl_quantized_buffer, tl_dequantised_buffer, tensor_lambda, slice_lambda); + eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err }; } }); } @@ -1244,8 +1247,8 @@ static std::unordered_map target_bpw_type( if (info.candidate.empty()) { // As a last resort, keep original type - float bpw = ggml_nbytes(t) * 8.0f / nelem; - info.candidate.push_back(candidate_types{ t->type, bpw, ggml_nbytes(t), 0.0 }); + float bpw = ggml_nbytes(tensor) * 8.0f / nelem; + info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. 
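As the comment above says, each tensor's candidate list is first reduced to its Pareto-optimal points in (bytes, error) and, from PATCH 064 onwards, to the lower convex hull of that curve. A minimal sketch of that pruning, assuming a simplified cand_t struct rather than the patch's candidate_types:

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    struct cand_t { size_t bytes; double error; };   // hypothetical stand-in for candidate_types

    // Sort by size, keep only candidates whose error strictly improves on every smaller one,
    // then keep the lower convex hull of the surviving (bytes, error) points.
    static std::vector<cand_t> prune(std::vector<cand_t> c) {
        std::sort(c.begin(), c.end(), [](const cand_t & a, const cand_t & b) {
            return a.bytes != b.bytes ? a.bytes < b.bytes : a.error < b.error;
        });

        std::vector<cand_t> pareto;                                  // Pareto filter
        double best_err = std::numeric_limits<double>::infinity();
        for (const auto & x : c) {
            if (x.error < best_err) { best_err = x.error; pareto.push_back(x); }
        }

        auto slope = [](const cand_t & a, const cand_t & b) {        // error change per extra byte
            return (b.error - a.error) / (double) (b.bytes - a.bytes);
        };
        std::vector<cand_t> hull;                                    // lower convex hull
        for (const auto & x : pareto) {
            while (hull.size() >= 2 && slope(hull[hull.size() - 2], hull.back()) > slope(hull.back(), x)) {
                hull.pop_back();
            }
            hull.push_back(x);
        }
        return hull;
    }

On the hull the error reduction per additional byte is non-increasing, which is what keeps the later per-candidate trade-off (the mu sweep and the final greedy top-up) well behaved.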
@@ -1274,6 +1277,7 @@ static std::unordered_map target_bpw_type( // same bytes: we already sorted by error; skip } } + info.candidate.swap(pruned); } @@ -1299,6 +1303,7 @@ static std::unordered_map target_bpw_type( } convex.push_back(p); } + info.candidate.swap(convex); } } @@ -1312,7 +1317,6 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } - // Lagrangian relaxation to minimise error subject to a bpw target constraint auto total_bytes = [&]() -> size_t { size_t tb = 0; for (const auto & ti : all) { @@ -1359,6 +1363,7 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } + // Lagrangian relaxation to minimise error subject to a bpw target constraint auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { choice.resize(all.size()); bytes = 0; @@ -1406,6 +1411,7 @@ static std::unordered_map target_bpw_type( if (bytes_hi <= budget_bytes) { break; } + mu_hi *= 2.0; if (++expand > 60) { break; @@ -1422,11 +1428,9 @@ static std::unordered_map target_bpw_type( lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); const double gap = std::abs((double)bytes_mid - (double)budget_bytes); - if (bytes_mid > budget_bytes) { // Too big, need stronger penalty mu_lo = mu; - if (gap < best_over_gap - epsilon || (std::abs(gap - best_over_gap) <= epsilon && err_mid < best_over_err)) { best_over_gap = gap; best_over_err = err_mid; @@ -1435,7 +1439,6 @@ static std::unordered_map target_bpw_type( } else { // Under budget, good candidate mu_hi = mu; - if (gap < best_under_gap - epsilon || (std::abs(gap - best_under_gap) <= epsilon && err_mid < best_under_err)) { best_under_gap = gap; best_under_err = err_mid; From a36946997e2c365e9317062f14e298af6e9928a9 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 20 Sep 2025 21:36:54 +0100 Subject: [PATCH 068/155] Replace fast_bias() for per slice version and remove precise_bias() --- src/llama-quant.cpp | 167 +++++++++++++++----------------------------- 1 file changed, 58 insertions(+), 109 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6e5562379c..fe10365772 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -868,8 +868,9 @@ static std::unordered_map target_bpw_type( size_t row_idx = 0; double total_mse = 0.0; double total_proj = 0.0; + double total_bias = 0.0; for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = sample_rows_per_slice[slice]; + const int64_t rs = rows_sample[slice]; if (rs == 0) { continue; } const float * values = has_values ? 
values_sample + slice * n_per_row : nullptr; @@ -918,21 +919,24 @@ static std::unordered_map target_bpw_type( } row_proj_norm.push_back(p_norm); } + offset += (size_t)n_per_row; } // Trimmed sum to avoid outlier rows dominating the results auto trimmed_sum = [&](std::vector & v) -> double { if (v.empty()) { return 0.0; } + const int64_t n = (int64_t)v.size(); if (n < 50) { double s = 0.0; for (const double z : v) { s += z; } + return s; } - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::max(0, std::min(k, n / 32)); // but not more than 3.125% + int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side + k = std::max(0, std::min(k, n / 32)); // cap at ~3.125% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); double s = 0.0; @@ -944,11 +948,17 @@ static std::unordered_map target_bpw_type( }; const double scale_rows = (double)nrows / std::max(1.0, (double)rs); + const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows; + const double slice_proj = activations ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; - total_mse += trimmed_sum(row_mse_norm) * scale_rows; - if (activations) { total_proj += trimmed_sum(row_proj_norm) * scale_rows; } + total_mse += slice_mse; + total_proj += slice_proj; - if (!std::isfinite(total_mse) || !std::isfinite(total_proj)) { + // per-slice lambda if provided, otherwise use scalar + const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[slice]) : (double)tensor_bias_lambda; + total_bias += bl * slice_proj; + + if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } @@ -959,100 +969,42 @@ static std::unordered_map target_bpw_type( if (out_mse) { *out_mse = total_mse; } if (out_proj) { *out_proj = total_proj; } - const double total_err = total_mse + bias_lambda * total_proj; + const double total_err = slice_bias_lambda ? total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; + return std::isfinite(total_err) ? 
total_err : infinity; }; - // Higher precision but longer to compute - auto precise_lambda = [&](const ggml_tensor * t, - const std::vector & f32_sample, - const std::vector & sample_rows_per_slice, - const float * values, - const float * activations, - const std::vector & compatible_candidates) -> float + // Returns lambda per slice or 0.0 if no activations + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { - if (!activations) { return 0.0f; } - - std::vector probes; - probes.reserve(3); - auto push_if = [&](const ggml_type tiny) { - if (std::find(compatible_candidates.begin(), compatible_candidates.end(), tiny) != compatible_candidates.end()) { - probes.push_back(tiny); - } - }; - - push_if(GGML_TYPE_Q3_K); - push_if(GGML_TYPE_Q4_K); - push_if(GGML_TYPE_Q5_K); - if (probes.empty() && !compatible_candidates.empty()) { - probes.push_back(compatible_candidates[compatible_candidates.size() / 2]); - } - if (probes.size() == 1 && compatible_candidates.size() >= 2) { - probes.push_back(compatible_candidates.front()); - } - if (probes.empty()) { return 0.0f; } - - // Scratch buffers - const int64_t n_per_row = t->ne[0]; - const size_t total_sampled_rows = f32_sample.size() / n_per_row; - size_t max_row_sz = 0; - for (auto pt : probes) max_row_sz = std::max(max_row_sz, ggml_row_size(pt, n_per_row)); - - std::vector quantized_buffer(max_row_sz * total_sampled_rows); - std::vector dequantized_buffer(f32_sample.size()); - - std::vector ratios; - ratios.reserve(probes.size()); - for (const auto pt : probes) { - double m = 0.0; - double p = 0.0; - (void)estimate_error(t, pt, f32_sample, sample_rows_per_slice, values, activations, quantized_buffer, dequantized_buffer, 0.0f, &m, &p); - if (p > epsilon && std::isfinite(m) && std::isfinite(p)) { - ratios.push_back(m / p); - } - } - - if (ratios.empty()) { return 0.0f; } - - std::nth_element(ratios.begin(), ratios.begin() + ratios.size() / 2, ratios.end()); - const double lambda = std::clamp(ratios[ratios.size() / 2], 0.0, 8.0); - - return (float)lambda; - }; - - // Faster to compute but may yield lower precision. Best option for the vast majority of cases - auto fast_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) { - if (!activations) { return 0.0f; } - - double accum = 0.0; - int ns = 0; + std::vector lambdas(std::max(1, ne2), 0.0f); + if (!activations) { return lambdas; } for (int64_t s = 0; s < std::max(1, ne2); ++s) { const float * v = values ? values + s * n_per_row : nullptr; const float * a = activations + s * n_per_row; - double s1 = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { - const double w = v ? std::max(0.0f, v[j]) : 1.0; + const double w = v ? 
std::max(0.0f, v[j]) : 1.0; const double aw = std::sqrt(w) * a[j]; const double aw2 = aw * aw; s1 += aw2; s2 += aw2 * aw2; } + float l = 0.0f; if (s1 > 0.0) { - const double n = (double)n_per_row; - double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); + const auto n = (double)n_per_row; + const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); double lambda = 8.0 * (c / (c + 1.0)); - accum += std::clamp(lambda, 0.0, 8.0); - ++ns; + l = (float)std::clamp(lambda, 0.0, 12.0); } + + lambdas[(size_t)s] = l; } - if (ns == 0) { return 0.0f; } - - return (float)(accum / ns); + return lambdas; }; std::vector all; @@ -1060,32 +1012,33 @@ static std::unordered_map target_bpw_type( for (const auto * tw : tensors) { std::vector workers; workers.reserve(std::max(1, nthread)); - ggml_tensor * t = tw->tensor; - const std::string name = ggml_get_name(t); - if (!can_quantize(t)) { continue; } + ggml_tensor * tensor = tw->tensor; + const std::string name = ggml_get_name(tensor); + if (!can_quantize(tensor)) { continue; } - LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(t)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(tensor)); if (!ml.use_mmap) { - if (buffer.size() < ggml_nbytes(t)) { buffer.resize(ggml_nbytes(t)); } - t->data = buffer.data(); + if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); } + tensor->data = buffer.data(); } - ml.load_data_for(t); + + ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample - const int64_t n_per_row = t->ne[0]; - const int64_t nrows_total = t->ne[1]; - const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows_total = tensor->ne[1]; + const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - // Larger sample_rows_per_expert values may result in more accurate error estimates, but it will take much longer to compute - const int sample_rows_per_expert = activations_data ? 512 : 256; + // Larger rows_sample_per_expert values may result in more accurate error estimates, but it will take much longer to compute + const int rows_sample_per_expert = activations_data ? 512 : 256; std::vector f32_sample; - f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, sample_rows_per_expert) * (size_t)n_per_row); + f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); - std::vector sample_rows_per_slice(ne2, 0); - const int64_t sample_rows_max = std::max(1, std::min(nrows_total, sample_rows_per_expert)); - const int64_t stride = std::max(1, nrows_total / sample_rows_max); + std::vector rows_sample(ne2, 0); + const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); + const int64_t stride = std::max(1, nrows_total / rows_sample_max); std::vector row_buffer(n_per_row); - const ggml_type src_type = t->type; + const ggml_type src_type = tensor->type; const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); @@ -1199,23 +1152,20 @@ static std::unordered_map target_bpw_type( // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. 
// Larger values favours quantisation types that produce smaller bias even if the MSE is slightly bigger - float bias_lambda = 0.0f; - { - const float * values = values_sample.empty() ? nullptr : values_sample.data(); - const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - if (params->bpw_bias == 1) { - bias_lambda = fast_lambda(values, activations, n_per_row, ne2); - } else if (params->bpw_bias == 2) { - bias_lambda = precise_lambda(t, f32_sample, sample_rows_per_slice, values, activations, compatible_candidates); - } - } - - // Now evaluate candidates - std::vector eval_candidates(compatible_candidates.size()); + float tensor_lambda = 0.0f; const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); + auto lambdas = estimate_lambda(values, activations, n_per_row, ne2); + double acc = 0.0; + int ns = 0; + for (float l : lambdas) { acc += l; ++ns; } + tensor_lambda = ns ? (float)(acc / ns) : 0.0f; + + // Evaluate candidates + std::vector eval_candidates(compatible_candidates.size()); std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantised_buffer(f32_sample.size()); + const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data(); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; std::vector eval_workers; @@ -1476,7 +1426,6 @@ static std::unordered_map target_bpw_type( int best_j = -1; double best_ratio = -1.0; size_t best_delta = 0; - for (int i = 0; i < (int)all.size(); ++i) { const auto & ti = all[i]; if (ti.choice >= (int)ti.candidate.size() - 1) { From 9e74f8341120d5f26939267e96fbaba04451d516 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 20 Sep 2025 23:06:37 +0100 Subject: [PATCH 069/155] Replace --bpw-bias flag with --no-bias --- include/llama.h | 2 +- src/llama-quant.cpp | 18 +++++++++------- tools/quantize/quantize.cpp | 42 ++++++++----------------------------- 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/include/llama.h b/include/llama.h index ba6c185346..502bedbb80 100644 --- a/include/llama.h +++ b/include/llama.h @@ -365,7 +365,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) - int32_t bpw_bias; // type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow) + bool no_bias; // use mean square error estimation only (no aligment bias) } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9d7a9f9742..9e7d9d295c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1153,13 +1153,16 @@ static std::unordered_map target_bpw_type( // Adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE. // Larger values favours quantisation types that produce smaller bias even if the MSE is slightly bigger float tensor_lambda = 0.0f; + std::vector lambdas; const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - auto lambdas = estimate_lambda(values, activations, n_per_row, ne2); - double acc = 0.0; - int ns = 0; - for (float l : lambdas) { acc += l; ++ns; } - tensor_lambda = ns ? 
(float)(acc / ns) : 0.0f; + if (!params->no_bias) { + double acc = 0.0; + int ns = 0; + lambdas = estimate_lambda(values, activations, n_per_row, ne2); + for (float l : lambdas) { acc += l; ++ns; } + tensor_lambda = ns ? (float)(acc / ns) : 0.0f; + } // Evaluate candidates std::vector eval_candidates(compatible_candidates.size()); @@ -1726,8 +1729,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } - const char* msg[] = {"no bias (MSE only)", "fast (default)", "precise (slow)"}; - LLAMA_LOG_INFO("using %s error estimation\n", msg[params->bpw_bias]); + LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no aligment bias)" : "aligment bias (default)"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { @@ -2038,7 +2040,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, - /*.bpw_bias =*/ 1 + /*.no_bias =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0fe65daea0..03018cc301 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,12 +117,12 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable); - printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); + printf(" [--target-bpw n] [--no-bias] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); - printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); - printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); - printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); + printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); + printf(" --leave-output-tensor: will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); + printf(" --pure: disable k-quant mixtures and quantize all tensors to the same type\n"); printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n"); printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); @@ -134,7 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --bpw_bias: type of error bias to use: 0 = no bias (MSE only), 1 = fast (default), 2 = precise (slow)\n"); + printf(" --no-bias: use mean square error estimation only (no alignment bias)\n"); + printf(" Advanced option to use MSE only and disable alignment bias error estimation\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -496,27 +497,6 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { return true; } -static bool parse_bpw_bias(const char * data, int & bpw_bias) { - if (!data) { - printf("\n%s: error bias type not provided\n\n", __func__); - return false; - } - - try { - bpw_bias = std::stoi(data); - if (bpw_bias < 0 || bpw_bias > 2) { - printf("\n%s: error bias type must be one of 0 (no bias, MSE only), 1 (fast), or 2 (precise, but slow)\n\n", __func__); - return false; - } - } - catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. 
Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); - return false; - } - - return true; -} - int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -531,7 +511,6 @@ int main(int argc, char ** argv) { std::vector tensor_types; std::vector prune_layers; float target_bpw = -1.0f; - int bpw_bias = 1; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { @@ -562,11 +541,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--bpw-bias") == 0) { - if (arg_idx == argc-1 || !parse_bpw_bias(argv[++arg_idx], bpw_bias)) { - usage(argv[0]); - } - params.bpw_bias = bpw_bias; + } else if (strcmp(argv[arg_idx], "--no-bias") == 0) { + params.no_bias = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From e8e2aed17a4ade7b14021e05f2a55f9b8f26510f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:41:44 +0100 Subject: [PATCH 070/155] Refactor row sampling --- src/llama-quant.cpp | 49 +++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9e7d9d295c..4a8c08e68f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1029,7 +1029,6 @@ static std::unordered_map target_bpw_type( const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - // Larger rows_sample_per_expert values may result in more accurate error estimates, but it will take much longer to compute const int rows_sample_per_expert = activations_data ? 
512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); @@ -1037,11 +1036,30 @@ static std::unordered_map target_bpw_type( std::vector rows_sample(ne2, 0); const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); const int64_t stride = std::max(1, nrows_total / rows_sample_max); - std::vector row_buffer(n_per_row); const ggml_type src_type = tensor->type; - const ggml_type_traits *src_traits = ggml_get_type_traits(src_type); + const ggml_type_traits * src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); + + std::vector row_buffer(n_per_row); + auto row_to_fp32 = [&](const uint8_t * src, float * dst) { + if (src_type == GGML_TYPE_F32) { + std::memcpy(dst, src, sizeof(float) * (size_t)n_per_row); + } else if (src_type == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); + } else if (src_type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); + } else if (src_is_quant) { + if (!src_traits || !src_traits->to_float) { + throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); + } + + src_traits->to_float(src, dst, (int)n_per_row); + } else { + throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + } + }; + for (int64_t slice = 0; slice < ne2; ++slice) { std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); int64_t current_sampled_rows = 0; @@ -1052,31 +1070,18 @@ static std::unordered_map target_bpw_type( } for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { + const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (src_type == GGML_TYPE_F32) { - const float * src_row = (const float *)tensor->data + slice * (n_per_row * nrows_total) + r * n_per_row; - f32_sample.insert(f32_sample.end(), src_row, src_row + n_per_row); - } else if (src_type == GGML_TYPE_F16) { - const auto * src_row = (const ggml_fp16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_fp16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); - } else if (src_type == GGML_TYPE_BF16) { - const auto * src_row = (const ggml_bf16_t *)((const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz); - ggml_bf16_to_fp32_row(src_row, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); - } else if (src_is_quant) { - const uint8_t * qrow = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; - if (!src_traits || !src_traits->to_float) { - throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); - } - src_traits->to_float(qrow, row_buffer.data(), (int)n_per_row); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + auto src_f32 = (const float *)src_row; + f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); } else { - throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + row_to_fp32(src_row, row_buffer.data()); + f32_sample.insert(f32_sample.end(), 
row_buffer.begin(), row_buffer.end()); } ++current_sampled_rows; } - + rows_sample[slice] = current_sampled_rows; } From bdefdb673c0d28b59c23d505307536b4f1724858 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:07 +0100 Subject: [PATCH 071/155] Refactor copy_or_broadcast() --- src/llama-quant.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4a8c08e68f..b1302df431 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1087,6 +1087,7 @@ static std::unordered_map target_bpw_type( auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { if (!m) { return {nullptr, 0}; } + const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); if (it == m->end()) { return {nullptr, 0}; } @@ -1095,22 +1096,27 @@ static std::unordered_map target_bpw_type( }; // Copy this row's side data (values and activations), or broadcasts to all slices - auto copy_or_broadcast = [&](const float *src, size_t src_sz, std::vector &dst) { - const size_t want = (size_t)ne2 * (size_t)n_per_row; + auto copy_or_broadcast = [&](const float * src, size_t src_sz, std::vector & dst) { dst.clear(); if (!src || src_sz == 0) { return; } + + const size_t want = (size_t)ne2 * (size_t)n_per_row; if (src_sz == want) { dst.resize(want); std::memcpy(dst.data(), src, want * sizeof(float)); - } else if (src_sz == (size_t)n_per_row) { + + return; + } + if (src_sz == (size_t)n_per_row) { dst.resize(want); for (int64_t s = 0; s < ne2; ++s) { std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } - } else { - LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", - func, name.c_str(), src_sz, (size_t)n_per_row, want); + + return; } + + LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want); }; const auto [values_all, values_sz] = side_data(values_data, name); From 6b8cedf3bcd2282e9f31b00026178d6bb393fc3e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:31 +0100 Subject: [PATCH 072/155] Refactor estimate_lambda() --- src/llama-quant.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b1302df431..ebacf68806 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -975,30 +975,29 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector - { - std::vector lambdas(std::max(1, ne2), 0.0f); + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + const int64_t ns = std::max(1, ne2); + std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } - for (int64_t s = 0; s < std::max(1, ne2); ++s) { + for (int64_t s = 0; s < ns; ++s) { const float * v = values ? values + s * n_per_row : nullptr; const float * a = activations + s * n_per_row; double s1 = 0.0; double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { const double w = v ? 
std::max(0.0f, v[j]) : 1.0; - const double aw = std::sqrt(w) * a[j]; - const double aw2 = aw * aw; - s1 += aw2; - s2 += aw2 * aw2; + const double aw2 = std::sqrt(w) * a[j]; + const double z = aw2 * aw2; + s1 += z; + s2 += z * z; } float l = 0.0f; if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - double lambda = 8.0 * (c / (c + 1.0)); - l = (float)std::clamp(lambda, 0.0, 12.0); + l = (float) std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From c466c53808e566f5eb81a654c9f131064246cdaf Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:42:54 +0100 Subject: [PATCH 073/155] Refactor pareto pruning and convexification --- src/llama-quant.cpp | 93 +++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 50 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index ebacf68806..ab6601a8bf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1146,8 +1146,7 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", - __func__, ggml_type_name(ts_type), name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type), name.c_str()); continue; } @@ -1214,60 +1213,54 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } - // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. + // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve { - std::vector pruned; - pruned.reserve(info.candidate.size()); + auto & candidates = info.candidate; + if (!candidates.empty()) { + std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - // Sort by bytes ascending, error ascending - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - return a.error < b.error; - }); + return a.error < b.error; + }); - double best_err = infinity; - size_t last_bytes = std::numeric_limits::max(); - for (const auto & c : info.candidate) { - // Only keep the best error seen so far at strictly larger byte sizes - if (c.bytes != last_bytes) { - // first time we see this byte size - last_bytes = c.bytes; - if (c.error < best_err) { - pruned.push_back(c); - best_err = c.error; + std::vector pareto; + pareto.reserve(candidates.size()); + double best_err = infinity; + size_t last_bytes = std::numeric_limits::max(); + for (const auto & c : candidates) { + if (c.bytes != last_bytes) { + last_bytes = c.bytes; + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); + } } - } else { - // same bytes: we already sorted by error; skip - } - } - - info.candidate.swap(pruned); - } - - // Enforce convexity in (bytes, error) curve - { - const auto & c = info.candidate; - if (c.size() >= 3) { - std::vector convex; - convex.reserve(c.size()); - auto slope = [](const candidate_types & a, const candidate_types & b) -> double { - const double dx = (double)b.bytes - (double)a.bytes; - if (dx <= 0.0) { return infinity; } - - return ((double)b.error - 
(double)a.error) / dx; - }; - - for (const auto & p : c) { - while (convex.size() >= 2) { - double s1 = slope(convex[convex.size() - 2], convex[convex.size() - 1]); - double s2 = slope(convex[convex.size() - 1], p); - if (s2 + epsilon < s1) { convex.pop_back(); } - else { break; } - } - convex.push_back(p); } - info.candidate.swap(convex); + candidates.swap(pareto); + + if (candidates.size() >= 3) { + std::vector hull; + hull.reserve(candidates.size()); + auto slope = [](const candidate_types & a, const candidate_types & b) { + const double dx = b.bytes - a.bytes; + + return dx <= 0.0 ? infinity : (b.error - a.error) / dx; + }; + + for (const auto & p : candidates) { + while (hull.size() >= 2) { + double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); + double s2 = slope(hull[hull.size() - 1], p); + if (s2 + epsilon < s1) { hull.pop_back(); } + else { break; } + } + + hull.push_back(p); + } + + candidates.swap(hull); + } } } From b433fd95472c39c4974892aa9100e3cdc7b9c63d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 13:43:09 +0100 Subject: [PATCH 074/155] Refactor last budget pass --- src/llama-quant.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index ab6601a8bf..e062b2dc6a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1433,19 +1433,16 @@ static std::unordered_map target_bpw_type( double best_ratio = -1.0; size_t best_delta = 0; for (int i = 0; i < (int)all.size(); ++i) { - const auto & ti = all[i]; - if (ti.choice >= (int)ti.candidate.size() - 1) { - continue; - } - + const auto &ti = all[i]; int j = ti.choice + 1; + // skip same-bytes entries while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; if (cur_bytes + delta > budget_bytes) { continue; } - double err_gain = std::max(0.0, (double)ti.candidate[ti.choice].error - (double)ti.candidate[j].error); + double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); double ratio = err_gain / (double)(delta * 8); if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { best_ratio = ratio; @@ -1454,7 +1451,6 @@ static std::unordered_map target_bpw_type( best_j = j; } } - if (best_i < 0) { break; } all[best_i].choice = best_j; cur_bytes += best_delta; From b6c008fd8a12a9b1970c4810585cbd540bf0737e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:04:13 +0100 Subject: [PATCH 075/155] Refactor helper lambdas --- src/llama-quant.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e062b2dc6a..d31552ea23 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -665,28 +665,23 @@ static std::unordered_map target_bpw_type( auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); - const int64_t nrows = ggml_nrows(t); - return (size_t)nrows * row_sz; + return (size_t)ggml_nrows(t) * row_sz; }; auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { - const int64_t nelem = ggml_nelements(t); const size_t bytes = tensor_bytes(t, typ); - return (double)bytes * 8.0 / (double)nelem; + return (double)bytes * 8.0 / (double)ggml_nelements(t); }; auto is_compatible = 
[&](const ggml_tensor * t, const ggml_type typ) -> bool { - const int64_t n_per_row = t->ne[0]; const int64_t blck = ggml_blck_size(typ); - if (blck <= 1) { return true; } - return n_per_row % blck == 0; + return blck <= 1 || (t->ne[0] % blck) == 0; }; auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) { return typ; } + if (is_compatible(t, typ)) return typ; ggml_type fb = fallback_type(typ); - if (is_compatible(t, fb)) { return fb; } - return GGML_TYPE_F16; + return is_compatible(t, fb) ? fb : GGML_TYPE_F16; }; auto name_tn = LLM_TN(model.arch); @@ -1080,7 +1075,7 @@ static std::unordered_map target_bpw_type( ++current_sampled_rows; } - + rows_sample[slice] = current_sampled_rows; } From 7386d4eadd64006ac7f0fbc992d7d4bcb195bd6c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:18:26 +0100 Subject: [PATCH 076/155] Refactor row sampling --- src/llama-quant.cpp | 87 +++++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d31552ea23..f2dab6a898 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1019,64 +1019,73 @@ static std::unordered_map target_bpw_type( ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample + const int rows_sample_per_expert = activations_data ? 512 : 256; const int64_t n_per_row = tensor->ne[0]; const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; - - const int rows_sample_per_expert = activations_data ? 512 : 256; std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); - std::vector rows_sample(ne2, 0); - const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); - const int64_t stride = std::max(1, nrows_total / rows_sample_max); const ggml_type src_type = tensor->type; const ggml_type_traits * src_traits = ggml_get_type_traits(src_type); const bool src_is_quant = ggml_is_quantized(src_type); const size_t src_row_sz = ggml_row_size(src_type, n_per_row); - std::vector row_buffer(n_per_row); + // Convert a single row to fp32 auto row_to_fp32 = [&](const uint8_t * src, float * dst) { - if (src_type == GGML_TYPE_F32) { + const ggml_type t = src_type; + if (t == GGML_TYPE_F32) { std::memcpy(dst, src, sizeof(float) * (size_t)n_per_row); - } else if (src_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (src_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - } else if (src_is_quant) { - if (!src_traits || !src_traits->to_float) { - throw std::runtime_error(format("cannot dequantize type %s for sampling", ggml_type_name(src_type))); - } - - src_traits->to_float(src, dst, (int)n_per_row); - } else { - throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(src_type))); + return; } + if (t == GGML_TYPE_F16) { + ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); + return; + } + if (t == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); + return; + } + + if (src_is_quant) { + GGML_ASSERT(src_traits && src_traits->to_float); + src_traits->to_float(src, dst, (int) n_per_row); + return; + } + + throw std::runtime_error(format("unsupported src type %s for sampling", ggml_type_name(t))); }; - for (int64_t slice = 0; slice < ne2; 
++slice) { - std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); - int64_t current_sampled_rows = 0; - int64_t offset = 0; - if (stride > 1) { - std::uniform_int_distribution dist(0, stride - 1); - offset = dist(rng); - } - - for (int64_t r = offset; r < nrows_total && current_sampled_rows < rows_sample_max; r += stride) { - const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; - if (src_type == GGML_TYPE_F32) { - auto src_f32 = (const float *)src_row; - f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); - } else { - row_to_fp32(src_row, row_buffer.data()); - f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + // Sample rows randomly per slice + { + f32_sample.clear(); + std::vector row_buffer(n_per_row); + for (int64_t slice = 0; slice < ne2; ++slice) { + std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); + const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); + const int64_t stride = std::max(1, nrows_total / rows_sample_max); + int64_t offset = 0; + if (stride > 1) { + std::uniform_int_distribution dist(0, stride - 1); + offset = dist(rng); } - ++current_sampled_rows; - } + int64_t current = 0; + for (int64_t r = offset; r < nrows_total && current < rows_sample_max; r += stride) { + const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; + if (src_type == GGML_TYPE_F32) { + auto src_f32 = (const float *)src_row; + f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); + } else { + row_to_fp32(src_row, row_buffer.data()); + f32_sample.insert(f32_sample.end(), row_buffer.begin(), row_buffer.end()); + } - rows_sample[slice] = current_sampled_rows; + ++current; + } + + rows_sample[slice] = current; + } } auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { From 08146fd67f5ec6b93e2406340afaaa5aa336596a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:03 +0100 Subject: [PATCH 077/155] Refactor side_data() and copy_or_broadcast() --- src/llama-quant.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f2dab6a898..b8eb12690e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1088,14 +1088,12 @@ static std::unordered_map target_bpw_type( } } - auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) -> std::pair { - if (!m) { return {nullptr, 0}; } + auto side_data = [&](const std::unordered_map> * m, const std::string & tensor_name) { + if (!m) { return std::pair{nullptr, 0}; } const std::string key = remap_imatrix(tensor_name, mapped); const auto it = m->find(key); - if (it == m->end()) { return {nullptr, 0}; } - - return { it->second.data(), it->second.size() }; + return it == m->end() ? 
std::pair{nullptr, 0} : std::pair{ it->second.data(), it->second.size() }; }; // Copy this row's side data (values and activations), or broadcasts to all slices @@ -1105,9 +1103,7 @@ static std::unordered_map target_bpw_type( const size_t want = (size_t)ne2 * (size_t)n_per_row; if (src_sz == want) { - dst.resize(want); - std::memcpy(dst.data(), src, want * sizeof(float)); - + dst.assign(src, src + want); return; } if (src_sz == (size_t)n_per_row) { @@ -1115,7 +1111,6 @@ static std::unordered_map target_bpw_type( for (int64_t s = 0; s < ne2; ++s) { std::memcpy(dst.data() + s * n_per_row, src, n_per_row * sizeof(float)); } - return; } From 17be7615ce070af61cd1a0f80b38947c3fea5709 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:28 +0100 Subject: [PATCH 078/155] Refactor candidate types build --- src/llama-quant.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b8eb12690e..beac311d50 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1133,19 +1133,17 @@ static std::unordered_map target_bpw_type( size_t total_sampled_rows = f32_sample.size() / n_per_row; // Build list of candidate types first (compatible ones) + const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; + size_t max_row_sz = 0; const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); - - size_t max_row_sz = 0; - const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; - std::vector compatible_candidates; compatible_candidates.reserve(base_sz); for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type), name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str()); continue; } From b09662f86aefb5750842c9d68dac42db9054e90c Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:19:49 +0100 Subject: [PATCH 079/155] Refactor estimate_lambda() --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index beac311d50..63779ded48 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -982,8 +982,8 @@ static std::unordered_map target_bpw_type( double s2 = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { const double w = v ? 
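// Here z_j = w_j * a_j^2, so s1 is the slice's total weighted activation energy and
// s2 the sum of its squares. The ratio s2 / s1^2 ranges from 1/n (energy spread
// evenly across channels) to 1 (a single channel dominates), so c acts as a rough
// concentration measure: lambda stays near 0 for flat activation profiles and moves
// toward the cap when a few channels carry most of the energy, which gives the
// alignment-bias term more weight for exactly those tensors.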
std::max(0.0f, v[j]) : 1.0; - const double aw2 = std::sqrt(w) * a[j]; - const double z = aw2 * aw2; + const double aw = std::sqrt(w) * a[j]; + const double z = aw * aw; s1 += z; s2 += z * z; } @@ -992,7 +992,7 @@ static std::unordered_map target_bpw_type( if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float) std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); + l = (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From a7ee915e19d9acd7a1187ba7d8d772d3a52a8f0d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:20:06 +0100 Subject: [PATCH 080/155] Refactor trimmed_sum() --- src/llama-quant.cpp | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 63779ded48..67de29df87 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -920,26 +920,15 @@ static std::unordered_map target_bpw_type( // Trimmed sum to avoid outlier rows dominating the results auto trimmed_sum = [&](std::vector & v) -> double { - if (v.empty()) { return 0.0; } - const int64_t n = (int64_t)v.size(); - if (n < 50) { - double s = 0.0; - for (const double z : v) { s += z; } - - return s; - } + if (n == 0) { return 0.0; } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side - k = std::max(0, std::min(k, n / 32)); // cap at ~3.125% + k = std::clamp(k, 0, n / 32); // cap at ~3.125% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); - double s = 0.0; - for (int64_t i = k; i < n - k; ++i) { - s += v[i]; - } - - return s; + return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; const double scale_rows = (double)nrows / std::max(1.0, (double)rs); From 1a3e9ea4c88c40b7fea3a94ff45522531f31f005 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:21:00 +0100 Subject: [PATCH 081/155] Refactor estimate_error() --- src/llama-quant.cpp | 191 ++++++++++++++++++++------------------------ 1 file changed, 85 insertions(+), 106 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 67de29df87..b3e4b3cbf7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -737,12 +737,12 @@ static std::unordered_map target_bpw_type( const int64_t n_per_row = t->ne[0]; const int64_t nrows = t->ne[1]; const int64_t ne2 = t->ne[2] > 0 ? t->ne[2] : 1; - const size_t sample_element_count = f32_sample.size(); - const size_t sample_row_count = n_per_row > 0 ? sample_element_count / (size_t)n_per_row : 0; - if (sample_row_count == 0) { + const size_t sample_elems = f32_sample.size(); + const size_t sample_rows = n_per_row > 0 ? 
sample_elems / (size_t)n_per_row : 0; + + if (sample_rows == 0) { if (out_mse) { *out_mse = 0.0; } if (out_proj) { *out_proj = 0.0; } - return 0.0; } @@ -751,105 +751,102 @@ static std::unordered_map target_bpw_type( expected_rows += (size_t)rows_sample[s]; } - if (expected_rows != sample_row_count) { + if (expected_rows != sample_rows) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } const size_t row_sz = ggml_row_size(quant_type, n_per_row); - const size_t buffer_sz = row_sz * sample_row_count; + const size_t buf_sz = row_sz * sample_rows; - if (quantized_buffer.size() < buffer_sz) { quantized_buffer.resize(buffer_sz); } - if (dequantized_buffer.size() < sample_element_count) { dequantized_buffer.resize(sample_element_count); } + if (quantized_buffer.size() < buf_sz) { quantized_buffer.resize(buf_sz); } + if (dequantized_buffer.size() < sample_elems) { dequantized_buffer.resize(sample_elems); } const bool has_values = values_sample != nullptr; const bool has_activations = activations_sample != nullptr; // Bias denominators per slice - std::vector bias_denominator_per_slice(ne2, 0.0); + std::vector bias_denom(ne2, 0.0); if (has_activations) { for (int64_t s = 0; s < ne2; ++s) { - const float * values = has_values ? values_sample + s * n_per_row : nullptr; - const float * activations = activations_sample + s * n_per_row; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + const float * a = activations_sample + s * n_per_row; double denom = 0.0; for (int64_t j = 0; j < n_per_row; ++j) { - const double w = values ? std::max(0.0f, values[j]) : 1.0; - const double a = activations[j]; - denom += w * a * a; + const double w = v ? std::max(0.0f, v[j]) : 1.0; + const double aj = a[j]; + denom += w * aj * aj; } - bias_denominator_per_slice[s] = denom; + bias_denom[s] = denom; } } - // Weighted per-row squared norms - std::vector row_sq_norm(sample_row_count, 0.0); + // Row squared norms (weighted if values present) + std::vector row_sq_norm(sample_rows, 0.0); { - size_t offset = 0; - size_t row_idx = 0; + size_t off = 0; + size_t ridx = 0; for (int64_t s = 0; s < ne2; ++s) { const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * values = has_values ? values_sample + s * n_per_row : nullptr; - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + offset; - double rsn = 0.0; - if (values) { + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + for (int64_t r = 0; r < rs; ++r, ++ridx) { + const float * x = f32_sample.data() + off; + double sum = 0.0; + if (v) { for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); + const double w = std::max(0.0f, v[j]); const double xx = x[j]; - rsn += w * xx * xx; + sum += w * xx * xx; } } else { for (int64_t j = 0; j < n_per_row; ++j) { const double xx = x[j]; - rsn += xx * xx; + sum += xx * xx; } } - row_sq_norm[row_idx] = rsn; - offset += (size_t)n_per_row; + + row_sq_norm[ridx] = sum; + off += (size_t)n_per_row; } } } - // Quantize sampled rows per slice -> quantized_buffer + // Quantize per slice into quantized_buffer { - size_t q_offset = 0; - size_t f_offset = 0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = rows_sample[slice]; + size_t qoff = 0; + size_t foff = 0; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * value = has_values ? 
values_sample + slice * n_per_row : nullptr; - (void)ggml_quantize_chunk(quant_type, f32_sample.data() + f_offset, quantized_buffer.data() + q_offset, 0, rs, n_per_row, value); - q_offset += row_sz * (size_t)rs; - f_offset += (size_t)rs * (size_t)n_per_row; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + (void)ggml_quantize_chunk(quant_type, f32_sample.data() + foff, quantized_buffer.data() + qoff, 0, rs, n_per_row, v); + qoff += row_sz * (size_t)rs; + foff += (size_t)rs * (size_t)n_per_row; } } - // quantized_buffer -> dequantized_buffer + // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - const bool is_fp16 = quant_type == GGML_TYPE_F16; - const bool is_bf16 = quant_type == GGML_TYPE_BF16; - if (!is_fp16 && !is_bf16 && traits && traits->to_float) { - traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_row_count * (size_t)n_per_row)); + if (traits && traits->to_float && quant_type != GGML_TYPE_F16 && quant_type != GGML_TYPE_BF16) { + traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_rows * (size_t)n_per_row)); } else { - for (size_t r = 0; r < sample_row_count; ++r) { - uint8_t * src = quantized_buffer.data() + r * row_sz; + for (size_t r = 0; r < sample_rows; ++r) { + const uint8_t * src = quantized_buffer.data() + r * row_sz; float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - if (is_fp16) { + if (quant_type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (is_bf16) { + } else if (quant_type == GGML_TYPE_BF16) { ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); } else { if (!traits || !traits->to_float) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } traits->to_float(src, dst, (int)n_per_row); @@ -858,94 +855,77 @@ static std::unordered_map target_bpw_type( } } - // Compute error - size_t offset = 0; - size_t row_idx = 0; + // Compute error per slice with trimmed aggregation + auto trimmed_sum = [&](std::vector & v) -> double { + const int64_t n = (int64_t)v.size(); + if (n == 0) { return 0.0; } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } + int64_t k = (int64_t) std::floor(0.02 * (double) n); // trim 2% on each side + k = std::clamp(k, 0, n / 32); // but no more than ~3% + std::nth_element(v.begin(), v.begin() + k, v.end()); + std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); + }; + + size_t off = 0; + size_t ridx = 0; double total_mse = 0.0; double total_proj = 0.0; double total_bias = 0.0; - for (int64_t slice = 0; slice < ne2; ++slice) { - const int64_t rs = rows_sample[slice]; + for (int64_t s = 0; s < ne2; ++s) { + const int64_t rs = rows_sample[s]; if (rs == 0) { continue; } - const float * values = has_values ? values_sample + slice * n_per_row : nullptr; - const float * activations = has_activations ? activations_sample + slice * n_per_row : nullptr; - const double bias_denom = has_activations ? bias_denominator_per_slice[slice] : 0.0; + const float * v = has_values ? values_sample + s * n_per_row : nullptr; + const float * a = has_activations ? activations_sample + s * n_per_row : nullptr; + const double denom_bias = has_activations ? 
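// For each sampled row, m_norm below is the imatrix-weighted MSE normalized by the
// row's weighted squared norm, and proj is (sum_j w_j e_j a_j)^2 / (sum_j w_j a_j^2),
// i.e. the squared projection of the quantization error e onto the activation
// direction under the weighted inner product. In effect it isolates the part of the
// error that shifts the layer's output in a consistent direction rather than
// averaging out, which is the part the lambda-scaled bias term penalizes.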
bias_denom[s] : 0.0; std::vector row_mse_norm; - std::vector row_proj_norm; row_mse_norm.reserve(rs); - if (activations) { row_proj_norm.reserve(rs); } + std::vector row_proj_norm; + if (a) { row_proj_norm.reserve(rs); } - for (int64_t r = 0; r < rs; ++r, ++row_idx) { - const float * x = f32_sample.data() + offset; - const float * y = dequantized_buffer.data() + offset; - double weighted_mse = 0.0; + for (int64_t r = 0; r < rs; ++r, ++ridx) { + const float * x = f32_sample.data() + off; + const float * y = dequantized_buffer.data() + off; + double w_mse = 0.0; double bias_num = 0.0; - if (values && activations) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); - const double e = y[j] - x[j]; - const double a = activations[j]; - weighted_mse += w * e * e; - bias_num += w * e * a; - } - } else if (values) { - for (int64_t j = 0; j < n_per_row; ++j) { - const double w = std::max(0.0f, values[j]); - const double e = y[j] - x[j]; - weighted_mse += w * e * e; - } - } else { - for (int64_t j = 0; j < n_per_row; ++j) { - const double e = y[j] - x[j]; - weighted_mse += e * e; - } + for (int64_t j = 0; j < n_per_row; ++j) { + const double wj = v ? std::max(0.0f, v[j]) : 1.0; + const double e = y[j] - x[j]; + w_mse += wj * e * e; + if (a) { bias_num += wj * e * a[j]; } } - const double denom_x = row_sq_norm[row_idx]; - double m_norm = weighted_mse / (denom_x + epsilon); + const double denom_x = row_sq_norm[ridx]; + const double m_norm = w_mse / (denom_x + epsilon); row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); - if (activations) { + if (a) { double p_norm = 0.0; - if (bias_denom > 0.0) { - const double proj = bias_num * bias_num / (bias_denom + epsilon); + if (denom_bias > 0.0) { + const double proj = bias_num * bias_num / (denom_bias + epsilon); p_norm = std::isfinite(proj) ? proj : 0.0; } + row_proj_norm.push_back(p_norm); } - offset += (size_t)n_per_row; + off += (size_t)n_per_row; } - // Trimmed sum to avoid outlier rows dominating the results - auto trimmed_sum = [&](std::vector & v) -> double { - const int64_t n = (int64_t)v.size(); - if (n == 0) { return 0.0; } - if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } - - int64_t k = (int64_t)std::floor(0.02 * (double)n); // trim 2% each side - k = std::clamp(k, 0, n / 32); // cap at ~3.125% - std::nth_element(v.begin(), v.begin() + k, v.end()); - std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); - return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); - }; - const double scale_rows = (double)nrows / std::max(1.0, (double)rs); const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows; - const double slice_proj = activations ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; + const double slice_proj = a ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; total_mse += slice_mse; total_proj += slice_proj; - // per-slice lambda if provided, otherwise use scalar - const double bl = slice_bias_lambda ? (double)std::max(0.0f, slice_bias_lambda[slice]) : (double)tensor_bias_lambda; + const double bl = slice_bias_lambda ? 
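// The trimmed sum above drops roughly the extreme 2% of rows at each tail so a few
// outlier rows cannot dominate a slice's error, and scale_rows extrapolates the
// sampled-row total back to the tensor's full row count so tensors sampled at
// different rates remain comparable when the per-tensor errors are summed later.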
(double)std::max(0.0f, slice_bias_lambda[s]) : (double)tensor_bias_lambda; total_bias += bl * slice_proj; if (!std::isfinite(total_mse) || !std::isfinite(total_proj) || !std::isfinite(total_bias)) { if (out_mse) { *out_mse = infinity; } if (out_proj) { *out_proj = 0.0; } - return infinity; } } @@ -954,7 +934,6 @@ static std::unordered_map target_bpw_type( if (out_proj) { *out_proj = total_proj; } const double total_err = slice_bias_lambda ? total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; - return std::isfinite(total_err) ? total_err : infinity; }; From 9a1656eb975fa9f1024a8de029e22a762e49719b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:21:35 +0100 Subject: [PATCH 082/155] Refactor pareto optimise and convexify --- src/llama-quant.cpp | 86 ++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b3e4b3cbf7..751a26c63a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1179,55 +1179,53 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - { - auto & candidates = info.candidate; - if (!candidates.empty()) { - std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + auto pareto_convex = [](std::vector & candidates) { + if (candidates.empty()) return; - return a.error < b.error; - }); + std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } + return a.error < b.error; + }); - std::vector pareto; - pareto.reserve(candidates.size()); - double best_err = infinity; - size_t last_bytes = std::numeric_limits::max(); - for (const auto & c : candidates) { - if (c.bytes != last_bytes) { - last_bytes = c.bytes; - if (c.error < best_err) { - best_err = c.error; - pareto.push_back(c); - } + // Pareto by bytes -> error + std::vector pareto; + pareto.reserve(candidates.size()); + double best_err = std::numeric_limits::infinity(); + size_t last_b = std::numeric_limits::max(); + for (const auto & c : candidates) { + if (c.bytes != last_b) { + last_b = c.bytes; + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); } } - - candidates.swap(pareto); - - if (candidates.size() >= 3) { - std::vector hull; - hull.reserve(candidates.size()); - auto slope = [](const candidate_types & a, const candidate_types & b) { - const double dx = b.bytes - a.bytes; - - return dx <= 0.0 ? infinity : (b.error - a.error) / dx; - }; - - for (const auto & p : candidates) { - while (hull.size() >= 2) { - double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); - double s2 = slope(hull[hull.size() - 1], p); - if (s2 + epsilon < s1) { hull.pop_back(); } - else { break; } - } - - hull.push_back(p); - } - - candidates.swap(hull); - } } - } + + candidates.swap(pareto); + if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull + + // Convex hull (lower envelope) + auto slope = [](const candidate_types & a, const candidate_types & b) { + const double dx = b.bytes - a.bytes; + return dx <= 0.0 ? 
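// Keeping only the lower convex hull is lossless for the later selection step: for
// any penalty mu >= 0, the candidate minimizing error + mu * bits lies on the lower
// envelope of the (bytes, error) points, so dominated or non-convex candidates can
// never be chosen and pruning them merely shortens each tensor's candidate list.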
infinity : (b.error - a.error) / dx; + }; + + std::vector hull; hull.reserve(candidates.size()); + for (const auto & p : candidates) { + while (hull.size() >= 2) { + const double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); + const double s2 = slope(hull[hull.size() - 1], p); + if (s2 + epsilon < s1) hull.pop_back(); + else { break; } + } + + hull.push_back(p); + } + candidates.swap(hull); + }; + + pareto_convex(info.candidate); // Initialize choice at the smallest bpw candidate info.choice = 0; From 0d5f18303e25e6b4e4dc21f963ca6672b9b12d0f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:22:00 +0100 Subject: [PATCH 083/155] Refactor lagrange_penalty() --- src/llama-quant.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 751a26c63a..204fbfecad 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1288,21 +1288,21 @@ static std::unordered_map target_bpw_type( bytes = 0; err = 0.0; for (size_t i = 0; i < all.size(); ++i) { - const auto & cand = all[i].candidate; + const auto & candidate = all[i].candidate; int best_j = 0; double best_val = infinity; - for (int j = 0; j < (int)cand.size(); ++j) { - const double bits = (double)cand[j].bytes * 8.0; - const double val = cand[j].error + mu * bits; - if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && cand[j].bytes < cand[best_j].bytes)) { + for (int j = 0; j < (int)candidate.size(); ++j) { + const double bits = (double)candidate[j].bytes * 8.0; + const double val = candidate[j].error + mu * bits; + if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && candidate[j].bytes < candidate[best_j].bytes)) { best_val = val; best_j = j; } } choice[i] = best_j; - bytes += cand[best_j].bytes; - err += cand[best_j].error; + bytes += candidate[best_j].bytes; + err += candidate[best_j].error; } }; From 814f6b66be4b5ebbe286201eafe8361a37d39a98 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 16:45:09 +0100 Subject: [PATCH 084/155] Minor general refactoring --- src/llama-quant.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 204fbfecad..93b5fb0eba 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -860,7 +860,8 @@ static std::unordered_map target_bpw_type( const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } - int64_t k = (int64_t) std::floor(0.02 * (double) n); // trim 2% on each side + + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side k = std::clamp(k, 0, n / 32); // but no more than ~3% std::nth_element(v.begin(), v.begin() + k, v.end()); std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); @@ -1190,7 +1191,7 @@ static std::unordered_map target_bpw_type( // Pareto by bytes -> error std::vector pareto; pareto.reserve(candidates.size()); - double best_err = std::numeric_limits::infinity(); + double best_err = infinity; size_t last_b = std::numeric_limits::max(); for (const auto & c : candidates) { if (c.bytes != last_b) { @@ -1273,12 +1274,10 @@ static std::unordered_map target_bpw_type( if (budget_bytes <= min_bytes) { for (auto & ti : all) { ti.choice = 0; } - return emit_overrides(); } if (budget_bytes >= max_bytes) { for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; } - return emit_overrides(); } @@ -1327,14 +1326,10 @@ static std::unordered_map 
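// For reference, a minimal self-contained sketch of the selection scheme that
// lagrange_penalty() and the mu search in the surrounding hunks implement: per
// tensor, pick the candidate minimizing error + mu * bits, then grow and bisect mu
// until the total size fits the byte budget. Names, types and the simplified
// bisection below are illustrative only and assume every tensor has at least one
// candidate; they are not part of the patch.
#include <cstddef>
#include <vector>

struct bpw_cand { std::size_t bytes; double error; };

// Choose the best candidate per tensor for a given penalty mu; returns total bytes.
static std::size_t pick_for_mu(const std::vector<std::vector<bpw_cand>> & cands, double mu, std::vector<int> & choice) {
    std::size_t total = 0;
    for (std::size_t i = 0; i < cands.size(); ++i) {
        int best = 0;
        double best_val = cands[i][0].error + mu * 8.0 * (double) cands[i][0].bytes;
        for (int j = 1; j < (int) cands[i].size(); ++j) {
            const double val = cands[i][j].error + mu * 8.0 * (double) cands[i][j].bytes;
            if (val < best_val) { best_val = val; best = j; }
        }
        choice[i] = best;
        total += cands[i][best].bytes;
    }
    return total;
}

// Bisect mu so the chosen mix fits budget_bytes (a larger mu favours smaller types).
static std::vector<int> solve_budget(const std::vector<std::vector<bpw_cand>> & cands, std::size_t budget_bytes) {
    std::vector<int> choice(cands.size(), 0);
    double lo = 0.0;
    double hi = 1.0;
    while (pick_for_mu(cands, hi, choice) > budget_bytes && hi < 1e30) { hi *= 2.0; } // expand until feasible
    for (int it = 0; it < 40; ++it) {
        const double mid = 0.5 * (lo + hi);
        if (pick_for_mu(cands, mid, choice) > budget_bytes) { lo = mid; } else { hi = mid; }
    }
    pick_for_mu(cands, hi, choice); // hi is the smallest tested mu that stays within budget
    return choice;
}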
target_bpw_type( int expand = 0; while (true) { lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); - if (bytes_hi <= budget_bytes) { - break; - } + if (bytes_hi <= budget_bytes) { break; } mu_hi *= 2.0; - if (++expand > 60) { - break; - } + if (++expand > 60) { break; } // safety cap } } From e92db008bc848b109f2931162a69c7010f675b70 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 17:20:48 +0100 Subject: [PATCH 085/155] Refactor quantisation checks into its own function --- src/llama-quant.cpp | 140 ++++++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 83 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 93b5fb0eba..3544653a56 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -21,6 +21,60 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; +static bool is_quantizable(const std::string & name, const llm_arch arch, const llama_model_quantize_params * params) { + if (params->only_copy) { return false; } + + const auto tn = LLM_TN(arch); + + // This used to be a regex, but has an extreme cost to compile times. + bool q = name.size() >= 6 && name.rfind("weight") == name.size() - 6; // ends with 'weight'? + + // Do not quantize norm tensors + q &= name.find("_norm.weight") == std::string::npos; + + // Do not quantize expert gating tensors + // NOTE: can't use LLM_TN here because the layer number is not known + q &= name.find("ffn_gate_inp.weight") == std::string::npos; + + // These are very small (e.g. 4x4) + q &= name.find("altup") == std::string::npos; + q &= name.find("laurel") == std::string::npos; + + // These are not too big so keep them as it is + q &= name.find("per_layer_model_proj") == std::string::npos; + + // Do not quantize positional embeddings and token types (BERT) + q &= name != tn(LLM_TENSOR_POS_EMBD, "weight"); + q &= name != tn(LLM_TENSOR_TOKEN_TYPES, "weight"); + + // Do not quantize Jamba, Mamba, LFM2's small yet 2D weights + // NOTE: can't use LLM_TN here because the layer number is not known + q &= name.find("ssm_conv1d.weight") == std::string::npos; + q &= name.find("shortconv.conv.weight") == std::string::npos; + + // Do not quantize ARWKV, RWKV's small yet 2D weights + q &= name.find("time_mix_first.weight") == std::string::npos; + q &= name.find("time_mix_w0.weight") == std::string::npos; + q &= name.find("time_mix_w1.weight") == std::string::npos; + q &= name.find("time_mix_w2.weight") == std::string::npos; + q &= name.find("time_mix_v0.weight") == std::string::npos; + q &= name.find("time_mix_v1.weight") == std::string::npos; + q &= name.find("time_mix_v2.weight") == std::string::npos; + q &= name.find("time_mix_a0.weight") == std::string::npos; + q &= name.find("time_mix_a1.weight") == std::string::npos; + q &= name.find("time_mix_a2.weight") == std::string::npos; + q &= name.find("time_mix_g1.weight") == std::string::npos; + q &= name.find("time_mix_g2.weight") == std::string::npos; + q &= name.find("time_mix_decay_w1.weight") == std::string::npos; + q &= name.find("time_mix_decay_w2.weight") == std::string::npos; + q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + + // Do not quantize relative position bias (T5) + q &= name.find("attn_rel_b.weight") == std::string::npos; + + return q; +} + static bool is_iq(const enum ggml_type t) { switch (t) { case GGML_TYPE_IQ1_S: @@ -684,40 +738,9 @@ static std::unordered_map target_bpw_type( return is_compatible(t, fb) ? 
fb : GGML_TYPE_F16; }; - auto name_tn = LLM_TN(model.arch); auto can_quantize = [&](const ggml_tensor * t) -> bool { - // This list should be kept in sync with llama_tensor_quantize_impl() to avoid drift - const std::string name = ggml_get_name(t); - bool q = name.rfind("weight") == name.size() - 6; - q &= ggml_n_dims(t) >= 2; - q &= name.find("_norm.weight") == std::string::npos; - q &= name.find("ffn_gate_inp.weight") == std::string::npos; - q &= name.find("altup") == std::string::npos; - q &= name.find("laurel") == std::string::npos; - q &= name.find("per_layer_model_proj") == std::string::npos; - q &= name != name_tn(LLM_TENSOR_POS_EMBD, "weight"); - q &= name != name_tn(LLM_TENSOR_TOKEN_TYPES, "weight"); - q &= name.find("ssm_conv1d.weight") == std::string::npos; - q &= name.find("shortconv.conv.weight") == std::string::npos; - q &= name.find("time_mix_first.weight") == std::string::npos; - q &= name.find("time_mix_w0.weight") == std::string::npos; - q &= name.find("time_mix_w1.weight") == std::string::npos; - q &= name.find("time_mix_w2.weight") == std::string::npos; - q &= name.find("time_mix_v0.weight") == std::string::npos; - q &= name.find("time_mix_v1.weight") == std::string::npos; - q &= name.find("time_mix_v2.weight") == std::string::npos; - q &= name.find("time_mix_a0.weight") == std::string::npos; - q &= name.find("time_mix_a1.weight") == std::string::npos; - q &= name.find("time_mix_a2.weight") == std::string::npos; - q &= name.find("time_mix_g1.weight") == std::string::npos; - q &= name.find("time_mix_g2.weight") == std::string::npos; - q &= name.find("time_mix_decay_w1.weight") == std::string::npos; - q &= name.find("time_mix_decay_w2.weight") == std::string::npos; - q &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - q &= name.find("attn_rel_b.weight") == std::string::npos; - q &= !params->only_copy; - - return q; + if (ggml_n_dims(t) < 2) { return false; } + return is_quantizable(ggml_get_name(t), model.arch, params); }; // Estimate error for a given type using a sampled subset of rows @@ -1747,57 +1770,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", ++idx, ml.n_tensors, ggml_get_name(tensor), llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type)); - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? - - // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); - - // do not quantize norm tensors - quantize &= name.find("_norm.weight") == std::string::npos; - + bool quantize = ggml_n_dims(tensor) >= 2 && is_quantizable(name, model.arch, params); quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= !params->only_copy; - - // do not quantize expert gating tensors - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; - - // these are very small (e.g. 
4x4) - quantize &= name.find("altup") == std::string::npos; - quantize &= name.find("laurel") == std::string::npos; - - // these are not too big so keep them as it is - quantize &= name.find("per_layer_model_proj") == std::string::npos; - - // do not quantize positional embeddings and token types (BERT) - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - - // do not quantize Mamba's small yet 2D weights - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d.weight") == std::string::npos; - quantize &= name.find("shortconv.conv.weight") == std::string::npos; - - // do not quantize RWKV's small yet 2D weights - quantize &= name.find("time_mix_first.weight") == std::string::npos; - quantize &= name.find("time_mix_w0.weight") == std::string::npos; - quantize &= name.find("time_mix_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_v0.weight") == std::string::npos; - quantize &= name.find("time_mix_v1.weight") == std::string::npos; - quantize &= name.find("time_mix_v2.weight") == std::string::npos; - quantize &= name.find("time_mix_a0.weight") == std::string::npos; - quantize &= name.find("time_mix_a1.weight") == std::string::npos; - quantize &= name.find("time_mix_a2.weight") == std::string::npos; - quantize &= name.find("time_mix_g1.weight") == std::string::npos; - quantize &= name.find("time_mix_g2.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - - // do not quantize relative position bias (T5) - quantize &= name.find("attn_rel_b.weight") == std::string::npos; ggml_type new_type; void * new_data; From fecc472c6175bc65217d6f29855acf81477a5125 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 17:26:38 +0100 Subject: [PATCH 086/155] Fix typos in variable names --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3544653a56..8a709ddfdd 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1165,7 +1165,7 @@ static std::unordered_map target_bpw_type( // Evaluate candidates std::vector eval_candidates(compatible_candidates.size()); std::vector quantized_buffer(max_row_sz * total_sampled_rows); - std::vector dequantised_buffer(f32_sample.size()); + std::vector dequantized_buffer(f32_sample.size()); const float * slice_lambda = lambdas.empty() ? 
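// Candidate evaluation is parallel across quant types: each worker pulls the next
// candidate index from the shared atomic counter and keeps its own thread-local
// quantize/dequantize scratch buffers, so no locking is needed and every result is
// written to its own slot in eval_candidates.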
nullptr : lambdas.data(); int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); std::atomic cidx{0}; @@ -1175,7 +1175,7 @@ static std::unordered_map target_bpw_type( eval_workers.emplace_back([&] { // thread-local scratch std::vector tl_quantized_buffer(quantized_buffer.size()); - std::vector tl_dequantised_buffer(dequantised_buffer.size()); + std::vector tl_dequantized_buffer(dequantized_buffer.size()); for (;;) { const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); if (i >= compatible_candidates.size()) { break; } @@ -1184,7 +1184,7 @@ static std::unordered_map target_bpw_type( const auto bpw = (float)tensor_bpw(tensor, tensor_types); const size_t bytes = tensor_bytes(tensor, tensor_types); const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, - tl_quantized_buffer, tl_dequantised_buffer, tensor_lambda, slice_lambda); + tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda); eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err }; } }); From 896cdc21217ab4d0b2bcb8b18938d3c0efc94dc1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 22:03:36 +0100 Subject: [PATCH 087/155] Refactor potential overflow --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8a709ddfdd..52d7984e2a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1002,7 +1002,7 @@ static std::unordered_map target_bpw_type( const std::string name = ggml_get_name(tensor); if (!can_quantize(tensor)) { continue; } - LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12d elements)\n", __func__, name.c_str(), (int)ggml_nelements(tensor)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor)); if (!ml.use_mmap) { if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); } tensor->data = buffer.data(); From b748a1efa7dd0ab0d4064574530b4b045b27bbfc Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 21 Sep 2025 22:03:54 +0100 Subject: [PATCH 088/155] Fix typo --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 52d7984e2a..2652f5c86e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1709,7 +1709,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); } - LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no aligment bias)" : "aligment bias (default)"); + LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? 
"MSE only (no alignment bias)" : "alignment bias (default)"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { From c855094dff509c97f6cc268e28f123262e67b6f7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:09:11 +0100 Subject: [PATCH 089/155] Exit loop if no better solution found --- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2652f5c86e..8ee052a8e5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1347,9 +1347,12 @@ static std::unordered_map target_bpw_type( // increase mu until we get under budget or hit a safety cap { int expand = 0; + size_t prev_bytes_hi = std::numeric_limits::max(); while (true) { lagrange_penalty(mu_hi, choice_hi, bytes_hi, err_hi); if (bytes_hi <= budget_bytes) { break; } + if (bytes_hi >= prev_bytes_hi) { break; } + prev_bytes_hi = bytes_hi; mu_hi *= 2.0; if (++expand > 60) { break; } // safety cap From 1fbc59f867b283d1f66a87a8b1f45d265cf69fca Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:10:10 +0100 Subject: [PATCH 090/155] Replace slope with cross product --- src/llama-quant.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8ee052a8e5..0b2f15f0a6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1230,22 +1230,27 @@ static std::unordered_map target_bpw_type( if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull // Convex hull (lower envelope) - auto slope = [](const candidate_types & a, const candidate_types & b) { - const double dx = b.bytes - a.bytes; - return dx <= 0.0 ? infinity : (b.error - a.error) / dx; - }; - std::vector hull; hull.reserve(candidates.size()); - for (const auto & p : candidates) { + for (const auto & c : candidates) { + auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { + const double dx1 = (double)h1.bytes - (double)h0.bytes; + const double dy1 = h1.error - h0.error; + const double dx2 = (double)p.bytes - (double)h0.bytes; + const double dy2 = p.error - h0.error; + return dx1 * dy2 - dx2 * dy1; + }; + while (hull.size() >= 2) { - const double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); - const double s2 = slope(hull[hull.size() - 1], p); - if (s2 + epsilon < s1) hull.pop_back(); - else { break; } + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { + hull.pop_back(); + } else { + break; + } } - hull.push_back(p); + hull.push_back(c); } + candidates.swap(hull); }; From f184450806163bd1af0eecaff5c31639cf3eaf8f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:10:42 +0100 Subject: [PATCH 091/155] Fix minor logic flaw --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0b2f15f0a6..4c0ec3063a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -957,7 +957,7 @@ static std::unordered_map target_bpw_type( if (out_mse) { *out_mse = total_mse; } if (out_proj) { *out_proj = total_proj; } - const double total_err = slice_bias_lambda ? 
total_mse + total_bias : total_mse + tensor_bias_lambda * total_proj; + const double total_err = total_mse + total_bias; return std::isfinite(total_err) ? total_err : infinity; }; From d79ade2e8e45057d9006b0b096888501ae639aab Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:11:26 +0100 Subject: [PATCH 092/155] Adjust for small vector size --- src/llama-quant.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4c0ec3063a..08e1c97185 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -885,9 +885,8 @@ static std::unordered_map target_bpw_type( if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::clamp(k, 0, n / 32); // but no more than ~3% - std::nth_element(v.begin(), v.begin() + k, v.end()); - std::nth_element(v.begin() + k, v.begin() + (n - k), v.end()); + k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // but no more than ~3% or n/2 if small + std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; From 7ba6001ec8fda89e7d513ced2da7b9aa3532cb70 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:11:54 +0100 Subject: [PATCH 093/155] Simplify candidates sorting --- src/llama-quant.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 08e1c97185..f4c0ea0fcd 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1209,6 +1209,10 @@ static std::unordered_map target_bpw_type( if (a.bytes != b.bytes) { return a.bytes < b.bytes; } return a.error < b.error; }); + const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + return a.bytes == b.bytes; + }); + candidates.erase(last, candidates.end()); // Pareto by bytes -> error std::vector pareto; From d36ee0a0a86a65e1d730e788d735c1606ebeb49a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:41:56 +0100 Subject: [PATCH 094/155] Add comments to explain magic numbers --- src/llama-quant.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f4c0ea0fcd..93007f281e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -739,7 +739,7 @@ static std::unordered_map target_bpw_type( }; auto can_quantize = [&](const ggml_tensor * t) -> bool { - if (ggml_n_dims(t) < 2) { return false; } + if (ggml_n_dims(t) < 2) { return false; } // skip 1D tensors return is_quantizable(ggml_get_name(t), model.arch, params); }; @@ -882,10 +882,10 @@ static std::unordered_map target_bpw_type( auto trimmed_sum = [&](std::vector & v) -> double { const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } - if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } + if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% on each side - k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // but no more than ~3% or n/2 if small + int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% from each tail of the distribution + k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // cap trimming at ~3% (1/32) or half the samples - 1 std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; @@ -1289,7 +1289,7 @@ static 
std::unordered_map target_bpw_type( if (total_elems == 0) { return {}; } const double target_bpw = params->target_bpw; - size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); + size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); // convert bpw to bytes auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; @@ -1362,8 +1362,8 @@ static std::unordered_map target_bpw_type( if (bytes_hi >= prev_bytes_hi) { break; } prev_bytes_hi = bytes_hi; - mu_hi *= 2.0; - if (++expand > 60) { break; } // safety cap + mu_hi *= 2.0; // double the penalty multiplier to reduce tensor sizes + if (++expand > 60) { break; } // safety cap to prevent an infinite loop } } @@ -1371,8 +1371,8 @@ static std::unordered_map target_bpw_type( double best_over_gap = infinity; double best_under_err = infinity; double best_over_err = infinity; - for (int it = 0; it < 40; ++it) { - double mu = 0.5 * (mu_lo + mu_hi); + for (int it = 0; it < 40; ++it) { // binary search iterations for optimal Lagrange multiplier (40 ≈ 1e-12 precision) + double mu = 0.5 * (mu_lo + mu_hi); // midpoint of current bounds lagrange_penalty(mu, choice_mid, bytes_mid, err_mid); const double gap = std::abs((double)bytes_mid - (double)budget_bytes); @@ -1435,7 +1435,7 @@ static std::unordered_map target_bpw_type( if (cur_bytes + delta > budget_bytes) { continue; } double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); - double ratio = err_gain / (double)(delta * 8); + double ratio = err_gain / (double)(delta * 8); // error reduction per bit if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { best_ratio = ratio; best_delta = delta; From 8eedcf74bc4df64eb7fe5b4935390dc9ad73d104 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 22 Sep 2025 20:42:37 +0100 Subject: [PATCH 095/155] Increase scale multiplier --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 93007f281e..0f05c8f956 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -983,7 +983,7 @@ static std::unordered_map target_bpw_type( if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float)std::clamp(8.0 * (c / (c + 1.0)), 0.0, 12.0); + l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 12.0); } lambdas[(size_t)s] = l; From a74b410f5f6bd11ff42cc1f40fa93242d0f67940 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Sep 2025 19:49:47 +0100 Subject: [PATCH 096/155] Move is_iq() into a lambda and remove unused variables --- src/llama-quant.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0f05c8f956..af564ce03e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -727,11 +727,28 @@ static std::unordered_map target_bpw_type( return (double)bytes * 8.0 / (double)ggml_nelements(t); }; - auto is_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> bool { + auto is_compatible = [](const ggml_tensor * t, const ggml_type typ) -> bool { const int64_t blck = ggml_blck_size(typ); return blck <= 1 || (t->ne[0] % blck) == 0; }; + auto is_iq = [](const enum ggml_type t) { + switch (t) { + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ4_NL: + 
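// The IQ ("i-quant") types grouped here generally rely on an importance matrix
// during quantization; when no usable imatrix is available for a tensor they are
// skipped as candidates (see the "no or mismatched imatrix" warning above).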
case GGML_TYPE_IQ4_XS: + return true; + default: + return false; + } + }; + auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { if (is_compatible(t, typ)) return typ; ggml_type fb = fallback_type(typ); @@ -995,8 +1012,6 @@ static std::unordered_map target_bpw_type( std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { - std::vector workers; - workers.reserve(std::max(1, nthread)); ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); if (!can_quantize(tensor)) { continue; } From dbdd179a92426c2031e4bee1ba0ccace45ea29fe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Sep 2025 19:50:20 +0100 Subject: [PATCH 097/155] Combine quant types --- src/llama-quant.cpp | 75 ++++++++------------------------------------- 1 file changed, 13 insertions(+), 62 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index af564ce03e..f36b9202d5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -75,43 +75,6 @@ static bool is_quantizable(const std::string & name, const llm_arch arch, const return q; } -static bool is_iq(const enum ggml_type t) { - switch (t) { - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - return true; - default: - return false; - } -} - -static bool is_iq(const enum llama_ftype t) { - switch (t) { - case LLAMA_FTYPE_MOSTLY_IQ1_S: - case LLAMA_FTYPE_MOSTLY_IQ1_M: - case LLAMA_FTYPE_MOSTLY_IQ2_XXS: - case LLAMA_FTYPE_MOSTLY_IQ2_XS: - case LLAMA_FTYPE_MOSTLY_IQ2_S: - case LLAMA_FTYPE_MOSTLY_IQ2_M: - case LLAMA_FTYPE_MOSTLY_IQ3_XXS: - case LLAMA_FTYPE_MOSTLY_IQ3_XS: - case LLAMA_FTYPE_MOSTLY_IQ3_S: - case LLAMA_FTYPE_MOSTLY_IQ3_M: - case LLAMA_FTYPE_MOSTLY_IQ4_XS: - case LLAMA_FTYPE_MOSTLY_IQ4_NL: - return true; - default: - return false; - } -} - static enum ggml_type fallback_type(const enum ggml_type new_type) { switch (new_type) { case GGML_TYPE_TQ1_0: @@ -678,33 +641,21 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; - constexpr ggml_type k_quants[] = { - GGML_TYPE_Q2_K, - GGML_TYPE_Q3_K, - GGML_TYPE_Q4_K, - GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0, -// TODO: find better way to handle F16/BF16 -#ifdef GGML_USE_METAL - GGML_TYPE_F16 -#else - GGML_TYPE_BF16 -#endif - }; - - constexpr ggml_type iq_quants[] = { + // subset of quantization types with the best accuracy/size tradeoff + constexpr ggml_type quant_types[] = { GGML_TYPE_IQ1_S, + GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, - GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_S, + GGML_TYPE_Q2_K, + GGML_TYPE_IQ3_XXS, + GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, + GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, + GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, - // TODO: find better way to handle F16/BF16 #ifdef GGML_USE_METAL GGML_TYPE_F16 #else @@ -896,7 +847,7 @@ static std::unordered_map target_bpw_type( } // Compute error per slice with trimmed aggregation - auto trimmed_sum = [&](std::vector & v) -> double { + auto trimmed_sum = [](std::vector & v) -> double { const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets @@ -978,7 +929,7 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [&](const float * values, const 
float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + auto estimate_lambda = [](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { const int64_t ns = std::max(1, ne2); std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } @@ -1141,8 +1092,8 @@ static std::unordered_map target_bpw_type( // Build list of candidate types first (compatible ones) const bool has_valid_imatrix = !values_sample.empty() && values_sample.size() == (size_t)ne2 * (size_t)n_per_row; size_t max_row_sz = 0; - const ggml_type * base_arr = is_iq(params->ftype) ? iq_quants : k_quants; - const size_t base_sz = is_iq(params->ftype) ? std::size(iq_quants) : std::size(k_quants); + const ggml_type * base_arr = quant_types; + const size_t base_sz = std::size(quant_types); std::vector compatible_candidates; compatible_candidates.reserve(base_sz); From dd4f4bd0b88c4d59613033ba941d85e7ce1d9547 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:23:48 +0100 Subject: [PATCH 098/155] Reduce bpw range --- src/llama-quant.cpp | 7 +------ tools/quantize/quantize.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f36b9202d5..0386352014 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -655,12 +655,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0, -#ifdef GGML_USE_METAL - GGML_TYPE_F16 -#else - GGML_TYPE_BF16 -#endif + GGML_TYPE_Q8_0 }; constexpr double epsilon = 1e-12; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 03018cc301..69e03179b3 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); printf(" --no-bias: use mean square error estimation only (no aligment bias)\n"); printf(" Advanced option use MSE only and disable aligment bias error estimation\n"); @@ -484,13 +484,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { try { target_bpw = std::stof(data); - if (target_bpw < 0.0f || target_bpw > 16.0f) { - printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__); + if (target_bpw < 0.0f || target_bpw > 8.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); return false; } } catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); + printf("\n%s: '%s' is not valid. 
Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); return false; } From d16945730eac146d87d158a97ef053f845921f01 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:25:29 +0100 Subject: [PATCH 099/155] Refactor outlier trimming --- src/llama-quant.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0386352014..df36a705c2 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -847,8 +847,7 @@ static std::unordered_map target_bpw_type( if (n == 0) { return 0.0; } if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets - int64_t k = (int64_t) std::floor(0.02 * (double)n); // trim 2% from each tail of the distribution - k = std::clamp(k, 0, std::min(n / 32, n / 2 - 1)); // cap trimming at ~3% (1/32) or half the samples - 1 + int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 2.5% from each tail of the distribution std::sort(v.begin(), v.end()); return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); }; From 87cba659089342ef4e4c2209d9a750555ae140e3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:26:30 +0100 Subject: [PATCH 100/155] Tighten worker allocator --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index df36a705c2..90931f25e7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1136,7 +1136,7 @@ static std::unordered_map target_bpw_type( std::vector tl_quantized_buffer(quantized_buffer.size()); std::vector tl_dequantized_buffer(dequantized_buffer.size()); for (;;) { - const size_t i = cidx.fetch_add(1, std::memory_order_relaxed); + const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel); if (i >= compatible_candidates.size()) { break; } const ggml_type tensor_types = compatible_candidates[i]; From 8a2c71f471842a9b2dcc0bc33592cd7adb8b8dfe Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:27:29 +0100 Subject: [PATCH 101/155] Check for direction reversal --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 90931f25e7..601b9ada42 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1204,7 +1204,7 @@ static std::unordered_map target_bpw_type( }; while (hull.size() >= 2) { - if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= -1 * epsilon) { // very small negative tolerance hull.pop_back(); } else { break; From 3d75b14c0f2fc605fb39a3cb425c4c2482b8d8f5 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:27:58 +0100 Subject: [PATCH 102/155] Simplify dequantisation --- src/llama-quant.cpp | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 601b9ada42..316dd35fa8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -819,25 +819,16 @@ static std::unordered_map target_bpw_type( // Dequantize into dequantized_buffer { const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (traits && traits->to_float && quant_type != GGML_TYPE_F16 && quant_type != GGML_TYPE_BF16) { - traits->to_float(quantized_buffer.data(), dequantized_buffer.data(), (int)(sample_rows * (size_t)n_per_row)); - } else { - for (size_t r = 0; r < sample_rows; ++r) { - const 
uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - if (quant_type == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); - } else if (quant_type == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); - } else { - if (!traits || !traits->to_float) { - if (out_mse) { *out_mse = infinity; } - if (out_proj) { *out_proj = 0.0; } - return infinity; - } - traits->to_float(src, dst, (int)n_per_row); - } - } + if (!traits || !traits->to_float) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + return infinity; + } + + for (size_t r = 0; r < sample_rows; ++r) { + const uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + traits->to_float(src, dst, (int)n_per_row); } } From e49e241d37e7fd7f25142ee514c9e129c304083b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 27 Sep 2025 17:28:39 +0100 Subject: [PATCH 103/155] Calculate bpw over all tensors --- src/llama-quant.cpp | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 316dd35fa8..699264553a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1219,6 +1219,18 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } + // Compute total elements across all tensors and bytes for non-quantizable tensors + size_t nq_elements = 0; + size_t nq_bytes = 0; + for (const auto & it : ml.weights_map) { + const ggml_tensor * tensor = it.second.tensor; + const std::string name = it.first; + nq_elements += (size_t)ggml_nelements(tensor); + if (!is_quantizable(name, model.arch, params)) { + nq_bytes += ggml_nbytes(tensor); + } + } + auto total_bytes = [&]() -> size_t { size_t tb = 0; for (const auto & ti : all) { @@ -1228,19 +1240,20 @@ static std::unordered_map target_bpw_type( return tb; }; - size_t total_elems = 0; + size_t q_elements = 0; size_t min_bytes = 0; size_t max_bytes = 0; for (const auto & ti : all) { - total_elems += (size_t)ti.n_elements; + q_elements += (size_t)ti.n_elements; min_bytes += ti.candidate.front().bytes; // smallest candidate per tensor max_bytes += ti.candidate.back().bytes; // largest candidate per tensor } - if (total_elems == 0) { return {}; } + if (q_elements == 0) { return {}; } const double target_bpw = params->target_bpw; - size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); // convert bpw to bytes + size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0); + size_t budget_bytes = target_total_bytes >= nq_bytes ? 
target_total_bytes - nq_bytes : min_bytes; auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; @@ -1374,29 +1387,35 @@ static std::unordered_map target_bpw_type( int best_i = -1; int best_j = -1; double best_ratio = -1.0; - size_t best_delta = 0; + double best_gain = -1.0; + for (int i = 0; i < (int)all.size(); ++i) { const auto &ti = all[i]; int j = ti.choice + 1; - // skip same-bytes entries while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } - size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; - if (cur_bytes + delta > budget_bytes) { continue; } + size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; + if (cur_bytes + delta_bytes > budget_bytes) { continue; } double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); - double ratio = err_gain / (double)(delta * 8); // error reduction per bit - if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { + if (err_gain < epsilon) { continue; } // no real improvement + + double ratio = err_gain / (double)delta_bytes; // error reduction per byte + // For tie-breaking, prioritize the largest absolute error improvement. + if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) { best_ratio = ratio; - best_delta = delta; + best_gain = err_gain; best_i = i; best_j = j; } } - if (best_i < 0) { break; } + + if (best_i < 0) { break; } // no more upgrades within budget found + + size_t upgrade_cost = all[best_i].candidate[best_j].bytes - all[best_i].candidate[all[best_i].choice].bytes; all[best_i].choice = best_j; - cur_bytes += best_delta; + cur_bytes += upgrade_cost; } } From b3b8a111a58a8a1586c763382463ccdf9bba3f6a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 28 Sep 2025 18:45:25 +0100 Subject: [PATCH 104/155] Compute rows based on tensor shape and slice count --- src/llama-quant.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 699264553a..7bfb8751ae 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -650,9 +650,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ3_XXS, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, - GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0 @@ -961,10 +959,24 @@ static std::unordered_map target_bpw_type( ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample - const int rows_sample_per_expert = activations_data ? 512 : 256; const int64_t n_per_row = tensor->ne[0]; const int64_t nrows_total = tensor->ne[1]; const int64_t ne2 = tensor->ne[2] > 0 ? tensor->ne[2] : 1; + + // Compute rows based on tensor shape and slice count + auto sample_rows = [](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) -> int64_t { + const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024; + const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large nrt + const double slice_budget = tensor_budget * scale_rows / std::max(1, n2); + const int64_t min_rows = has_acts ? 
128 : 64; + const int64_t max_rows = 4096; + int64_t total_rows = std::llround(slice_budget / std::max(1, n)); + total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows))); + if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors + return total_rows; + }; + + const int64_t rows_sample_per_expert = sample_rows(n_per_row, nrows_total, ne2, activations_data != nullptr); std::vector f32_sample; f32_sample.reserve((size_t)ne2 * (size_t)std::min(nrows_total, rows_sample_per_expert) * (size_t)n_per_row); std::vector rows_sample(ne2, 0); From f5d8811ddde7533c561ad77d358d1d509a57ff9f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 1 Oct 2025 19:04:43 +0100 Subject: [PATCH 105/155] Prioritise important tensors --- src/llama-quant.cpp | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7bfb8751ae..a93d982e63 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -656,6 +656,13 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q8_0 }; + const char * important_tensors[] = { + ".output.weight", + ".attn_output.weight", + ".ffn_down.weight", + ".ffn_down_shexp.weight" + }; + constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); const char * func = __func__; @@ -1288,6 +1295,13 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } + auto is_important = [&](const std::string & tensor_name) -> bool { + return std::any_of(std::begin(important_tensors), std::end(important_tensors), [&](const char* imp) { + return tensor_name.find(imp) != std::string::npos; + } + ); + }; + // Lagrangian relaxation to minimise error subject to a bpw target constraint auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { choice.resize(all.size()); @@ -1295,11 +1309,15 @@ static std::unordered_map target_bpw_type( err = 0.0; for (size_t i = 0; i < all.size(); ++i) { const auto & candidate = all[i].candidate; + const std::string tensor_name = ggml_get_name(all[i].w->tensor); + double effective_mu = mu; + if (is_important(tensor_name)) { effective_mu *= 0.1; } // important tensors get 10x lower penalty + int best_j = 0; double best_val = infinity; for (int j = 0; j < (int)candidate.size(); ++j) { const double bits = (double)candidate[j].bytes * 8.0; - const double val = candidate[j].error + mu * bits; + const double val = candidate[j].error + effective_mu * bits; if (val < best_val - epsilon || (std::abs(val - best_val) <= epsilon && candidate[j].bytes < candidate[best_j].bytes)) { best_val = val; best_j = j; @@ -1402,18 +1420,21 @@ static std::unordered_map target_bpw_type( double best_gain = -1.0; for (int i = 0; i < (int)all.size(); ++i) { - const auto &ti = all[i]; + const auto & ti = all[i]; + const std::string tensor_name = ggml_get_name(ti.w->tensor); int j = ti.choice + 1; while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } - if (j >= (int)ti.candidate.size()) { continue; } + if (j >= (int)ti.candidate.size()) { continue; } // no upgrade available size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; - if (cur_bytes + delta_bytes > budget_bytes) { continue; } + if (cur_bytes + delta_bytes > budget_bytes) { continue; } // won't fit in budget double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); - if (err_gain < epsilon) { continue; } // no real improvement 
+ if (err_gain < epsilon) { continue; } // no error improvement double ratio = err_gain / (double)delta_bytes; // error reduction per byte + if (is_important(tensor_name)) { ratio *= 2.0; } // important tensors get 2x boost + // For tie-breaking, prioritize the largest absolute error improvement. if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) { best_ratio = ratio; From 940db63144d7369f88145a099370cf1bd33a45b7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 3 Oct 2025 11:08:02 +0100 Subject: [PATCH 106/155] Select quantization type if target_bpw is set unless user specifies type and threads --- tools/quantize/quantize.cpp | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 69e03179b3..89cf0fbf80 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -497,6 +497,24 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { return true; } +static const char * get_ftype(const float bpw) { + const std::map quant_bpw = { + {1.5625, "IQ1_S"}, + {1.7500, "IQ1_M"}, + {2.0625, "IQ2_XXS"}, + {2.6250, "Q2_K"}, + {3.0625, "IQ3_XXS"}, + {3.4375, "Q3_K"}, + {4.2500, "IQ4_XS"}, + {4.5000, "Q4_K"}, + {5.5000, "Q5_K"}, + {6.5625, "Q6_K"}, + {8.5000, "Q8_0"} + }; + + return quant_bpw.lower_bound(bpw)->second; +} + int main(int argc, char ** argv) { if (argc < 3) { usage(argv[0]); @@ -655,6 +673,7 @@ int main(int argc, char ** argv) { std::string ftype_str; std::string suffix = ".gguf"; + std::vector tmp_argv(argv, argv + argc); if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { std::string fpath; const size_t pos = fname_inp.find_last_of("/\\"); @@ -678,7 +697,21 @@ int main(int argc, char ** argv) { } arg_idx++; - if (argc <= arg_idx) { + // select quantization type if target_bpw is set unless user specifies type and threads + if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) { + auto * ftype = const_cast(get_ftype(params.target_bpw)); + if (argc == arg_idx) { + tmp_argv.push_back(ftype); + tmp_argv.push_back(nullptr); + argv = const_cast(tmp_argv.data()); + argc++; + } else { + tmp_argv.insert(tmp_argv.end() - 1, ftype); + tmp_argv.push_back(nullptr); + argv = const_cast(tmp_argv.data()); + argc++; + } + } else if (argc <= arg_idx) { fprintf(stderr, "%s: missing ftype\n", __func__); return 1; } From 66d4aed173aba8b3b4e05c6d7b46ca8911ec7ddf Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 4 Oct 2025 08:21:01 +0100 Subject: [PATCH 107/155] Minor refactoring --- tools/quantize/quantize.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 89cf0fbf80..d355f97274 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -700,17 +700,11 @@ int main(int argc, char ** argv) { // select quantization type if target_bpw is set unless user specifies type and threads if (argc - arg_idx <= 1 && params.target_bpw != -1.0f) { auto * ftype = const_cast(get_ftype(params.target_bpw)); - if (argc == arg_idx) { - tmp_argv.push_back(ftype); - tmp_argv.push_back(nullptr); - argv = const_cast(tmp_argv.data()); - argc++; - } else { - tmp_argv.insert(tmp_argv.end() - 1, ftype); - tmp_argv.push_back(nullptr); - argv = const_cast(tmp_argv.data()); - argc++; - } + if (argc == arg_idx) { tmp_argv.push_back(ftype); } + else { tmp_argv.insert(tmp_argv.end() - 1, ftype); } + 
tmp_argv.push_back(nullptr); + argv = const_cast(tmp_argv.data()); + argc++; } else if (argc <= arg_idx) { fprintf(stderr, "%s: missing ftype\n", __func__); return 1; From 560e8c9d70964320a0283936b0d8e9fd198356ee Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 14:41:42 +0100 Subject: [PATCH 108/155] Relax lambda clamping --- src/llama-quant.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a93d982e63..422c929f0c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -701,7 +701,7 @@ static std::unordered_map target_bpw_type( }; auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) return typ; + if (is_compatible(t, typ)) { return typ; } ggml_type fb = fallback_type(typ); return is_compatible(t, fb) ? fb : GGML_TYPE_F16; }; @@ -941,7 +941,7 @@ static std::unordered_map target_bpw_type( if (s1 > 0.0) { const auto n = (double)n_per_row; const double c = std::max(0.0, s2 / (s1 * s1 + epsilon) - 1.0 / n); - l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 12.0); + l = (float)std::clamp(12.0 * (c / (c + 1.0)), 0.0, 16.0); } lambdas[(size_t)s] = l; @@ -1035,7 +1035,7 @@ static std::unordered_map target_bpw_type( for (int64_t r = offset; r < nrows_total && current < rows_sample_max; r += stride) { const uint8_t * src_row = (const uint8_t *)tensor->data + slice * (src_row_sz * nrows_total) + r * src_row_sz; if (src_type == GGML_TYPE_F32) { - auto src_f32 = (const float *)src_row; + const auto *src_f32 = (const float *)src_row; f32_sample.insert(f32_sample.end(), src_f32, src_f32 + n_per_row); } else { row_to_fp32(src_row, row_buffer.data()); @@ -1173,7 +1173,7 @@ static std::unordered_map target_bpw_type( // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve auto pareto_convex = [](std::vector & candidates) { - if (candidates.empty()) return; + if (candidates.empty()) { return; } std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { if (a.bytes != b.bytes) { return a.bytes < b.bytes; } From 533cda3076b5ae26d120f04b7aaa813f7b7a5ac7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:16:33 +0100 Subject: [PATCH 109/155] Add signal handler --- src/llama-quant.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 422c929f0c..50c8dbf423 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -613,6 +614,12 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } +static std::atomic bpw_stop{ false }; + +static void signal_handler(int) { + bpw_stop.store(true, std::memory_order_relaxed); +} + // Returns tensor type overrides to meet a global bpw target static std::unordered_map target_bpw_type( llama_model_loader & ml, @@ -711,6 +718,22 @@ static std::unordered_map target_bpw_type( return is_quantizable(ggml_get_name(t), model.arch, params); }; + auto install_signal_handlers = [] { + static std::once_flag once; + std::call_once(once, [] { + std::signal(SIGINT, signal_handler); + std::signal(SIGTERM, signal_handler); + }); + }; + + auto uninstall_signal_handlers = [] { + static std::once_flag once; + std::call_once(once, [] { + std::signal(SIGINT, SIG_DFL); + std::signal(SIGTERM, SIG_DFL); + }); + }; + // Estimate error for a given type using a 
sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, From e48ca32f19095ba0c47058dc7a703c1bb52977e0 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:17:27 +0100 Subject: [PATCH 110/155] Add save_bpw_state() --- src/llama-quant.cpp | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 50c8dbf423..3080b0ed71 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -734,6 +734,56 @@ static std::unordered_map target_bpw_type( }); }; + // Saved state per tensor + struct saved_info { + std::vector candidate; + int choice = -1; + float min_bpw = 0.0f; + float max_bpw = 0.0f; + size_t n_elements = 0; + }; + + auto save_bpw_state = [&](const std::vector & all_vec) { + const std::string tmp = checkpoint_file + ".tmp"; + std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); + if (!ofs) { return; } // best-effort + const float target_bpw = params->target_bpw; + const uint8_t bias_mode = params->no_bias ? 1 : 0; + ofs.write((const char *)&file_magic, sizeof(file_magic)); + ofs.write((const char *)&target_bpw, sizeof(target_bpw)); + ofs.write((const char *)&bias_mode, sizeof(bias_mode)); + const uint64_t n = all_vec.size(); + ofs.write((const char *)&n, sizeof(n)); + for (const auto & ti : all_vec) { + const std::string name = ggml_get_name(ti.w->tensor); + const uint32_t len = (uint32_t)name.size(); + ofs.write((const char *)&len, sizeof(len)); + ofs.write(name.data(), len); + + const uint64_t cn = ti.candidate.size(); + ofs.write((const char *)&cn, sizeof(cn)); + ofs.write((const char *)&ti.choice, sizeof(ti.choice)); + ofs.write((const char *)&ti.min_bpw, sizeof(ti.min_bpw)); + ofs.write((const char *)&ti.max_bpw, sizeof(ti.max_bpw)); + const uint64_t ne = ti.n_elements; + ofs.write((const char *)&ne, sizeof(ne)); + + for (const auto & c : ti.candidate) { + const int32_t t = c.type; + const uint64_t b = c.bytes; + ofs.write((const char *)&t, sizeof(t)); + ofs.write((const char *)&c.bpw, sizeof(c.bpw)); + ofs.write((const char *)&b, sizeof(b)); + ofs.write((const char *)&c.error, sizeof(c.error)); + } + } + + ofs.close(); + std::remove(checkpoint_file.c_str()); // TODO: handle errors + std::rename(tmp.c_str(), checkpoint_file.c_str()); + LLAMA_LOG_INFO("%s: saved bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); + }; + // Estimate error for a given type using a sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, From 02c3073b81cc7fa26219419c517331b3e3243379 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:18:36 +0100 Subject: [PATCH 111/155] Add load_bpw_state() --- src/llama-quant.cpp | 64 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3080b0ed71..4d0dc6a36e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -672,7 +672,9 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); + constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; + const std::string checkpoint_file = ml.arch_name + ".bpw_state"; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; @@ -784,6 +786,68 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_INFO("%s: saved bpw progress for %lu 
tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); }; + auto load_bpw_state = [&]() -> std::unordered_map { + std::unordered_map out; + std::ifstream ifs(checkpoint_file, std::ios::binary); + if (!ifs) { return out; } + + uint32_t magic = 0; + float target_bpw = 0.0f; + uint8_t bias_mode = 0; + ifs.read((char *)&magic, sizeof(magic)); + ifs.read((char *)&target_bpw, sizeof(target_bpw)); + ifs.read((char *)&bias_mode, sizeof(bias_mode)); + if (magic != file_magic) { + LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); + return out; + } + if (target_bpw != params->target_bpw) { + LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, target_bpw, checkpoint_file.c_str()); + return out; + } + if (bias_mode != (params->no_bias ? 1 : 0)) { + LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str()); + return out; + } + + uint64_t n = 0; + ifs.read((char *)&n, sizeof(n)); + for (uint64_t i = 0; i < n; ++i) { + uint32_t len = 0; + ifs.read((char *)&len, sizeof(len)); + std::string name(len, '\0'); + ifs.read(name.data(), len); + + uint64_t cn = 0; + ifs.read((char *)&cn, sizeof(cn)); + + saved_info si; + ifs.read((char *)&si.choice, sizeof(si.choice)); + ifs.read((char *)&si.min_bpw, sizeof(si.min_bpw)); + ifs.read((char *)&si.max_bpw, sizeof(si.max_bpw)); + uint64_t ne = 0; + ifs.read((char *)&ne, sizeof(ne)); + si.n_elements = (size_t)ne; + + si.candidate.resize(cn); + for (size_t j = 0; j < si.candidate.size(); ++j) { + int32_t t = 0; + uint64_t b = 0; + ifs.read((char *)&t, sizeof(t)); + si.candidate[j].type = (ggml_type)t; + ifs.read((char *)&si.candidate[j].bpw, sizeof(si.candidate[j].bpw)); + ifs.read((char *)&b, sizeof(b)); + si.candidate[j].bytes = (size_t)b; + ifs.read((char *)&si.candidate[j].error, sizeof(si.candidate[j].error)); + } + + out.emplace(std::move(name), std::move(si)); + } + + LLAMA_LOG_INFO("%s: loaded bpw state for %lu tensors from %s\n", func, out.size(), checkpoint_file.c_str()); + return out; + }; + // Estimate error for a given type using a sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, From 74c62ed4e63e4e95f031875b6ead5718f5fb900a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:19:03 +0100 Subject: [PATCH 112/155] Add delete_bpw_state() --- src/llama-quant.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4d0dc6a36e..9212c88563 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -848,6 +848,19 @@ static std::unordered_map target_bpw_type( return out; }; + auto delete_bpw_state = [&] { + LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); + std::remove(checkpoint_file.c_str()); + }; + + auto check_signal_handler = [&](const std::vector & all_vec) { + if (bpw_stop.load(std::memory_order_relaxed)) { + LLAMA_LOG_INFO("\n%s: saving bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); + save_bpw_state(all_vec); + throw std::runtime_error("user interrupted the process"); + } + }; + // Estimate error for a given type using a sampled subset of rows auto estimate_error = [&](const ggml_tensor * t, const ggml_type quant_type, From 46706cec28ad83b8ab10781493b84343b5b0f048 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:20:28 +0100 Subject: [PATCH 113/155] Persist progress --- src/llama-quant.cpp | 25 ++++++++++++++++++++++++- 1 file 
changed, 24 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9212c88563..640672aec7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1100,12 +1100,28 @@ static std::unordered_map target_bpw_type( return lambdas; }; + install_signal_handlers(); + auto bpw_data = load_bpw_state(); std::vector all; all.reserve(tensors.size()); for (const auto * tw : tensors) { ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); if (!can_quantize(tensor)) { continue; } + check_signal_handler(all); + + // If we already have fully evaluatedd this tensor then reuse it + if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) { + tensor_info info; + info.w = tw; + info.candidate = it_saved->second.candidate; + info.choice = it_saved->second.choice; + info.min_bpw = it_saved->second.min_bpw; + info.max_bpw = it_saved->second.max_bpw; + info.n_elements = it_saved->second.n_elements ? it_saved->second.n_elements : (size_t)ggml_nelements(tensor); + all.push_back(std::move(info)); + continue; + } LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor)); if (!ml.use_mmap) { @@ -1296,6 +1312,7 @@ static std::unordered_map target_bpw_type( std::vector tl_quantized_buffer(quantized_buffer.size()); std::vector tl_dequantized_buffer(dequantized_buffer.size()); for (;;) { + if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel); if (i >= compatible_candidates.size()) { break; } @@ -1311,6 +1328,11 @@ static std::unordered_map target_bpw_type( for (auto &th : eval_workers) { th.join(); } + // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry + if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) { + check_signal_handler(all); + } + for (auto &c : eval_candidates) { if (c.bytes > 0) { info.candidate.push_back(c); } } @@ -1384,6 +1406,7 @@ static std::unordered_map target_bpw_type( info.min_bpw = info.candidate.front().bpw; info.max_bpw = info.candidate.back().bpw; all.push_back(std::move(info)); + check_signal_handler(all); // save after each tensor } if (all.empty()) { return {}; } @@ -1441,7 +1464,7 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } if (budget_bytes >= max_bytes) { - for (auto & ti : all) { ti.choice = (int) ti.candidate.size() - 1; } + for (auto & ti : all) { ti.choice = (int)ti.candidate.size() - 1; } return emit_overrides(); } From 84ada44894dec721124613820bf640b97ac3e784 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 5 Oct 2025 20:20:56 +0100 Subject: [PATCH 114/155] Uninstall signal handler and cleanup --- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 640672aec7..eb5c9124b5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1625,6 +1625,9 @@ static std::unordered_map target_bpw_type( } } + delete_bpw_state(); // we're done, clear any checkpoint + uninstall_signal_handlers(); + return emit_overrides(); } From 044fa783c7e5e87bddf667fbe7396628e827b455 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 6 Oct 2025 21:40:37 +0100 Subject: [PATCH 115/155] Fix trimming logic --- src/llama-quant.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 
eb5c9124b5..aeb1542607 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -849,8 +849,12 @@ static std::unordered_map target_bpw_type( }; auto delete_bpw_state = [&] { - LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); - std::remove(checkpoint_file.c_str()); + std::ifstream ifs(checkpoint_file); + if (ifs.good()) { + LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); + std::remove(checkpoint_file.c_str()); + } + }; auto check_signal_handler = [&](const std::vector & all_vec) { @@ -988,14 +992,16 @@ static std::unordered_map target_bpw_type( } // Compute error per slice with trimmed aggregation - auto trimmed_sum = [](std::vector & v) -> double { + auto trimmed_mean = [](std::vector & v) -> double { const int64_t n = (int64_t)v.size(); if (n == 0) { return 0.0; } - if (n < 50) { return std::accumulate(v.begin(), v.end(), 0.0); } // use all samples for small datasets - - int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 2.5% from each tail of the distribution + double sum = std::accumulate(v.begin(), v.end(), 0.0); + if (n < 50) { return sum / (double)n; } // too few elements to trim + int64_t k = (int64_t) std::floor(0.025 * (double)n); // trim 5% (2.5% each side) std::sort(v.begin(), v.end()); - return std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); + const auto num = (double)(n - 2 * k); + sum = std::accumulate(v.begin() + k, v.begin() + (n - k), 0.0); + return sum / std::max(1.0, num); }; size_t off = 0; @@ -1028,7 +1034,7 @@ static std::unordered_map target_bpw_type( } const double denom_x = row_sq_norm[ridx]; - const double m_norm = w_mse / (denom_x + epsilon); + const double m_norm = w_mse / (denom_x + epsilon); row_mse_norm.push_back(std::isfinite(m_norm) ? m_norm : infinity); if (a) { @@ -1044,9 +1050,8 @@ static std::unordered_map target_bpw_type( off += (size_t)n_per_row; } - const double scale_rows = (double)nrows / std::max(1.0, (double)rs); - const double slice_mse = trimmed_sum(row_mse_norm) * scale_rows; - const double slice_proj = a ? trimmed_sum(row_proj_norm) * scale_rows : 0.0; + const double slice_mse = trimmed_mean(row_mse_norm) * (double)nrows; + const double slice_proj = a ? trimmed_mean(row_proj_norm) * (double)nrows : 0.0; total_mse += slice_mse; total_proj += slice_proj; From c11184a3c11917aba2c3d360a9cbb3bf3ebaf38a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 9 Oct 2025 11:58:01 +0100 Subject: [PATCH 116/155] Generate model ID hash --- src/llama-quant.cpp | 51 +++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index aeb1542607..5388d5a072 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -674,7 +674,6 @@ static std::unordered_map target_bpw_type( constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; - const std::string checkpoint_file = ml.arch_name + ".bpw_state"; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; @@ -745,6 +744,26 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; + auto djb2_hash = [](const uint8_t * data, size_t n) -> uint64_t { + uint64_t h = 5381; + for (size_t i = 0; i < n; ++i) { + h = (h << 5) + h + data[i]; + } + return h ? 
h : 0xeabada55cafed00d; + }; + + auto metadata_id = [&](const gguf_context * ctx) -> uint64_t { + const size_t sz = gguf_get_meta_size(ctx); + std::vector buf(sz); + gguf_get_meta_data(ctx, buf.data()); + return djb2_hash(buf.data(), buf.size()); + }; + + char hex[17]; + const uint64_t model_id = metadata_id(ml.meta.get()); + std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id); + const std::string checkpoint_file = ml.arch_name + "-" + std::string(hex) + ".bpw_state"; + auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); @@ -752,6 +771,7 @@ static std::unordered_map target_bpw_type( const float target_bpw = params->target_bpw; const uint8_t bias_mode = params->no_bias ? 1 : 0; ofs.write((const char *)&file_magic, sizeof(file_magic)); + ofs.write((const char *)&model_id, sizeof(model_id)); ofs.write((const char *)&target_bpw, sizeof(target_bpw)); ofs.write((const char *)&bias_mode, sizeof(bias_mode)); const uint64_t n = all_vec.size(); @@ -781,9 +801,9 @@ static std::unordered_map target_bpw_type( } ofs.close(); - std::remove(checkpoint_file.c_str()); // TODO: handle errors + std::remove(checkpoint_file.c_str()); std::rename(tmp.c_str(), checkpoint_file.c_str()); - LLAMA_LOG_INFO("%s: saved bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); + LLAMA_LOG_INFO("%s: saved progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); }; auto load_bpw_state = [&]() -> std::unordered_map { @@ -792,22 +812,27 @@ static std::unordered_map target_bpw_type( if (!ifs) { return out; } uint32_t magic = 0; - float target_bpw = 0.0f; - uint8_t bias_mode = 0; + uint64_t id = 0; + float bpw = 0.0f; + uint8_t bias = 0; ifs.read((char *)&magic, sizeof(magic)); - ifs.read((char *)&target_bpw, sizeof(target_bpw)); - ifs.read((char *)&bias_mode, sizeof(bias_mode)); + ifs.read((char *)&id, sizeof(id)); + ifs.read((char *)&bpw, sizeof(bpw)); + ifs.read((char *)&bias, sizeof(bias)); if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } - if (target_bpw != params->target_bpw) { - LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, target_bpw, checkpoint_file.c_str()); + } else if (id != model_id) { + LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } - if (bias_mode != (params->no_bias ? 1 : 0)) { + } else if (bpw != params->target_bpw) { + LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str()); + return out; + } else if (bias != (params->no_bias ? 
1 : 0)) { LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str()); return out; + } else { + LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); } uint64_t n = 0; @@ -859,7 +884,7 @@ static std::unordered_map target_bpw_type( auto check_signal_handler = [&](const std::vector & all_vec) { if (bpw_stop.load(std::memory_order_relaxed)) { - LLAMA_LOG_INFO("\n%s: saving bpw progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); + LLAMA_LOG_INFO("\n%s: saving progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); save_bpw_state(all_vec); throw std::runtime_error("user interrupted the process"); } From 3a3d807fc3aacc01715047bcc893f925f5343c6b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 10 Oct 2025 13:10:42 +0100 Subject: [PATCH 117/155] Remove bias mode computation --- src/llama-quant.cpp | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5388d5a072..7b3e956193 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -769,11 +769,9 @@ static std::unordered_map target_bpw_type( std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); if (!ofs) { return; } // best-effort const float target_bpw = params->target_bpw; - const uint8_t bias_mode = params->no_bias ? 1 : 0; ofs.write((const char *)&file_magic, sizeof(file_magic)); ofs.write((const char *)&model_id, sizeof(model_id)); ofs.write((const char *)&target_bpw, sizeof(target_bpw)); - ofs.write((const char *)&bias_mode, sizeof(bias_mode)); const uint64_t n = all_vec.size(); ofs.write((const char *)&n, sizeof(n)); for (const auto & ti : all_vec) { @@ -814,11 +812,9 @@ static std::unordered_map target_bpw_type( uint32_t magic = 0; uint64_t id = 0; float bpw = 0.0f; - uint8_t bias = 0; ifs.read((char *)&magic, sizeof(magic)); ifs.read((char *)&id, sizeof(id)); ifs.read((char *)&bpw, sizeof(bpw)); - ifs.read((char *)&bias, sizeof(bias)); if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; @@ -828,9 +824,6 @@ static std::unordered_map target_bpw_type( } else if (bpw != params->target_bpw) { LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str()); return out; - } else if (bias != (params->no_bias ? 1 : 0)) { - LLAMA_LOG_WARN("%s: bias mode does not match, ignoring: %s\n", func, checkpoint_file.c_str()); - return out; } else { LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); } @@ -1319,13 +1312,11 @@ static std::unordered_map target_bpw_type( std::vector lambdas; const float * values = values_sample.empty() ? nullptr : values_sample.data(); const float * activations = activations_sample.empty() ? nullptr : activations_sample.data(); - if (!params->no_bias) { - double acc = 0.0; - int ns = 0; - lambdas = estimate_lambda(values, activations, n_per_row, ne2); - for (float l : lambdas) { acc += l; ++ns; } - tensor_lambda = ns ? (float)(acc / ns) : 0.0f; - } + double acc = 0.0; + int ns = 0; + lambdas = estimate_lambda(values, activations, n_per_row, ne2); + for (float l : lambdas) { acc += l; ++ns; } + tensor_lambda = ns ? 
(float)(acc / ns) : 0.0f; // Evaluate candidates std::vector eval_candidates(compatible_candidates.size()); @@ -1925,11 +1916,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { if (params->activations) { - LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate - ",__func__); + LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n",__func__); } else { - LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate - ", __func__); + LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); } - LLAMA_LOG_INFO("using %s error estimation\n", params->no_bias ? "MSE only (no alignment bias)" : "alignment bias (default)"); LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); } else { From c93131cef6dbb4e415fd2b3625f644c6714e7465 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Fri, 10 Oct 2025 13:26:51 +0100 Subject: [PATCH 118/155] Remove --no-bias option --- include/llama.h | 1 - src/llama-quant.cpp | 3 +-- tools/quantize/quantize.cpp | 6 +----- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/llama.h b/include/llama.h index 16f6124727..1df8f96920 100644 --- a/include/llama.h +++ b/include/llama.h @@ -365,7 +365,6 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) - bool no_bias; // use mean square error estimation only (no aligment bias) } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7b3e956193..4ad5124d1a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -2180,8 +2180,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, - /*.target_bpw =*/ -1.0f, - /*.no_bias =*/ false + /*.target_bpw =*/ -1.0f }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index d355f97274..c254c3f6b2 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); - printf(" [--target-bpw n] [--no-bias] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--target-bpw n] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); @@ -134,8 +134,6 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --no-bias: use mean square error estimation only (no aligment bias)\n"); - printf(" Advanced option use MSE only and disable aligment bias error estimation\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -559,8 +557,6 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--no-bias") == 0) { - params.no_bias = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From 5b0d3f6d5ad46596e0f30c967c00e2dc2b93d8da Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 11 Oct 2025 10:04:48 +0100 Subject: [PATCH 119/155] Automatically determine if bias error is significant --- src/llama-quant.cpp | 52 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ad5124d1a..07a88f0fd6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -637,6 +637,8 @@ static std::unordered_map target_bpw_type( float bpw; size_t bytes; double error; + double mse = 0.0; + double proj = 0.0; }; struct tensor_info { @@ -1340,9 +1342,11 @@ static std::unordered_map target_bpw_type( const ggml_type tensor_types = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(tensor, tensor_types); const size_t bytes = tensor_bytes(tensor, tensor_types); + double mse = 0.0; + double proj = 0.0; const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, - tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda); - eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err }; + tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); + eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj }; } }); } @@ -1354,8 +1358,48 @@ static std::unordered_map target_bpw_type( check_signal_handler(all); } - for (auto &c : eval_candidates) { - if (c.bytes > 0) { info.candidate.push_back(c); } + // Check if biasing is needed + bool bias_needed = false; + if (!lambdas.empty()) { + int min_mse = -1; + int min_bias = -1; + { + double best_mse = std::numeric_limits::infinity(); + double best_err = std::numeric_limits::infinity(); + for (int i = 0; i < (int)eval_candidates.size(); ++i) { + const auto & c = eval_candidates[i]; + if (c.bytes == 0) { continue; } + if (c.mse < best_mse) { + best_mse = c.mse; + min_mse = i; + } + if (c.error < best_err) { + best_err = c.error; + min_bias = i; + } + } + } + + if (min_mse != min_bias) { + bias_needed = true; + } else { + double max_rel_bias = 0.0; + for (const auto & c : eval_candidates) { + if (c.bytes == 0) { continue; } + const double mse = std::max(c.mse, epsilon); + const double bias_term = std::max(0.0, 
c.error - c.mse); + const double rel = bias_term / mse; + max_rel_bias = std::max(rel, max_rel_bias); + } + + bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE? + } + } + + for (auto & c : eval_candidates) { + if (c.bytes == 0) { continue; } + const double final_err = bias_needed ? c.error : c.mse; + info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); } if (info.candidate.empty()) { From 12e0524f3a24d4d5c8a81546fff83fee81e0d3e1 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 12 Oct 2025 15:12:15 +0100 Subject: [PATCH 120/155] Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 --- src/llama-quant.cpp | 187 +++++++++++++++++++++++--------------------- 1 file changed, 100 insertions(+), 87 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 07a88f0fd6..c607651b05 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -15,6 +15,7 @@ #include #include #include +#include // Quantization types. Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { @@ -623,7 +624,6 @@ static void signal_handler(int) { // Returns tensor type overrides to meet a global bpw target static std::unordered_map target_bpw_type( llama_model_loader & ml, - std::vector> & buffer, const llama_model & model, const std::vector & tensors, const std::map & mapped, @@ -659,6 +659,7 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ3_XXS, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, + GGML_TYPE_IQ4_NL, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, @@ -1127,16 +1128,22 @@ static std::unordered_map target_bpw_type( install_signal_handlers(); auto bpw_data = load_bpw_state(); - std::vector all; - all.reserve(tensors.size()); - for (const auto * tw : tensors) { + + // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 + auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, + std::vector> & thread_local_buffer, + std::mutex & loader_mutex, + std::mutex & log_mutex) -> std::optional + { ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); - if (!can_quantize(tensor)) { continue; } - check_signal_handler(all); + if (bpw_stop.load(std::memory_order_relaxed)) { + return std::nullopt; + } - // If we already have fully evaluatedd this tensor then reuse it - if (auto it_saved = bpw_data.find(name); it_saved != bpw_data.end()) { + // check for pre-computed results from a checkpoint file. + auto it_saved = bpw_data.find(name); + if (it_saved != bpw_data.end()) { tensor_info info; info.w = tw; info.candidate = it_saved->second.candidate; @@ -1144,17 +1151,21 @@ static std::unordered_map target_bpw_type( info.min_bpw = it_saved->second.min_bpw; info.max_bpw = it_saved->second.max_bpw; info.n_elements = it_saved->second.n_elements ? 
it_saved->second.n_elements : (size_t)ggml_nelements(tensor); - all.push_back(std::move(info)); - continue; + return info; + } + { + std::lock_guard lock(log_mutex); + LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s \t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor)); } - LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", __func__, name.c_str(), ggml_nelements(tensor)); if (!ml.use_mmap) { - if (buffer.size() < ggml_nbytes(tensor)) { buffer.resize(ggml_nbytes(tensor)); } - tensor->data = buffer.data(); + if (thread_local_buffer.size() < ggml_nbytes(tensor)) { thread_local_buffer.resize(ggml_nbytes(tensor)); } + tensor->data = thread_local_buffer.data(); + } + { + std::lock_guard lock(loader_mutex); + ml.load_data_for(tensor); } - - ml.load_data_for(tensor); // Dequantize sampled rows into f32_sample const int64_t n_per_row = tensor->ne[0]; @@ -1170,7 +1181,7 @@ static std::unordered_map target_bpw_type( const int64_t max_rows = 4096; int64_t total_rows = std::llround(slice_budget / std::max(1, n)); total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows))); - if (rows <= min_rows * 2) { total_rows = rows; } // use all rows for small tensors + if (rows <= min_rows * 2) { total_rows = rows; } return total_rows; }; @@ -1191,17 +1202,16 @@ static std::unordered_map target_bpw_type( return; } if (t == GGML_TYPE_F16) { - ggml_fp16_to_fp32_row((const ggml_fp16_t *) src, dst, (int)n_per_row); + ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int)n_per_row); return; } if (t == GGML_TYPE_BF16) { - ggml_bf16_to_fp32_row((const ggml_bf16_t *) src, dst, (int)n_per_row); + ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int)n_per_row); return; } - if (src_is_quant) { GGML_ASSERT(src_traits && src_traits->to_float); - src_traits->to_float(src, dst, (int) n_per_row); + src_traits->to_float(src, dst, (int)n_per_row); return; } @@ -1266,6 +1276,7 @@ static std::unordered_map target_bpw_type( return; } + std::lock_guard lock(log_mutex); LLAMA_LOG_WARN("%s: side data size mismatch for %s: got %zu, expected %zu or %zu; ignoring\n", func, name.c_str(), src_sz, (size_t)n_per_row, want); }; @@ -1276,12 +1287,9 @@ static std::unordered_map target_bpw_type( if (values_all) { copy_or_broadcast(values_all, values_sz, values_sample); } if (activations_all) { copy_or_broadcast(activations_all, activations_sz, activations_sample); } - const int64_t nelem = ggml_nelements(tensor); tensor_info info; info.w = tw; - info.n_elements = nelem; - - // Prepare scratch buffers sized for the largest candidate row size + info.n_elements = ggml_nelements(tensor); size_t total_sampled_rows = f32_sample.size() / n_per_row; // Build list of candidate types first (compatible ones) @@ -1295,7 +1303,8 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s for %s, no or mismatched imatrix\n", __func__, ggml_type_name(ts_type), name.c_str()); + std::lock_guard lock(log_mutex); + LLAMA_LOG_WARN("\t%s: skipping %s for %s, no or mismatched imatrix\n", func, ggml_type_name(ts_type), name.c_str()); continue; } @@ -1325,58 +1334,38 @@ static std::unordered_map target_bpw_type( std::vector quantized_buffer(max_row_sz * total_sampled_rows); std::vector dequantized_buffer(f32_sample.size()); const float * slice_lambda = lambdas.empty() ? 
nullptr : lambdas.data(); - int n_eval_threads = std::max(1, std::min(nthread, (int)compatible_candidates.size())); - std::atomic cidx{0}; - std::vector eval_workers; - eval_workers.reserve(n_eval_threads); - for (int ti = 0; ti < n_eval_threads; ++ti) { - eval_workers.emplace_back([&] { - // thread-local scratch - std::vector tl_quantized_buffer(quantized_buffer.size()); - std::vector tl_dequantized_buffer(dequantized_buffer.size()); - for (;;) { - if (bpw_stop.load(std::memory_order_relaxed)) { break; } // stop if a signal arrived - const size_t i = cidx.fetch_add(1, std::memory_order_acq_rel); - if (i >= compatible_candidates.size()) { break; } + for (size_t i = 0; i < compatible_candidates.size(); ++i) { + if (bpw_stop.load(std::memory_order_relaxed)) { break; } - const ggml_type tensor_types = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(tensor, tensor_types); - const size_t bytes = tensor_bytes(tensor, tensor_types); - double mse = 0.0; - double proj = 0.0; - const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, - tl_quantized_buffer, tl_dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); - eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj }; - } - }); + const ggml_type tensor_types = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(tensor, tensor_types); + const size_t bytes = tensor_bytes(tensor, tensor_types); + double mse = 0.0; + double proj = 0.0; + const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, + quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); + eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj }; } - for (auto &th : eval_workers) { th.join(); } - - // If interruption happened mid-evaluation, exit without adding a half-baked tensor entry - if (bpw_stop.load(std::memory_order_relaxed) && cidx.load(std::memory_order_relaxed) < compatible_candidates.size()) { - check_signal_handler(all); - } + if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } // Check if biasing is needed bool bias_needed = false; if (!lambdas.empty()) { int min_mse = -1; int min_bias = -1; - { - double best_mse = std::numeric_limits::infinity(); - double best_err = std::numeric_limits::infinity(); - for (int i = 0; i < (int)eval_candidates.size(); ++i) { - const auto & c = eval_candidates[i]; - if (c.bytes == 0) { continue; } - if (c.mse < best_mse) { - best_mse = c.mse; - min_mse = i; - } - if (c.error < best_err) { - best_err = c.error; - min_bias = i; - } + double best_mse = std::numeric_limits::infinity(); + double best_err = std::numeric_limits::infinity(); + for (int i = 0; i < (int)eval_candidates.size(); ++i) { + const auto & c = eval_candidates[i]; + if (c.bytes == 0) { continue; } + if (c.mse < best_mse) { + best_mse = c.mse; + min_mse = i; + } + if (c.error < best_err) { + best_err = c.error; + min_bias = i; } } @@ -1388,8 +1377,7 @@ static std::unordered_map target_bpw_type( if (c.bytes == 0) { continue; } const double mse = std::max(c.mse, epsilon); const double bias_term = std::max(0.0, c.error - c.mse); - const double rel = bias_term / mse; - max_rel_bias = std::max(rel, max_rel_bias); + max_rel_bias = std::max(bias_term / mse, max_rel_bias); } bias_needed = max_rel_bias >= 0.5; // >= 50% of MSE? 
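For readers tracking the candidate-selection logic across these hunks, the following is a minimal standalone sketch of the Pareto filtering idea that the next hunk reworks: per-tensor (bytes, error) candidates are sorted by size and only those that strictly reduce error are kept. The names `cand` and `pareto_front` are illustrative only and do not appear in the patch itself.

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    struct cand { std::size_t bytes; double error; };

    // Keep only candidates whose error is strictly lower than every cheaper candidate,
    // so error becomes non-increasing as bytes grow.
    static std::vector<cand> pareto_front(std::vector<cand> v) {
        std::sort(v.begin(), v.end(), [](const cand & a, const cand & b) {
            if (a.bytes != b.bytes) { return a.bytes < b.bytes; }
            return a.error < b.error;
        });
        std::vector<cand> front;
        double best_err = std::numeric_limits<double>::infinity();
        for (const cand & c : v) {
            if (c.error < best_err) {
                best_err = c.error;
                front.push_back(c);
            }
        }
        return front;
    }

A monotone front like this is what lets the later budget search (the greedy upgrades or the Lagrangian sweep) treat every extra byte spent on a tensor as buying a non-negative error reduction.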
@@ -1404,7 +1392,7 @@ static std::unordered_map target_bpw_type( if (info.candidate.empty()) { // As a last resort, keep original type - float bpw = ggml_nbytes(tensor) * 8.0f / nelem; + float bpw = ggml_nbytes(tensor) * 8.0f / info.n_elements; info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } @@ -1416,26 +1404,18 @@ static std::unordered_map target_bpw_type( if (a.bytes != b.bytes) { return a.bytes < b.bytes; } return a.error < b.error; }); - const auto last = std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + candidates.erase(std::unique(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { return a.bytes == b.bytes; - }); - candidates.erase(last, candidates.end()); - - // Pareto by bytes -> error + }), candidates.end()); std::vector pareto; pareto.reserve(candidates.size()); double best_err = infinity; - size_t last_b = std::numeric_limits::max(); for (const auto & c : candidates) { - if (c.bytes != last_b) { - last_b = c.bytes; - if (c.error < best_err) { - best_err = c.error; - pareto.push_back(c); - } + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); } } - candidates.swap(pareto); if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull @@ -1470,10 +1450,43 @@ static std::unordered_map target_bpw_type( info.choice = 0; info.min_bpw = info.candidate.front().bpw; info.max_bpw = info.candidate.back().bpw; - all.push_back(std::move(info)); - check_signal_handler(all); // save after each tensor + + return info; + }; + + std::vector all; // this vector will be populated by the parallel workers + { + std::atomic tensor_idx{0}; // shared work queue index for all threads + const size_t num_tensors_to_process = tensors.size(); + std::mutex loader_mutex; + std::mutex log_mutex; + std::mutex results_mutex; + std::vector workers; + int num_threads_to_spawn = std::max(1, std::min(nthread, (int)num_tensors_to_process)); + + for (int i = 0; i < num_threads_to_spawn; ++i) { + workers.emplace_back([&]() { + std::vector> thread_local_buffer; + while (true) { + const size_t current_idx = tensor_idx.fetch_add(1); + if (current_idx >= num_tensors_to_process) { break; } + const auto * tw = tensors[current_idx]; + if (!can_quantize(tw->tensor)) { continue; } + // Execute the main processing logic for this tensor + std::optional result_info = process_tensor(tw, thread_local_buffer, loader_mutex, log_mutex); + if (result_info) { + std::lock_guard lock(results_mutex); + all.push_back(std::move(*result_info)); + } + } + }); + } + + for (auto & w : workers) { w.join(); } } + check_signal_handler(all); + if (all.empty()) { return {}; } // Compute total elements across all tensors and bytes for non-quantizable tensors @@ -1965,7 +1978,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread); + bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); } else { LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__); } From b6094a97bfbd831a715ca366200f8b9372a26a0d Mon Sep 17 
00:00:00 2001 From: Ed Addario Date: Sun, 12 Oct 2025 16:30:35 +0100 Subject: [PATCH 121/155] Add quant types --- src/llama-quant.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c607651b05..56e63f9bb7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -655,8 +655,11 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, + GGML_TYPE_IQ2_S, GGML_TYPE_Q2_K, GGML_TYPE_IQ3_XXS, + GGML_TYPE_IQ3_S, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, @@ -1155,7 +1158,7 @@ static std::unordered_map target_bpw_type( } { std::lock_guard lock(log_mutex); - LLAMA_LOG_INFO("\ttarget_bpw_type: - processing tensor %45s \t(%12" PRId64 " elements)\n", name.c_str(), ggml_nelements(tensor)); + LLAMA_LOG_INFO("\t%s: - processing tensor %45s \t(%12" PRId64 " elements)\n", func, name.c_str(), ggml_nelements(tensor)); } if (!ml.use_mmap) { @@ -1457,19 +1460,19 @@ static std::unordered_map target_bpw_type( std::vector all; // this vector will be populated by the parallel workers { std::atomic tensor_idx{0}; // shared work queue index for all threads - const size_t num_tensors_to_process = tensors.size(); + const size_t tensors_to_process = tensors.size(); std::mutex loader_mutex; std::mutex log_mutex; std::mutex results_mutex; std::vector workers; - int num_threads_to_spawn = std::max(1, std::min(nthread, (int)num_tensors_to_process)); + int threads_to_spawn = std::max(1, std::min(nthread, (int)tensors_to_process)); - for (int i = 0; i < num_threads_to_spawn; ++i) { + for (int i = 0; i < threads_to_spawn; ++i) { workers.emplace_back([&]() { std::vector> thread_local_buffer; while (true) { const size_t current_idx = tensor_idx.fetch_add(1); - if (current_idx >= num_tensors_to_process) { break; } + if (current_idx >= tensors_to_process) { break; } const auto * tw = tensors[current_idx]; if (!can_quantize(tw->tensor)) { continue; } // Execute the main processing logic for this tensor From ca282302b5cde95945f8337e6df264d92e878501 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 12 Oct 2025 18:23:23 +0100 Subject: [PATCH 122/155] Add --keep-bpw-state option --- include/llama.h | 1 + src/llama-quant.cpp | 16 +++++----------- tools/quantize/quantize.cpp | 5 ++++- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/llama.h b/include/llama.h index 14e12d7c51..f745e2110b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -366,6 +366,7 @@ extern "C" { void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) + bool keep_bpw_state; // keep bpw state file } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 56e63f9bb7..4b243f1f55 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -659,7 +659,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_IQ2_S, GGML_TYPE_Q2_K, GGML_TYPE_IQ3_XXS, - GGML_TYPE_IQ3_S, GGML_TYPE_Q3_K, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ4_NL, @@ -773,11 +772,9 @@ static std::unordered_map target_bpw_type( auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); - if (!ofs) { return; } // best-effort - const float target_bpw = params->target_bpw; + if (!ofs) { return; } ofs.write((const char *)&file_magic, 
sizeof(file_magic)); ofs.write((const char *)&model_id, sizeof(model_id)); - ofs.write((const char *)&target_bpw, sizeof(target_bpw)); const uint64_t n = all_vec.size(); ofs.write((const char *)&n, sizeof(n)); for (const auto & ti : all_vec) { @@ -817,19 +814,14 @@ static std::unordered_map target_bpw_type( uint32_t magic = 0; uint64_t id = 0; - float bpw = 0.0f; ifs.read((char *)&magic, sizeof(magic)); ifs.read((char *)&id, sizeof(id)); - ifs.read((char *)&bpw, sizeof(bpw)); if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; } else if (id != model_id) { LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else if (bpw != params->target_bpw) { - LLAMA_LOG_WARN("%s: target bpw of %f does not match %f, ignoring: %s\n", func, params->target_bpw, bpw, checkpoint_file.c_str()); - return out; } else { LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); } @@ -874,7 +866,7 @@ static std::unordered_map target_bpw_type( auto delete_bpw_state = [&] { std::ifstream ifs(checkpoint_file); - if (ifs.good()) { + if (ifs.good() && !params->keep_bpw_state) { LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); std::remove(checkpoint_file.c_str()); } @@ -1489,6 +1481,7 @@ static std::unordered_map target_bpw_type( } check_signal_handler(all); + if (params->keep_bpw_state) { save_bpw_state(all); } if (all.empty()) { return {}; } @@ -2240,7 +2233,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, - /*.target_bpw =*/ -1.0f + /*.target_bpw =*/ -1.0f, + /*.keep_bpw_state =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index c254c3f6b2..ad2563a48d 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); - printf(" [--target-bpw n] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--target-bpw n] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); @@ -134,6 +134,7 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --keep-bpw-state: preserve the bpw computations in a state file\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -557,6 +558,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { + params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From b1b58e67df30453edd64706abda76d3c42f0bb03 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 14:54:32 +0100 Subject: [PATCH 123/155] Refactor signal handlers --- src/llama-quant.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4b243f1f55..d1fa429553 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -632,6 +632,22 @@ static std::unordered_map target_bpw_type( const llama_model_quantize_params * params, int nthread ) { + // RAII guard for signal handlers + bpw_stop.store(false, std::memory_order_relaxed); + struct signal_scope_guard { + using handler_t = void (*)(int); + handler_t prev_int = SIG_DFL; + handler_t prev_term = SIG_DFL; + signal_scope_guard() { + prev_int = std::signal(SIGINT, signal_handler); + prev_term = std::signal(SIGTERM, signal_handler); + } + ~signal_scope_guard() { + std::signal(SIGINT, prev_int); + std::signal(SIGTERM, prev_term); + } + } _signal_guard; + struct candidate_types { ggml_type type; float bpw; @@ -724,22 +740,6 @@ static std::unordered_map target_bpw_type( return is_quantizable(ggml_get_name(t), model.arch, params); }; - auto install_signal_handlers = [] { - static std::once_flag once; - std::call_once(once, [] { - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - }); - }; - - auto uninstall_signal_handlers = [] { - static std::once_flag once; - std::call_once(once, [] { - std::signal(SIGINT, SIG_DFL); - std::signal(SIGTERM, SIG_DFL); - }); - }; - // Saved state per tensor struct saved_info { std::vector candidate; @@ -1121,7 +1121,6 @@ static std::unordered_map target_bpw_type( return lambdas; }; - install_signal_handlers(); auto bpw_data = load_bpw_state(); // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 @@ -1700,7 +1699,6 @@ static std::unordered_map target_bpw_type( } delete_bpw_state(); // we're done, clear any checkpoint - uninstall_signal_handlers(); return emit_overrides(); } From cd734b89ce3b2af611fd168975a5921f33b475eb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 15:15:23 +0100 Subject: [PATCH 124/155] Update quant types --- src/llama-quant.cpp | 3 ++- tools/quantize/quantize.cpp | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d1fa429553..7543ec6961 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -681,7 +681,8 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, - GGML_TYPE_Q8_0 + GGML_TYPE_Q8_0, + 
GGML_TYPE_F16 }; const char * important_tensors[] = { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index ad2563a48d..e67649beb9 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -501,6 +501,8 @@ static const char * get_ftype(const float bpw) { {1.5625, "IQ1_S"}, {1.7500, "IQ1_M"}, {2.0625, "IQ2_XXS"}, + {2.3125, "IQ2_XS"}, + {2.5625, "IQ2_S"}, {2.6250, "Q2_K"}, {3.0625, "IQ3_XXS"}, {3.4375, "Q3_K"}, From b7911f14314387e4101957d4eb4df9650660c877 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 13 Oct 2025 17:46:45 +0100 Subject: [PATCH 125/155] Minor refactoring --- src/llama-quant.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 7543ec6961..0f256eface 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1122,9 +1122,9 @@ static std::unordered_map target_bpw_type( return lambdas; }; - auto bpw_data = load_bpw_state(); + const auto bpw_data = load_bpw_state(); - // Significantly reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 + // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, @@ -1330,7 +1330,7 @@ static std::unordered_map target_bpw_type( std::vector dequantized_buffer(f32_sample.size()); const float * slice_lambda = lambdas.empty() ? nullptr : lambdas.data(); for (size_t i = 0; i < compatible_candidates.size(); ++i) { - if (bpw_stop.load(std::memory_order_relaxed)) { break; } + if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } const ggml_type tensor_types = compatible_candidates[i]; const auto bpw = (float)tensor_bpw(tensor, tensor_types); @@ -1383,6 +1383,8 @@ static std::unordered_map target_bpw_type( if (c.bytes == 0) { continue; } const double final_err = bias_needed ? 
c.error : c.mse; info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); + // LLAMA_LOG_INFO("\t%s: %35s \t%10s \t%1.4f bpw \t%10zu bytes \t mse: %1.8e \t err: %1.8e\n", + // func, name.c_str(), ggml_type_name(c.type), c.bpw, c.bytes, c.mse, final_err); } if (info.candidate.empty()) { @@ -1426,7 +1428,7 @@ static std::unordered_map target_bpw_type( }; while (hull.size() >= 2) { - if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= -1 * epsilon) { // very small negative tolerance + if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { hull.pop_back(); } else { break; @@ -1670,7 +1672,6 @@ static std::unordered_map target_bpw_type( const auto & ti = all[i]; const std::string tensor_name = ggml_get_name(ti.w->tensor); int j = ti.choice + 1; - while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } // no upgrade available size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; From a6853ea2ae7d828e535874e6f2244786921df594 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 11:20:24 +0100 Subject: [PATCH 126/155] Add tensor type and depth heuristics --- src/llama-quant.cpp | 94 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0f256eface..38d20e3d0f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -16,6 +16,7 @@ #include #include #include +#include // Quantization types. Changes to this struct must be replicated in quantize.cpp struct tensor_quantization { @@ -685,13 +686,6 @@ static std::unordered_map target_bpw_type( GGML_TYPE_F16 }; - const char * important_tensors[] = { - ".output.weight", - ".attn_output.weight", - ".ffn_down.weight", - ".ffn_down_shexp.weight" - }; - constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 @@ -1544,11 +1538,89 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto is_important = [&](const std::string & tensor_name) -> bool { - return std::any_of(std::begin(important_tensors), std::end(important_tensors), [&](const char* imp) { - return tensor_name.find(imp) != std::string::npos; + auto tensor_importance = [&](const std::vector & all_vec) -> std::unordered_map { + std::unordered_map scores; + for (const auto & ti : all_vec) { + const std::string name = ggml_get_name(ti.w->tensor); + float total_score = 0.0f; + float depth_score = 0.0f; + float type_score = 0.0f; + + // Depth component: output, embeddings & early/late layers are important + if (name.find("output.weight") != std::string::npos || + name.find("token_embd.weight") != std::string::npos) { + depth_score = 1.0f; } - ); + else if (name.find(".attn_output.weight") != std::string::npos) { + depth_score = 0.9f; + } else { + static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); + std::smatch match; + if (std::regex_search(name, match, layer_pattern)) { + const int layer = std::stoi(match[1]); + const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); + const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; + depth_score = 0.2f + 0.6f * center_dist; + } + } + + // Type component: certain tensor types are more important + if (name.find("output.weight") != std::string::npos) { + type_score = 1.0f; + } else 
if (name.find(".attn_output.weight") != std::string::npos) { + type_score = 0.9f; + } else if (name.find(".ffn_down.weight") != std::string::npos || + name.find(".ffn_down_shexp.weight") != std::string::npos || + name.find(".ffn_down_exps.weight") != std::string::npos) { + type_score = 0.8f; + } else if (name.find(".attn_q.weight") != std::string::npos || + name.find(".attn_k.weight") != std::string::npos || + name.find(".attn_v.weight") != std::string::npos || + name.find(".attn_qkv.weight") != std::string::npos) { + type_score = 0.7f; + } else if (name.find(".ffn_up.weight") != std::string::npos || + name.find(".ffn_gate.weight") != std::string::npos || + name.find(".ffn_up_shexp.weight") != std::string::npos || + name.find(".ffn_gate_shexp.weight") != std::string::npos || + name.find(".ffn_up_exps.weight") != std::string::npos || + name.find(".ffn_gate_exps.weight") != std::string::npos) { + type_score = 0.6f; + } else if (name.find("token_embd.weight") != std::string::npos) { + type_score = 0.5f; + } + + // Weighted combination + total_score = 0.80f * type_score + 0.20f * depth_score; // 80% type + 20% depth + scores[name] = total_score; + } + + return scores; + }; + + auto select_tensors = [&](const std::vector & all_vec) -> std::unordered_set { + const auto scores = tensor_importance(all_vec); + + // Sort by score + std::vector> sorted_scores(scores.begin(), scores.end()); + std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); + + // Select top percentile + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // top 25% + + std::unordered_set important; + for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { + important.insert(sorted_scores[i].first); + //LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); + } + + LLAMA_LOG_INFO("%s: prioritizing %zu out off %zu tensors\n", func, important.size(), sorted_scores.size()); + return important; + }; + + const auto important_set = select_tensors(all); + + auto is_important = [&](const std::string & tensor_name) -> bool { + return important_set.count(tensor_name) > 0; }; // Lagrangian relaxation to minimise error subject to a bpw target constraint From 0b3e930d5204d3c4be96179835f5378811814247 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 11:41:26 +0100 Subject: [PATCH 127/155] Add option to override bpw state file name --- include/llama.h | 1 + src/llama-quant.cpp | 21 +++++++++++++++++++-- tools/quantize/quantize.cpp | 15 +++++++++++---- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/include/llama.h b/include/llama.h index f745e2110b..ce04011e19 100644 --- a/include/llama.h +++ b/include/llama.h @@ -367,6 +367,7 @@ extern "C" { void * prune_layers; // pointer to vector containing layer indices to prune float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file + void * bpw_state; // pointer to bpw state file } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 38d20e3d0f..1dee52d58d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -762,7 +762,23 @@ static std::unordered_map target_bpw_type( char hex[17]; const uint64_t model_id = metadata_id(ml.meta.get()); std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id); - const std::string checkpoint_file = ml.arch_name + "-" + 
std::string(hex) + ".bpw_state"; + std::string checkpoint_file = ml.arch_name + "-" + std::string(hex) + ".bpw_state"; + if (params->keep_bpw_state && params->bpw_state) { + const auto * filename = static_cast(params->bpw_state); + std::ifstream ifs(filename, std::ios::binary); + if (ifs.good()) { + checkpoint_file = std::string(filename); + } else { + std::ofstream ofs(filename, std::ios::binary | std::ios::app); + if (ofs.is_open()) { + checkpoint_file = std::string(filename); + ofs.close(); + std::remove(checkpoint_file.c_str()); + } else { + LLAMA_LOG_WARN("%s: %s is not a valid file name. Using %s instead\n", func, filename, checkpoint_file.c_str()); + } + } + } auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; @@ -2306,7 +2322,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.tensor_type =*/ nullptr, /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, - /*.keep_bpw_state =*/ false + /*.keep_bpw_state =*/ false, + /*.bpw_state =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index e67649beb9..945acbe288 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,8 +117,8 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); - printf(" [--target-bpw n] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); + printf(" [--bpw-state filename] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); @@ -128,13 +128,14 @@ static void usage(const char * executable) { printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); - printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n"); + printf(" --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. Example: --tensor-type attn_q=q8_0\n"); printf(" Advanced option to selectively quantize tensors. May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). 
Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --keep-bpw-state: preserve the bpw computations in a state file\n"); + printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); + printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); @@ -562,6 +563,12 @@ int main(int argc, char ** argv) { } } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; + } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { + if (arg_idx < argc-1) { + params.bpw_state = argv[++arg_idx]; + } else { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) { usage(argv[0]); From a5103933bb4eec23b71bd8ccaae3b80710a1a82a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 16 Oct 2025 15:11:48 +0100 Subject: [PATCH 128/155] Minor refactoring --- src/llama-quant.cpp | 51 +++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1dee52d58d..b8391a4f2c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -647,7 +647,7 @@ static std::unordered_map target_bpw_type( std::signal(SIGINT, prev_int); std::signal(SIGTERM, prev_term); } - } _signal_guard; + } signal_guard; struct candidate_types { ggml_type type; @@ -683,7 +683,11 @@ static std::unordered_map target_bpw_type( GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, +#ifdef GGML_USE_METAL GGML_TYPE_F16 +#else + GGML_TYPE_BF16 +#endif }; constexpr double epsilon = 1e-12; @@ -1004,17 +1008,30 @@ static std::unordered_map target_bpw_type( // Dequantize into dequantized_buffer { - const ggml_type_traits * traits = ggml_get_type_traits(quant_type); - if (!traits || !traits->to_float) { - if (out_mse) { *out_mse = infinity; } - if (out_proj) { *out_proj = 0.0; } - return infinity; - } - - for (size_t r = 0; r < sample_rows; ++r) { - const uint8_t * src = quantized_buffer.data() + r * row_sz; - float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; - traits->to_float(src, dst, (int)n_per_row); + if (quant_type == GGML_TYPE_F16) { + for (size_t r = 0; r < sample_rows; ++r) { + auto src = (const ggml_fp16_t *)(quantized_buffer.data() + r * row_sz); + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + ggml_fp16_to_fp32_row(src, dst, (int)n_per_row); + } + } else if (quant_type == GGML_TYPE_BF16) { + for (size_t r = 0; r < sample_rows; ++r) { + auto src = (const ggml_bf16_t *)(quantized_buffer.data() + r * row_sz); + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + ggml_bf16_to_fp32_row(src, dst, (int)n_per_row); + } + } else { + const ggml_type_traits * traits = ggml_get_type_traits(quant_type); + if (!traits || !traits->to_float) { + if (out_mse) { *out_mse = infinity; } + if (out_proj) { *out_proj = 0.0; } + return infinity; + } + for (size_t r = 0; r < sample_rows; ++r) { + const uint8_t * src = quantized_buffer.data() + r * row_sz; + float * dst = dequantized_buffer.data() + r * (size_t)n_per_row; + traits->to_float(src, dst, (int)n_per_row); + } } } @@ -1500,13 +1517,11 @@ static 
std::unordered_map target_bpw_type( // Compute total elements across all tensors and bytes for non-quantizable tensors size_t nq_elements = 0; size_t nq_bytes = 0; - for (const auto & it : ml.weights_map) { - const ggml_tensor * tensor = it.second.tensor; - const std::string name = it.first; + for (const auto * it : tensors) { + const ggml_tensor * tensor = it->tensor; + const std::string name = ggml_get_name(tensor); nq_elements += (size_t)ggml_nelements(tensor); - if (!is_quantizable(name, model.arch, params)) { - nq_bytes += ggml_nbytes(tensor); - } + if (!can_quantize(tensor)) { nq_bytes += ggml_nbytes(tensor); } } auto total_bytes = [&]() -> size_t { From fa1df81d49a0512cb4dc6b9b2afc10e7af86bcf2 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 20:52:23 +0100 Subject: [PATCH 129/155] Finetune heuristics --- src/llama-quant.cpp | 51 ++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 91b127789c..5e3893151c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1577,13 +1577,9 @@ static std::unordered_map target_bpw_type( float depth_score = 0.0f; float type_score = 0.0f; - // Depth component: output, embeddings & early/late layers are important - if (name.find("output.weight") != std::string::npos || - name.find("token_embd.weight") != std::string::npos) { + // Depth component: output & early/late layers are important + if (name == "output.weight") { depth_score = 1.0f; - } - else if (name.find(".attn_output.weight") != std::string::npos) { - depth_score = 0.9f; } else { static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); std::smatch match; @@ -1591,38 +1587,40 @@ static std::unordered_map target_bpw_type( const int layer = std::stoi(match[1]); const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - depth_score = 0.2f + 0.6f * center_dist; + depth_score = 0.9f * center_dist; } } - // Type component: certain tensor types are more important - if (name.find("output.weight") != std::string::npos) { + // Type component: certain tensor types have more impact on model quality + if (name == "output.weight") { type_score = 1.0f; - } else if (name.find(".attn_output.weight") != std::string::npos) { - type_score = 0.9f; } else if (name.find(".ffn_down.weight") != std::string::npos || - name.find(".ffn_down_shexp.weight") != std::string::npos || name.find(".ffn_down_exps.weight") != std::string::npos) { + type_score = 0.9f; + } else if (name.find(".attn_output.weight") != std::string::npos || + name.find(".time_mix_output.weight") != std::string::npos || + name.find(".attn_o.weight") != std::string::npos) { type_score = 0.8f; + } else if (name.find(".ffn_up.weight") != std::string::npos || + name.find(".ffn_gate.weight") != std::string::npos || + name.find(".ffn_up_exps.weight") != std::string::npos || + name.find(".ffn_gate_exps.weight") != std::string::npos) { + type_score = 0.3f; } else if (name.find(".attn_q.weight") != std::string::npos || name.find(".attn_k.weight") != std::string::npos || name.find(".attn_v.weight") != std::string::npos || name.find(".attn_qkv.weight") != std::string::npos) { - type_score = 0.7f; - } else if (name.find(".ffn_up.weight") != std::string::npos || - name.find(".ffn_gate.weight") != std::string::npos || - name.find(".ffn_up_shexp.weight") != std::string::npos || - name.find(".ffn_gate_shexp.weight") != std::string::npos 
|| - name.find(".ffn_up_exps.weight") != std::string::npos || - name.find(".ffn_gate_exps.weight") != std::string::npos) { - type_score = 0.6f; + type_score = 0.2f; } else if (name.find("token_embd.weight") != std::string::npos) { - type_score = 0.5f; + type_score = 0.1f; } // Weighted combination - total_score = 0.80f * type_score + 0.20f * depth_score; // 80% type + 20% depth - scores[name] = total_score; + total_score = 0.8f * type_score + 0.2f * depth_score; // 80% type + 20% depth + if (total_score != 0.0f) { + scores[name] = total_score; + LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); + } } return scores; @@ -1636,15 +1634,16 @@ static std::unordered_map target_bpw_type( std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // top 25% + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // bump top 25% std::unordered_set important; for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { important.insert(sorted_scores[i].first); - //LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); + LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); } - LLAMA_LOG_INFO("%s: prioritizing %zu out off %zu tensors\n", func, important.size(), sorted_scores.size()); + const auto pct = 100.0 * (double)important.size() / (double)sorted_scores.size(); + LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct); return important; }; From 00ddf039b306882a8a15761624bcdd673f666f71 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 21:38:49 +0100 Subject: [PATCH 130/155] Update usage --- tools/quantize/quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 945acbe288..f994999e59 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -118,7 +118,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp [[noreturn]] static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); - printf(" [--bpw-state filename] [--keep-bpw-state] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n"); From 543b5a99db2b74e2b74cb87a222a25586479bd9b Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 21:57:03 +0100 Subject: [PATCH 131/155] Fix lambda capture --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5e3893151c..e6c9bfa7f0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1421,7 +1421,7 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - auto pareto_convex = [](std::vector & candidates) { + auto pareto_convex = [epsilon](std::vector & candidates) { if (candidates.empty()) { return; } std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { From 27bf25e93c9309b96a151c1d8c4eef8fdad0cb21 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 20 Oct 2025 22:04:35 +0100 Subject: [PATCH 132/155] Fix lambda capture --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e6c9bfa7f0..08f1b30293 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -690,7 +690,7 @@ static std::unordered_map target_bpw_type( #endif }; - constexpr double epsilon = 1e-12; + const double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; @@ -1118,7 +1118,7 @@ static std::unordered_map target_bpw_type( }; // Returns lambda per slice or 0.0 if no activations - auto estimate_lambda = [](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { + auto estimate_lambda = [&](const float * values, const float * activations, const int64_t n_per_row, const int64_t ne2) -> std::vector { const int64_t ns = std::max(1, ne2); std::vector lambdas(ns, 0.0f); if (!activations) { return lambdas; } @@ -1421,7 +1421,7 @@ static std::unordered_map target_bpw_type( } // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve - auto pareto_convex = [epsilon](std::vector & candidates) { + auto pareto_convex = [&](std::vector & candidates) { if (candidates.empty()) { return; } std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { From 04561d5782b930e781627eee5ffcbb6b06e8b558 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 21 Oct 2025 12:53:26 +0100 Subject: [PATCH 133/155] Update epsilon specifier --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 08f1b30293..5280b9a02a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -690,7 +690,7 @@ static std::unordered_map target_bpw_type( #endif }; - const double epsilon = 1e-12; + constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 const char * func = __func__; From d6ccd5649ac6db0ad87156cf92f036737cf82be3 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 25 Oct 2025 12:09:20 +0100 Subject: [PATCH 134/155] Finetune heuristics --- src/llama-quant.cpp | 83 ++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5280b9a02a..617c7d9473 
100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -838,7 +838,7 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; } else { - LLAMA_LOG_INFO("%s: resuming tensor quantization\n", func); + LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); } uint64_t n = 0; @@ -1569,54 +1569,59 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto tensor_importance = [&](const std::vector & all_vec) -> std::unordered_map { + auto tensor_depth = [&](const std::string & name) -> float { + static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); + std::smatch match; + + // Depth component: output, embeddings & early/late layers are important + if (name == "output.weight" || name == "token_embd.weight") { + return 1.0f; + } + if (std::regex_search(name, match, layer_pattern)) { + const int layer = std::stoi(match[1]); + const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); + const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; + return 0.01f + 0.9f * center_dist; + } + + return 0.0f; + }; + + auto tensor_importance = [&](const std::vector & all_tensors) -> std::unordered_map { std::unordered_map scores; - for (const auto & ti : all_vec) { - const std::string name = ggml_get_name(ti.w->tensor); + for (const auto & t : all_tensors) { + const std::string name = ggml_get_name(t.w->tensor); float total_score = 0.0f; float depth_score = 0.0f; float type_score = 0.0f; - // Depth component: output & early/late layers are important - if (name == "output.weight") { - depth_score = 1.0f; - } else { - static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); - std::smatch match; - if (std::regex_search(name, match, layer_pattern)) { - const int layer = std::stoi(match[1]); - const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); - const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - depth_score = 0.9f * center_dist; - } - } - // Type component: certain tensor types have more impact on model quality + const std::vector>> tensor_scores = { + {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}}, + {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}}, + {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}}, + {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}}, + {0.2f, {"token_embd.weight"}} + }; if (name == "output.weight") { type_score = 1.0f; - } else if (name.find(".ffn_down.weight") != std::string::npos || - name.find(".ffn_down_exps.weight") != std::string::npos) { - type_score = 0.9f; - } else if (name.find(".attn_output.weight") != std::string::npos || - name.find(".time_mix_output.weight") != std::string::npos || - name.find(".attn_o.weight") != std::string::npos) { - type_score = 0.8f; - } else if (name.find(".ffn_up.weight") != std::string::npos || - name.find(".ffn_gate.weight") != std::string::npos || - name.find(".ffn_up_exps.weight") != std::string::npos || - name.find(".ffn_gate_exps.weight") != std::string::npos) { - type_score = 0.3f; - } else if (name.find(".attn_q.weight") != std::string::npos || - name.find(".attn_k.weight") != std::string::npos || - name.find(".attn_v.weight") != std::string::npos || - name.find(".attn_qkv.weight") != std::string::npos) { - type_score = 0.2f; - } else if (name.find("token_embd.weight") != 
std::string::npos) { - type_score = 0.1f; + } else { + for (const auto& ts : tensor_scores) { + const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) { + return name.find(pattern) != std::string::npos; + }); + if (found) { + type_score = ts.first; + break; + } + } + } + if (type_score > 0.0f) { + depth_score = tensor_depth(name); } // Weighted combination - total_score = 0.8f * type_score + 0.2f * depth_score; // 80% type + 20% depth + total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth if (total_score != 0.0f) { scores[name] = total_score; LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); @@ -1634,7 +1639,7 @@ static std::unordered_map target_bpw_type( std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.25f)); // bump top 25% + const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front std::unordered_set important; for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { From 5303212324c90745eb82c3e5f5abb32b184cb7fa Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 26 Oct 2025 17:40:52 +0000 Subject: [PATCH 135/155] Simplify tensor selection --- src/llama-quant.cpp | 99 +++++---------------------------------------- 1 file changed, 11 insertions(+), 88 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 617c7d9473..04f4ff341a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -11,11 +11,12 @@ #include #include #include +#include +#include #include #include #include #include -#include #include // Quantization types. 
Changes to this struct must be replicated in quantize.cpp @@ -1151,7 +1152,7 @@ static std::unordered_map target_bpw_type( const auto bpw_data = load_bpw_state(); - // Reduce compute time by parallelising tensor processing - courtesy of https://github.com/ddh0 + // Parallelize tensor processing - courtesy of https://github.com/ddh0 auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, @@ -1569,93 +1570,15 @@ static std::unordered_map target_bpw_type( return emit_overrides(); } - auto tensor_depth = [&](const std::string & name) -> float { - static const std::regex layer_pattern(R"(blk\.(\d+)\.)"); - std::smatch match; - - // Depth component: output, embeddings & early/late layers are important - if (name == "output.weight" || name == "token_embd.weight") { - return 1.0f; - } - if (std::regex_search(name, match, layer_pattern)) { - const int layer = std::stoi(match[1]); - const float normalized_layer = (float)layer / (float)std::max(1, (int)model.hparams.n_layer - 1); - const float center_dist = std::abs(normalized_layer - 0.5f) * 2.0f; - return 0.01f + 0.9f * center_dist; - } - - return 0.0f; - }; - - auto tensor_importance = [&](const std::vector & all_tensors) -> std::unordered_map { - std::unordered_map scores; - for (const auto & t : all_tensors) { - const std::string name = ggml_get_name(t.w->tensor); - float total_score = 0.0f; - float depth_score = 0.0f; - float type_score = 0.0f; - - // Type component: certain tensor types have more impact on model quality - const std::vector>> tensor_scores = { - {0.9f, {".ffn_down.weight", ".ffn_down_exps.weight"}}, - {0.89f, {".attn_output.weight", ".time_mix_output.weight", ".attn_o.weight"}}, - {0.3f, {".ffn_up.weight", ".ffn_gate.weight", ".ffn_up_exps.weight", ".ffn_gate_exps.weight"}}, - {0.29f, {".attn_q.weight", ".attn_k.weight", ".attn_v.weight", ".attn_qkv.weight"}}, - {0.2f, {"token_embd.weight"}} - }; - if (name == "output.weight") { - type_score = 1.0f; - } else { - for (const auto& ts : tensor_scores) { - const bool found = std::any_of(ts.second.begin(), ts.second.end(), [&](const char* pattern) { - return name.find(pattern) != std::string::npos; - }); - if (found) { - type_score = ts.first; - break; - } - } - } - if (type_score > 0.0f) { - depth_score = tensor_depth(name); - } - - // Weighted combination - total_score = 0.90f * type_score + 0.10f * depth_score; // 90% type + 10% depth - if (total_score != 0.0f) { - scores[name] = total_score; - LLAMA_LOG_DEBUG("\t%s: \t %45s \t depth score %.4f \t type score %.4f \t total score %.4f\n", func, name.c_str(), depth_score, type_score, total_score); - } - } - - return scores; - }; - - auto select_tensors = [&](const std::vector & all_vec) -> std::unordered_set { - const auto scores = tensor_importance(all_vec); - - // Sort by score - std::vector> sorted_scores(scores.begin(), scores.end()); - std::sort(sorted_scores.begin(), sorted_scores.end(), [](const auto & a, const auto & b) { return a.second > b.second; }); - - // Select top percentile - const size_t n_important = std::max(1, std::llround((double)sorted_scores.size() * 0.29f)); // 29% seems to be the pareto front - - std::unordered_set important; - for (size_t i = 0; i < std::min(n_important, sorted_scores.size()); ++i) { - important.insert(sorted_scores[i].first); - LLAMA_LOG_DEBUG("\t%s: important tensor %s (score %.4f)\n", func, sorted_scores[i].first.c_str(), sorted_scores[i].second); - } - - const auto pct = 100.0 * (double)important.size() 
/ (double)sorted_scores.size(); - LLAMA_LOG_INFO("%s: prioritizing %zu out of %zu tensors (%.2f%%)\n", func, important.size(), sorted_scores.size(), pct); - return important; - }; - - const auto important_set = select_tensors(all); - + // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - return important_set.count(tensor_name) > 0; + const auto important = tensor_name == "output.weight" || + tensor_name.find(".ffn_down.weight") != std::string::npos || + tensor_name.find(".ffn_down_exps.weight") != std::string::npos || + tensor_name.find(".attn_output.weight") != std::string::npos || + tensor_name.find(".time_mix_output.weight") != std::string::npos || + tensor_name.find(".attn_o.weight") != std::string::npos; + return important; }; // Lagrangian relaxation to minimise error subject to a bpw target constraint From f8863b9a80822bb58e7406fd35d4452a97c4639a Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 28 Oct 2025 15:22:32 +0000 Subject: [PATCH 136/155] Minor refactoring --- src/llama-quant.cpp | 48 ++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 04f4ff341a..fdce1f4285 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -694,6 +694,7 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); constexpr uint32_t file_magic = 0x42505731; // BPW1 + constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d; const char * func = __func__; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { @@ -731,7 +732,7 @@ static std::unordered_map target_bpw_type( auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { if (is_compatible(t, typ)) { return typ; } - ggml_type fb = fallback_type(typ); + const ggml_type fb = fallback_type(typ); return is_compatible(t, fb) ? fb : GGML_TYPE_F16; }; @@ -754,7 +755,7 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < n; ++i) { h = (h << 5) + h + data[i]; } - return h ? h : 0xeabada55cafed00d; + return h ? 
h : arbitrary_magic; }; auto metadata_id = [&](const gguf_context * ctx) -> uint64_t { @@ -795,7 +796,7 @@ static std::unordered_map target_bpw_type( ofs.write((const char *)&n, sizeof(n)); for (const auto & ti : all_vec) { const std::string name = ggml_get_name(ti.w->tensor); - const uint32_t len = (uint32_t)name.size(); + const auto len = (uint32_t)name.size(); ofs.write((const char *)&len, sizeof(len)); ofs.write(name.data(), len); @@ -835,13 +836,14 @@ static std::unordered_map target_bpw_type( if (magic != file_magic) { LLAMA_LOG_WARN("%s: invalid resume file, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else if (id != model_id) { + } + if (id != model_id) { LLAMA_LOG_WARN("%s: model ID mismatch, ignoring: %s\n", func, checkpoint_file.c_str()); return out; - } else { - LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); } + LLAMA_LOG_INFO("%s: state file found, resuming tensor quantization\n", func); + uint64_t n = 0; ifs.read((char *)&n, sizeof(n)); for (uint64_t i = 0; i < n; ++i) { @@ -862,15 +864,15 @@ static std::unordered_map target_bpw_type( si.n_elements = (size_t)ne; si.candidate.resize(cn); - for (size_t j = 0; j < si.candidate.size(); ++j) { + for (auto & s : si.candidate) { int32_t t = 0; uint64_t b = 0; ifs.read((char *)&t, sizeof(t)); - si.candidate[j].type = (ggml_type)t; - ifs.read((char *)&si.candidate[j].bpw, sizeof(si.candidate[j].bpw)); + s.type = (ggml_type)t; + ifs.read((char *)&s.bpw, sizeof(s.bpw)); ifs.read((char *)&b, sizeof(b)); - si.candidate[j].bytes = (size_t)b; - ifs.read((char *)&si.candidate[j].error, sizeof(si.candidate[j].error)); + s.bytes = (size_t)b; + ifs.read((char *)&s.error, sizeof(s.error)); } out.emplace(std::move(name), std::move(si)); @@ -886,7 +888,6 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_INFO("%s: deleting %s\n", func, checkpoint_file.c_str()); std::remove(checkpoint_file.c_str()); } - }; auto check_signal_handler = [&](const std::vector & all_vec) { @@ -1198,10 +1199,10 @@ static std::unordered_map target_bpw_type( // Compute rows based on tensor shape and slice count auto sample_rows = [](const int64_t n, const int64_t rows, const int64_t n2, const bool has_acts) -> int64_t { const double tensor_budget = has_acts ? 1 * 1024 * 1024 : 0.5 * 1024 * 1024; - const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large nrt + const double scale_rows = std::clamp(std::sqrt(std::max(1.0, (double)rows) / 4096.0), 0.5, 2.0); // favour more rows for large tensors const double slice_budget = tensor_budget * scale_rows / std::max(1, n2); const int64_t min_rows = has_acts ? 
128 : 64; - const int64_t max_rows = 4096; + constexpr int64_t max_rows = 4096; // row limit to avoid excessive memory use int64_t total_rows = std::llround(slice_budget / std::max(1, n)); total_rows = std::max(min_rows, std::min(total_rows, std::min(rows, max_rows))); if (rows <= min_rows * 2) { total_rows = rows; } @@ -1246,7 +1247,7 @@ static std::unordered_map target_bpw_type( f32_sample.clear(); std::vector row_buffer(n_per_row); for (int64_t slice = 0; slice < ne2; ++slice) { - std::mt19937 rng(std::hash{}(name) ^ 0xeabada55cafed00d ^ slice); + std::mt19937 rng(std::hash{}(name) ^ arbitrary_magic ^ slice); const int64_t rows_sample_max = std::max(1, std::min(nrows_total, rows_sample_per_expert)); const int64_t stride = std::max(1, nrows_total / rows_sample_max); int64_t offset = 0; @@ -1411,8 +1412,6 @@ static std::unordered_map target_bpw_type( if (c.bytes == 0) { continue; } const double final_err = bias_needed ? c.error : c.mse; info.candidate.push_back(candidate_types{ c.type, c.bpw, c.bytes, final_err, c.mse, c.proj }); - // LLAMA_LOG_INFO("\t%s: %35s \t%10s \t%1.4f bpw \t%10zu bytes \t mse: %1.8e \t err: %1.8e\n", - // func, name.c_str(), ggml_type_name(c.type), c.bpw, c.bytes, c.mse, final_err); } if (info.candidate.empty()) { @@ -1445,16 +1444,15 @@ static std::unordered_map target_bpw_type( if (candidates.size() < 3) { return; } // need at least 3 points to do convex hull // Convex hull (lower envelope) + auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { + const double dx1 = (double)h1.bytes - (double)h0.bytes; + const double dy1 = h1.error - h0.error; + const double dx2 = (double)p.bytes - (double)h0.bytes; + const double dy2 = p.error - h0.error; + return dx1 * dy2 - dx2 * dy1; + }; std::vector hull; hull.reserve(candidates.size()); for (const auto & c : candidates) { - auto cross_product = [](const candidate_types & h0, const candidate_types & h1, const candidate_types & p) -> double { - const double dx1 = (double)h1.bytes - (double)h0.bytes; - const double dy1 = h1.error - h0.error; - const double dx2 = (double)p.bytes - (double)h0.bytes; - const double dy2 = p.error - h0.error; - return dx1 * dy2 - dx2 * dy1; - }; - while (hull.size() >= 2) { if (cross_product(hull[hull.size() - 2], hull[hull.size() - 1], c) <= epsilon) { hull.pop_back(); From 6e32244a06b1ffe513b1694ee647e92c09904dac Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 30 Oct 2025 21:53:07 +0000 Subject: [PATCH 137/155] Read statistics from imatrix --- include/llama.h | 1 + src/llama-quant.cpp | 28 ++++++++++----- tools/quantize/quantize.cpp | 68 +++++++++++++++++++++++++++++-------- 3 files changed, 75 insertions(+), 22 deletions(-) diff --git a/include/llama.h b/include/llama.h index ce04011e19..517ef5e0fb 100644 --- a/include/llama.h +++ b/include/llama.h @@ -368,6 +368,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file + void * statistics; // pointer to statistics data } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fdce1f4285..a8153494f9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -631,6 +631,7 @@ static std::unordered_map target_bpw_type( const std::map & mapped, const std::unordered_map> * values_data, const std::unordered_map> * activations_data, + const std::unordered_map> * statistics_data, const llama_model_quantize_params * 
params, int nthread ) { @@ -1815,6 +1816,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } const std::unordered_map> * values_data = nullptr; const std::unordered_map> * activations_data = nullptr; + const std::unordered_map> * statistics_data = nullptr; if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { @@ -1845,6 +1847,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } + if (params->statistics) { + statistics_data = static_cast>*>(params->statistics); + if (statistics_data) { + LLAMA_LOG_INFO(" and %d statistics",int(statistics_data->size())); + } + } LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -1999,15 +2007,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - if (params->activations) { - LLAMA_LOG_INFO("%s: imatrix with activations provided, target bpw quantization will be more accurate\n",__func__); - } else { - LLAMA_LOG_WARN("%s: imatrix without activations provided, target bpw quantization will be less accurate\n", __func__); - } + const char* base_msg = params->activations + ? (params->statistics + ? "imatrix with activations and statistics provided, process will be more accurate\n" + : "imatrix with activations provided, process will be accurate\n") + : "imatrix without activations provided, process will be less accurate\n"; + if (params->activations) { LLAMA_LOG_INFO("%s: %s", __func__, base_msg); } + else { LLAMA_LOG_WARN("%s: %s", __func__, base_msg); } + LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); + bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, statistics_data, params, nthread); } else { - LLAMA_LOG_WARN("%s: no imatrix provided, target bpw will not apply\n", __func__); + LLAMA_LOG_WARN("%s: --target-bpw requires an imatrix but none was provided, option will be ignored\n", __func__); } } @@ -2269,7 +2280,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.prune_layers =*/ nullptr, /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, - /*.bpw_state =*/ nullptr + /*.bpw_state =*/ nullptr, + /*.statistics =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index f994999e59..0b2b05b60a 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -221,7 +221,8 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & values_data, - std::unordered_map> & activations_data) { + std::unordered_map> & activations_data, + std::unordered_map> & statistics_data) { struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -256,24 +257,28 @@ static int load_imatrix(const std::string & imatrix_file, const std::string sums_suffix{ ".in_sum" }; const std::string sums2_suffix{ ".in_sum2" }; const std::string counts_suffix{ ".counts" }; + const std::string stats_suffix{ ".stats" }; // Using an ordered map to get a deterministic iteration order. 
- std::map> sums_counts_for; + std::map> sums_counts_for; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { std::string name = cur->name; if (name.empty()) { continue; } - if (string_remove_suffix(name, sums2_suffix)) { - // in_sum2 + if (string_remove_suffix(name, sums_suffix)) { + // in_sum std::get<0>(sums_counts_for[std::move(name)]) = cur; + } else if (string_remove_suffix(name, sums2_suffix)) { + // in_sum2 + std::get<1>(sums_counts_for[std::move(name)]) = cur; } else if (string_remove_suffix(name, counts_suffix)) { // counts - std::get<1>(sums_counts_for[std::move(name)]) = cur; - } else if (string_remove_suffix(name, sums_suffix)) { - // in_sum std::get<2>(sums_counts_for[std::move(name)]) = cur; + } else if (string_remove_suffix(name, stats_suffix)) { + // stats + std::get<3>(sums_counts_for[std::move(name)]) = cur; } else { // ignore other tensors @@ -282,11 +287,12 @@ static int load_imatrix(const std::string & imatrix_file, for (const auto & sc : sums_counts_for) { const std::string & name = sc.first; - const struct ggml_tensor * sums = std::get<2>(sc.second); - const struct ggml_tensor * sums2 = std::get<0>(sc.second); - const struct ggml_tensor * counts = std::get<1>(sc.second); + const struct ggml_tensor * sums = std::get<0>(sc.second); + const struct ggml_tensor * sums2 = std::get<1>(sc.second); + const struct ggml_tensor * counts = std::get<2>(sc.second); + const struct ggml_tensor * stats = std::get<3>(sc.second); - // check that sums, sums2 and counts have the same shape + // check sums2 and counts are present, and that sums and sums2 have the same shape if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) { fprintf(stderr, "%s: mismatched sums and counts for %s\n", __func__, name.c_str()); gguf_free(ctx_gguf); @@ -302,6 +308,19 @@ static int load_imatrix(const std::string & imatrix_file, if (sums) { activations.resize(ggml_nelements(sums)); } + if (stats) { + auto & statistics = statistics_data[name]; + statistics.resize(ggml_nelements(stats)); + if (stats->type == GGML_TYPE_F32) { + std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float)); + } else { + fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n", + __func__, ggml_type_name(stats->type), name.c_str()); + statistics.clear(); + statistics_data.erase(name); + } + + } values.resize(ggml_nelements(sums2)); float max_count = 0.0f; for (int64_t j = 0; j < ne1; ++j) { @@ -354,10 +373,11 @@ static int prepare_imatrix(const std::string & imatrix_file, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & values_data, - std::unordered_map> & activations_data) { + std::unordered_map> & activations_data, + std::unordered_map> & statistics_data) { int m_last_call = -1; if (!imatrix_file.empty()) { - m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data); } if (values_data.empty()) { return m_last_call; @@ -380,11 +400,20 @@ static int prepare_imatrix(const std::string & imatrix_file, ++at; } } + for (auto st = statistics_data.begin(); st != statistics_data.end();) { + auto pos = st->first.find(name); + if (pos != std::string::npos) { + st = activations_data.erase(st); + } else { + ++st; + } + } } } if (!included_weights.empty()) { std::unordered_map> tmp_values; std::unordered_map> 
tmp_activations; + std::unordered_map> tmp_statistics; for (const auto & name : included_weights) { for (auto & e : values_data) { auto pos = e.first.find(name); @@ -398,9 +427,16 @@ static int prepare_imatrix(const std::string & imatrix_file, tmp_activations.emplace(std::move(a)); } } + for (auto & s : statistics_data) { + auto pos = s.first.find(name); + if (pos != std::string::npos) { + tmp_statistics.emplace(std::move(s)); + } + } } values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); + statistics_data = std::move(tmp_statistics); } return m_last_call; @@ -617,7 +653,8 @@ int main(int argc, char ** argv) { std::vector imatrix_datasets; std::unordered_map> values_data; std::unordered_map> activations_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data); + std::unordered_map> statistics_data; + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data); if (!values_data.empty()) { params.imatrix = &values_data; { @@ -657,6 +694,9 @@ int main(int argc, char ** argv) { if (!activations_data.empty()) { params.activations = &activations_data; } + if (!statistics_data.empty()) { + params.statistics = &statistics_data; + } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; From c59bb6d49d025765091d7c83a9b95528395de283 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 30 Oct 2025 22:11:40 +0000 Subject: [PATCH 138/155] Add Euclidean-Cosine score to identify important tensors --- src/llama-quant.cpp | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a8153494f9..957dd5f367 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1571,12 +1571,25 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - const auto important = tensor_name == "output.weight" || - tensor_name.find(".ffn_down.weight") != std::string::npos || - tensor_name.find(".ffn_down_exps.weight") != std::string::npos || - tensor_name.find(".attn_output.weight") != std::string::npos || - tensor_name.find(".time_mix_output.weight") != std::string::npos || - tensor_name.find(".attn_o.weight") != std::string::npos; + bool important = false; + + if (statistics_data) { + float ecs = 0.0f; // Euclidean-Cosine score + const std::string key = remap_imatrix(tensor_name, mapped); + const auto tstats = statistics_data->find(key); + if (tstats != statistics_data->end() && !tstats->second.empty()) { + ecs = tstats->second.front(); + important = ecs == 100.0f; // mark as important if ecs is 100% + } + } else { + important = tensor_name == "output.weight" || + tensor_name.find(".ffn_down.weight") != std::string::npos || + tensor_name.find(".ffn_down_exps.weight") != std::string::npos || + tensor_name.find(".attn_output.weight") != std::string::npos || + tensor_name.find(".time_mix_output.weight") != std::string::npos || + tensor_name.find(".attn_o.weight") != std::string::npos; + } + return important; }; From ac8cfbdd12eb2207098e3bcc4aee9347aa8366bc Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 17 Nov 2025 18:03:09 +0000 Subject: [PATCH 139/155] Improved is_important() logic --- src/llama-quant.cpp | 19 +++++++++++++++---- 1 file changed, 15 
insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 739172c70f..1e8a2cda9c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -635,8 +635,8 @@ static std::unordered_map target_bpw_type( const llama_model_quantize_params * params, int nthread ) { - // RAII guard for signal handlers bpw_stop.store(false, std::memory_order_relaxed); + // Signal handlers struct signal_scope_guard { using handler_t = void (*)(int); handler_t prev_int = SIG_DFL; @@ -1574,12 +1574,23 @@ static std::unordered_map target_bpw_type( bool important = false; if (statistics_data) { - float ecs = 0.0f; // Euclidean-Cosine score const std::string key = remap_imatrix(tensor_name, mapped); const auto tstats = statistics_data->find(key); if (tstats != statistics_data->end() && !tstats->second.empty()) { - ecs = tstats->second.front(); - important = ecs == 100.0f; // mark as important if ecs is 100% + float ecs = 0.0f; // Euclidean-Cosine score + float l2 = 0.0f; // L2 Euclidean Distance + float cs = 0.0f; // Cosine Similarity + try { + // ecs = tstats->second.at(0); + l2 = tstats->second.at(1); + cs = tstats->second.at(2); + } catch (std::out_of_range &) { + LLAMA_LOG_ERROR("\t%s: insufficient statistics for tensor %s\n", func, tensor_name.c_str()); + return false; + } + ecs = 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs)); // ecs = 100 - (100 / (1 + (L2 Dist/p)^2) * |Cos Sim|^q) + // LLAMA_LOG_INFO("\t%s: tensor %s has ECS score %.4f (L2 Distance %.4f and CosSim %.4f\n", func, tensor_name.c_str(), ecs, l2, cs); + important = ecs >= 99.99f; // mark as important if ecs is >= 99.99% } } else { important = tensor_name == "output.weight" || From a0ba913613235c1639f92877f09e82c3db6fef47 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 19 Nov 2025 11:19:44 +0000 Subject: [PATCH 140/155] Fix lambda capture bug in Windows and initialise candidate_types struct --- src/llama-quant.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 1e8a2cda9c..86ca165b6c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -652,10 +652,10 @@ static std::unordered_map target_bpw_type( } signal_guard; struct candidate_types { - ggml_type type; - float bpw; - size_t bytes; - double error; + ggml_type type = GGML_TYPE_COUNT; + float bpw = 0.0f; + size_t bytes = 0; + double error = 0.0; double mse = 0.0; double proj = 0.0; }; @@ -751,7 +751,7 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; - auto djb2_hash = [](const uint8_t * data, size_t n) -> uint64_t { + auto djb2_hash = [&](const uint8_t * data, const size_t n) -> uint64_t { uint64_t h = 5381; for (size_t i = 0; i < n; ++i) { h = (h << 5) + h + data[i]; From 9ec3e6e2629d294e7ae95ee58634c360475e67d7 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 23 Nov 2025 17:49:53 +0000 Subject: [PATCH 141/155] Remove processing statistics_data --- include/llama.h | 1 - src/llama-quant.cpp | 19 ++---------- tools/quantize/quantize.cpp | 61 ++++++------------------------------- 3 files changed, 12 insertions(+), 69 deletions(-) diff --git a/include/llama.h b/include/llama.h index 3515ee1a13..c82a4147f4 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,7 +369,6 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file - void * statistics; // pointer to statistics data } llama_model_quantize_params; typedef struct llama_logit_bias { 
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 86ca165b6c..99759a27c8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -631,7 +631,6 @@ static std::unordered_map target_bpw_type( const std::map & mapped, const std::unordered_map> * values_data, const std::unordered_map> * activations_data, - const std::unordered_map> * statistics_data, const llama_model_quantize_params * params, int nthread ) { @@ -1840,7 +1839,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } const std::unordered_map> * values_data = nullptr; const std::unordered_map> * activations_data = nullptr; - const std::unordered_map> * statistics_data = nullptr; if (params->imatrix) { values_data = static_cast>*>(params->imatrix); if (values_data) { @@ -1871,12 +1869,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (params->statistics) { - statistics_data = static_cast>*>(params->statistics); - if (statistics_data) { - LLAMA_LOG_INFO(" and %d statistics",int(statistics_data->size())); - } - } LLAMA_LOG_INFO("\n"); gguf_context_ptr ctx_out { gguf_init_empty() }; @@ -2031,16 +2023,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - const char* base_msg = params->activations - ? (params->statistics - ? "imatrix with activations and statistics provided, process will be more accurate\n" - : "imatrix with activations provided, process will be accurate\n") - : "imatrix without activations provided, process will be less accurate\n"; - if (params->activations) { LLAMA_LOG_INFO("%s: %s", __func__, base_msg); } - else { LLAMA_LOG_WARN("%s: %s", __func__, base_msg); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); - bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, statistics_data, params, nthread); + + bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); } else { LLAMA_LOG_WARN("%s: --target-bpw requires an imatrix but none was provided, option will be ignored\n", __func__); } @@ -2305,7 +2291,6 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, - /*.statistics =*/ nullptr }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 0b2b05b60a..aabcd73986 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -221,8 +221,7 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & values_data, - std::unordered_map> & activations_data, - std::unordered_map> & statistics_data) { + std::unordered_map> & activations_data) { struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { @@ -257,10 +256,9 @@ static int load_imatrix(const std::string & imatrix_file, const std::string sums_suffix{ ".in_sum" }; const std::string sums2_suffix{ ".in_sum2" }; const std::string counts_suffix{ ".counts" }; - const std::string stats_suffix{ ".stats" }; // Using an ordered map to get a deterministic iteration order. 
- std::map> sums_counts_for; + std::map> sums_counts_for; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { std::string name = cur->name; @@ -276,11 +274,7 @@ static int load_imatrix(const std::string & imatrix_file, } else if (string_remove_suffix(name, counts_suffix)) { // counts std::get<2>(sums_counts_for[std::move(name)]) = cur; - } else if (string_remove_suffix(name, stats_suffix)) { - // stats - std::get<3>(sums_counts_for[std::move(name)]) = cur; - } - else { + } else { // ignore other tensors } } @@ -290,7 +284,6 @@ static int load_imatrix(const std::string & imatrix_file, const struct ggml_tensor * sums = std::get<0>(sc.second); const struct ggml_tensor * sums2 = std::get<1>(sc.second); const struct ggml_tensor * counts = std::get<2>(sc.second); - const struct ggml_tensor * stats = std::get<3>(sc.second); // check sums2 and counts are present, and that sums and sums2 have the same shape if (!sums2 || !counts || (sums != nullptr && ggml_nelements(sums) != ggml_nelements(sums2))) { @@ -308,19 +301,6 @@ static int load_imatrix(const std::string & imatrix_file, if (sums) { activations.resize(ggml_nelements(sums)); } - if (stats) { - auto & statistics = statistics_data[name]; - statistics.resize(ggml_nelements(stats)); - if (stats->type == GGML_TYPE_F32) { - std::memcpy(statistics.data(), stats->data, ggml_nelements(stats) * sizeof(float)); - } else { - fprintf(stderr, "%s: unsupported .stats type '%s' for '%s' - ignoring entry\n", - __func__, ggml_type_name(stats->type), name.c_str()); - statistics.clear(); - statistics_data.erase(name); - } - - } values.resize(ggml_nelements(sums2)); float max_count = 0.0f; for (int64_t j = 0; j < ne1; ++j) { @@ -373,23 +353,22 @@ static int prepare_imatrix(const std::string & imatrix_file, const std::vector & included_weights, const std::vector & excluded_weights, std::unordered_map> & values_data, - std::unordered_map> & activations_data, - std::unordered_map> & statistics_data) { + std::unordered_map> & activations_data) { int m_last_call = -1; if (!imatrix_file.empty()) { - m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data, statistics_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, values_data, activations_data); } if (values_data.empty()) { return m_last_call; } if (!excluded_weights.empty()) { for (const auto & name : excluded_weights) { - for (auto it = values_data.begin(); it != values_data.end();) { - auto pos = it->first.find(name); + for (auto vt = values_data.begin(); vt != values_data.end();) { + auto pos = vt->first.find(name); if (pos != std::string::npos) { - it = values_data.erase(it); + vt = values_data.erase(vt); } else { - ++it; + ++vt; } } for (auto at = activations_data.begin(); at != activations_data.end();) { @@ -400,20 +379,11 @@ static int prepare_imatrix(const std::string & imatrix_file, ++at; } } - for (auto st = statistics_data.begin(); st != statistics_data.end();) { - auto pos = st->first.find(name); - if (pos != std::string::npos) { - st = activations_data.erase(st); - } else { - ++st; - } - } } } if (!included_weights.empty()) { std::unordered_map> tmp_values; std::unordered_map> tmp_activations; - std::unordered_map> tmp_statistics; for (const auto & name : included_weights) { for (auto & e : values_data) { auto pos = e.first.find(name); @@ -427,16 +397,9 @@ static int prepare_imatrix(const std::string & imatrix_file, tmp_activations.emplace(std::move(a)); } } - for (auto & s : statistics_data) { - auto 
pos = s.first.find(name); - if (pos != std::string::npos) { - tmp_statistics.emplace(std::move(s)); - } - } } values_data = std::move(tmp_values); activations_data = std::move(tmp_activations); - statistics_data = std::move(tmp_statistics); } return m_last_call; @@ -653,8 +616,7 @@ int main(int argc, char ** argv) { std::vector imatrix_datasets; std::unordered_map> values_data; std::unordered_map> activations_data; - std::unordered_map> statistics_data; - int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data, statistics_data); + int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, values_data, activations_data); if (!values_data.empty()) { params.imatrix = &values_data; { @@ -694,9 +656,6 @@ int main(int argc, char ** argv) { if (!activations_data.empty()) { params.activations = &activations_data; } - if (!statistics_data.empty()) { - params.statistics = &statistics_data; - } if (!kv_overrides.empty()) { kv_overrides.emplace_back(); kv_overrides.back().key[0] = 0; From 1c9993e13198a28db1b5a8e7cd0fcb5d6bcf89eb Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 23 Nov 2025 17:51:04 +0000 Subject: [PATCH 142/155] Add --disable-tensor-importance option --- include/llama.h | 1 + src/llama-quant.cpp | 39 ++++++++++++++----------------------- tools/quantize/quantize.cpp | 4 ++++ 3 files changed, 20 insertions(+), 24 deletions(-) diff --git a/include/llama.h b/include/llama.h index c82a4147f4..1f5b2e8a2b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,6 +369,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file + bool disable_tensor_importance; // treat all tensors equally during quantization } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 99759a27c8..2b9aba091b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1570,29 +1570,10 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { - bool important = false; - - if (statistics_data) { - const std::string key = remap_imatrix(tensor_name, mapped); - const auto tstats = statistics_data->find(key); - if (tstats != statistics_data->end() && !tstats->second.empty()) { - float ecs = 0.0f; // Euclidean-Cosine score - float l2 = 0.0f; // L2 Euclidean Distance - float cs = 0.0f; // Cosine Similarity - try { - // ecs = tstats->second.at(0); - l2 = tstats->second.at(1); - cs = tstats->second.at(2); - } catch (std::out_of_range &) { - LLAMA_LOG_ERROR("\t%s: insufficient statistics for tensor %s\n", func, tensor_name.c_str()); - return false; - } - ecs = 100.0f - (100.0f / (1.0f + 0.01f * l2 * l2) * std::fabs(cs)); // ecs = 100 - (100 / (1 + (L2 Dist/p)^2) * |Cos Sim|^q) - // LLAMA_LOG_INFO("\t%s: tensor %s has ECS score %.4f (L2 Distance %.4f and CosSim %.4f\n", func, tensor_name.c_str(), ecs, l2, cs); - important = ecs >= 99.99f; // mark as important if ecs is >= 99.99% - } - } else { - important = tensor_name == "output.weight" || + bool important = tensor_name == "output.weight"; + if (!important && !params->disable_tensor_importance) { + important = tensor_name.find(".attn_v.weight") != std::string::npos || + tensor_name.find(".time_mix_value.weight") != std::string::npos || 
tensor_name.find(".ffn_down.weight") != std::string::npos || tensor_name.find(".ffn_down_exps.weight") != std::string::npos || tensor_name.find(".attn_output.weight") != std::string::npos || @@ -2023,7 +2004,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::unordered_map bpw_overrides = {}; if (params->target_bpw != -1.0f && !params->only_copy) { if (params->imatrix) { - + if (params->activations) { + LLAMA_LOG_INFO("%s: imatrix has activations, process will be more accurate\n", __func__); + } else { + LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); + } + if (params->disable_tensor_importance) { + LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__); + } else { + LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__); + } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); bpw_overrides = target_bpw_type(ml, model, tensors, mapped, values_data, activations_data, params, nthread); @@ -2291,6 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, + /*.disable_tensor_importance =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index aabcd73986..4fee8c91a1 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -134,6 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); + printf(" --disable-tensor-importance: treat all tensors equally during bpw quantization\n"); + printf(" Advanced option to disable allocating more bpw budget to important tensors. 
It may increase quality for some models\n"); printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); @@ -560,6 +562,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) { + params.disable_tensor_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { From 661600842096145db52a4c631bfe0303a5d454ee Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 24 Nov 2025 18:26:45 +0000 Subject: [PATCH 143/155] Use more descriptive option naming --- include/llama.h | 2 +- src/llama-quant.cpp | 10 +++++----- tools/quantize/quantize.cpp | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/llama.h b/include/llama.h index 1f5b2e8a2b..50e61d4976 100644 --- a/include/llama.h +++ b/include/llama.h @@ -369,7 +369,7 @@ extern "C" { float target_bpw; // target bits per weight (bpw) bool keep_bpw_state; // keep bpw state file void * bpw_state; // pointer to bpw state file - bool disable_tensor_importance; // treat all tensors equally during quantization + bool no_importance; // allocate target bpw budget equitably across all tensors } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2b9aba091b..c468a3e4fc 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1571,7 +1571,7 @@ static std::unordered_map target_bpw_type( // Certain tensors have a higher impact on model quality, so we apply a lower penalty to them auto is_important = [&](const std::string & tensor_name) -> bool { bool important = tensor_name == "output.weight"; - if (!important && !params->disable_tensor_importance) { + if (!important && !params->no_importance) { important = tensor_name.find(".attn_v.weight") != std::string::npos || tensor_name.find(".time_mix_value.weight") != std::string::npos || tensor_name.find(".ffn_down.weight") != std::string::npos || @@ -2009,10 +2009,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { LLAMA_LOG_INFO("%s: imatrix does not have activations, process may be less accurate\n", __func__); } - if (params->disable_tensor_importance) { - LLAMA_LOG_INFO("%s: allocating bpw budget to tensors equally\n", __func__); + if (params->no_importance) { + LLAMA_LOG_INFO("%s: distributing bpw budget equitably across all tensors\n", __func__); } else { - LLAMA_LOG_INFO("%s: allocating more bpw budget to important tensors\n", __func__); + LLAMA_LOG_INFO("%s: assigning more bpw budget to important tensors\n", __func__); } LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.4f bpw\n", __func__, params->target_bpw); @@ -2281,7 +2281,7 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.target_bpw =*/ -1.0f, /*.keep_bpw_state =*/ false, /*.bpw_state =*/ nullptr, - /*.disable_tensor_importance =*/ false + /*.no_importance =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 4fee8c91a1..dd4b860e1b 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -117,9 +117,9 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & 
ftyp [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--target-bpw n]\n", executable); - printf(" [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n"); - printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights]\n", executable); + printf(" [--target-bpw n] [--no-importance] [--keep-bpw-state] [--bpw-state filename] [--output-tensor-type] [--token-embedding-type] [--tensor-type]\n"); + printf(" [--prune-layers] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize: allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: disable k-quant mixtures and quantize all tensors to the same type\n"); @@ -134,8 +134,8 @@ static void usage(const char * executable) { printf(" Advanced option to remove all tensors from the given layers\n"); printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); - printf(" --disable-tensor-importance: treat all tensors equally during bpw quantization\n"); - printf(" Advanced option to disable allocating more bpw budget to important tensors. It may increase quality for some models\n"); + printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); + printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); @@ -562,8 +562,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !parse_target_bpw(argv[++arg_idx], target_bpw)) { usage(argv[0]); } - } else if (strcmp(argv[arg_idx], "--disable-tensor-importance") == 0) { - params.disable_tensor_importance = true; + } else if (strcmp(argv[arg_idx], "--no-importance") == 0) { + params.no_importance = true; } else if (strcmp(argv[arg_idx], "--keep-bpw-state") == 0) { params.keep_bpw_state = true; } else if (strcmp(argv[arg_idx], "--bpw-state") == 0) { From 69a32b6f508a4d0d38f52cf91cc8cd5b42a4bf62 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:28:43 +0000 Subject: [PATCH 144/155] Relax target bpw range --- tools/quantize/quantize.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index dd4b860e1b..ebeea65336 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -132,7 +132,7 @@ static void usage(const char * executable) { printf(" Advanced option to selectively quantize tensors. 
May be specified multiple times.\n"); printf(" --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n"); printf(" Advanced option to remove all tensors from the given layers\n"); - printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 8.0\n"); + printf(" --target-bpw: target bits per weight (bpw). Must be a positive number between 0.0 and 16.0\n"); printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); @@ -485,13 +485,13 @@ static bool parse_target_bpw(const char * data, float & target_bpw) { try { target_bpw = std::stof(data); - if (target_bpw < 0.0f || target_bpw > 8.0f) { - printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__); + if (target_bpw < 0.0f || target_bpw > 16.0f) { + printf("\n%s: target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__); return false; } } catch (const std::exception & e) { - printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 8.0\n\n", __func__, data); + printf("\n%s: '%s' is not valid. Target bits per weight (bpw) must be a positive number between 0.0 and 16.0\n\n", __func__, data); return false; } From 5b557ca958d3b0cb4293e12aafe21135c0c12142 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:30:20 +0000 Subject: [PATCH 145/155] Minor refactoring --- src/llama-quant.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c468a3e4fc..2cb58d46bd 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -635,7 +635,7 @@ static std::unordered_map target_bpw_type( int nthread ) { bpw_stop.store(false, std::memory_order_relaxed); - // Signal handlers + // SIGINT/SIGTERM signal handlers struct signal_scope_guard { using handler_t = void (*)(int); handler_t prev_int = SIG_DFL; @@ -1361,14 +1361,14 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < compatible_candidates.size(); ++i) { if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } - const ggml_type tensor_types = compatible_candidates[i]; - const auto bpw = (float)tensor_bpw(tensor, tensor_types); - const size_t bytes = tensor_bytes(tensor, tensor_types); + const ggml_type tensor_type = compatible_candidates[i]; + const auto bpw = (float)tensor_bpw(tensor, tensor_type); + const size_t bytes = tensor_bytes(tensor, tensor_type); double mse = 0.0; double proj = 0.0; - const auto err = estimate_error(tensor, tensor_types, f32_sample, rows_sample, values, activations, + const auto err = estimate_error(tensor, tensor_type, f32_sample, rows_sample, values, activations, quantized_buffer, dequantized_buffer, tensor_lambda, slice_lambda, &mse, &proj); - eval_candidates[i] = candidate_types{ tensor_types, bpw, bytes, err, mse, proj }; + eval_candidates[i] = candidate_types{ tensor_type, bpw, bytes, err, mse, proj }; } if (bpw_stop.load(std::memory_order_relaxed)) { return std::nullopt; } From 229109f329c498078f84da39b2c1ebb807e60646 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 10:31:39 +0000 Subject: [PATCH 146/155] Increase importance boost for final pass --- src/llama-quant.cpp | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2cb58d46bd..44f84ec949 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1714,7 +1714,7 @@ static std::unordered_map target_bpw_type( if (err_gain < epsilon) { continue; } // no error improvement double ratio = err_gain / (double)delta_bytes; // error reduction per byte - if (is_important(tensor_name)) { ratio *= 2.0; } // important tensors get 2x boost + if (is_important(tensor_name)) { ratio *= 5.0; } // important tensors get 5x boost // For tie-breaking, prioritize the largest absolute error improvement. if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) { From b97cda628960d66a9fcc301062a1dc3925feae9f Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 29 Nov 2025 23:52:51 +0000 Subject: [PATCH 147/155] Add B/F16 to get_ftype() --- tools/quantize/quantize.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index ebeea65336..a1426ea4a3 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -512,7 +512,12 @@ static const char * get_ftype(const float bpw) { {4.5000, "Q4_K"}, {5.5000, "Q5_K"}, {6.5625, "Q6_K"}, - {8.5000, "Q8_0"} + {8.5000, "Q8_0"}, +#ifdef GGML_USE_METAL + {16.0000, "F16"} +#else + {16.0000, "BF16"} +#endif }; return quant_bpw.lower_bound(bpw)->second; From 37cf51ebd032e63c7901835cdd85a0e7e9109e25 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sun, 30 Nov 2025 00:29:35 +0000 Subject: [PATCH 148/155] Process bpw targets up to B/F16 --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 44f84ec949..6c6926dee8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -2089,7 +2089,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. - if (!params->pure && ggml_is_quantized(default_type)) { + if (!params->pure && (ggml_is_quantized(default_type) || params->target_bpw != -1.0f)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); From 5f7bba78288c2ef33d45adcd82141d70157eb402 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Dec 2025 15:47:18 +0000 Subject: [PATCH 149/155] Improve state checkpoint filename --- src/llama-quant.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 33b7f7e584..3d4785c1a3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -765,10 +765,18 @@ static std::unordered_map target_bpw_type( return djb2_hash(buf.data(), buf.size()); }; + std::string gen_name; + std::string checkpoint_file; char hex[17]; const uint64_t model_id = metadata_id(ml.meta.get()); + std::snprintf(hex, sizeof(hex), "%016" PRIx64, (uint64_t)model_id); - std::string checkpoint_file = ml.arch_name + "-" + std::string(hex) + ".bpw_state"; + ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false); + std::replace(gen_name.begin(), gen_name.end(), ' ', '_'); + + gen_name.empty() ? 
checkpoint_file = ml.arch_name : checkpoint_file = gen_name; + checkpoint_file += "-" + std::string(hex) + ".bpw_state"; + if (params->keep_bpw_state && params->bpw_state) { const auto * filename = static_cast(params->bpw_state); std::ifstream ifs(filename, std::ios::binary); From b6d718a4a6b789bf0f944ff5a9a4ff82e985fe38 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Dec 2025 15:47:44 +0000 Subject: [PATCH 150/155] Add code comments --- src/llama-quant.cpp | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3d4785c1a3..cab4ecaeec 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -623,7 +623,7 @@ static void signal_handler(int) { bpw_stop.store(true, std::memory_order_relaxed); } -// Returns tensor type overrides to meet a global bpw target +// Returns tensor type overrides that meet a global bpw target static std::unordered_map target_bpw_type( llama_model_loader & ml, const llama_model & model, @@ -650,6 +650,7 @@ static std::unordered_map target_bpw_type( } } signal_guard; + // Error and bias projection per GGML_TYPE per tensor struct candidate_types { ggml_type type = GGML_TYPE_COUNT; float bpw = 0.0f; @@ -659,6 +660,7 @@ static std::unordered_map target_bpw_type( double proj = 0.0; }; + // Per‑tensor quantization mix that satisfies a global bpw target struct tensor_info { const llama_model_loader::llama_tensor_weight * w = nullptr; std::vector candidate; @@ -697,22 +699,33 @@ static std::unordered_map target_bpw_type( constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d; const char * func = __func__; + // Tensor size in bytes for a given type auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { const int64_t n_per_row = t->ne[0]; const size_t row_sz = ggml_row_size(typ, n_per_row); return (size_t)ggml_nrows(t) * row_sz; }; + // Tensor bpw for a given type auto tensor_bpw = [&](const ggml_tensor * t, const ggml_type typ) -> double { const size_t bytes = tensor_bytes(t, typ); return (double)bytes * 8.0 / (double)ggml_nelements(t); }; + // Check if tensor is compatible with quantization type auto is_compatible = [](const ggml_tensor * t, const ggml_type typ) -> bool { const int64_t blck = ggml_blck_size(typ); return blck <= 1 || (t->ne[0] % blck) == 0; }; + // Get suitable fallback for type + auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { + if (is_compatible(t, typ)) { return typ; } + const ggml_type fb = fallback_type(typ); + return is_compatible(t, fb) ? fb : GGML_TYPE_F16; + }; + + // Check if tensor is an IQ type auto is_iq = [](const enum ggml_type t) { switch (t) { case GGML_TYPE_IQ1_S: @@ -730,12 +743,7 @@ static std::unordered_map target_bpw_type( } }; - auto make_compatible = [&](const ggml_tensor * t, const ggml_type typ) -> ggml_type { - if (is_compatible(t, typ)) { return typ; } - const ggml_type fb = fallback_type(typ); - return is_compatible(t, fb) ? 
fb : GGML_TYPE_F16; - }; - + // Check if tensor can be quantized auto can_quantize = [&](const ggml_tensor * t) -> bool { if (ggml_n_dims(t) < 2) { return false; } // skip 1D tensors return is_quantizable(ggml_get_name(t), model.arch, params); @@ -750,6 +758,7 @@ static std::unordered_map target_bpw_type( size_t n_elements = 0; }; + // DJB2 hashing algorithm auto djb2_hash = [&](const uint8_t * data, const size_t n) -> uint64_t { uint64_t h = 5381; for (size_t i = 0; i < n; ++i) { @@ -758,6 +767,7 @@ static std::unordered_map target_bpw_type( return h ? h : arbitrary_magic; }; + // Get model ID from metadata hash auto metadata_id = [&](const gguf_context * ctx) -> uint64_t { const size_t sz = gguf_get_meta_size(ctx); std::vector buf(sz); @@ -794,6 +804,7 @@ static std::unordered_map target_bpw_type( } } + // Serializes vector to disk auto save_bpw_state = [&](const std::vector & all_vec) { const std::string tmp = checkpoint_file + ".tmp"; std::ofstream ofs(tmp, std::ios::binary | std::ios::trunc); @@ -832,6 +843,7 @@ static std::unordered_map target_bpw_type( LLAMA_LOG_INFO("%s: saved progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); }; + // Deserializes vector from disk auto load_bpw_state = [&]() -> std::unordered_map { std::unordered_map out; std::ifstream ifs(checkpoint_file, std::ios::binary); @@ -890,6 +902,7 @@ static std::unordered_map target_bpw_type( return out; }; + // Deletes checkpoint file unless --keep-bpw-state is set auto delete_bpw_state = [&] { std::ifstream ifs(checkpoint_file); if (ifs.good() && !params->keep_bpw_state) { @@ -898,6 +911,7 @@ static std::unordered_map target_bpw_type( } }; + // Check for user interrupt and save progress auto check_signal_handler = [&](const std::vector & all_vec) { if (bpw_stop.load(std::memory_order_relaxed)) { LLAMA_LOG_INFO("\n%s: saving progress for %lu tensors to %s\n", func, all_vec.size(), checkpoint_file.c_str()); @@ -1161,7 +1175,7 @@ static std::unordered_map target_bpw_type( const auto bpw_data = load_bpw_state(); - // Parallelize tensor processing - courtesy of https://github.com/ddh0 + // Parallelize tensor processing (courtesy of https://github.com/ddh0) auto process_tensor = [&](const llama_model_loader::llama_tensor_weight * tw, std::vector> & thread_local_buffer, std::mutex & loader_mutex, @@ -1555,6 +1569,7 @@ static std::unordered_map target_bpw_type( size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0); size_t budget_bytes = target_total_bytes >= nq_bytes ? 
target_total_bytes - nq_bytes : min_bytes; + // Get the types' override auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", func); @@ -1592,7 +1607,7 @@ static std::unordered_map target_bpw_type( return important; }; - // Lagrangian relaxation to minimise error subject to a bpw target constraint + // Lagrangian relaxation to minimize error subject to a bpw target constraint auto lagrange_penalty = [&](const double mu, std::vector & choice, size_t & bytes, double & err) { choice.resize(all.size()); bytes = 0; @@ -1636,7 +1651,7 @@ static std::unordered_map target_bpw_type( lagrange_penalty(mu_lo, choice_lo, bytes_lo, err_lo); - // increase mu until we get under budget or hit a safety cap + // Increase mu until we get under budget or hit a safety cap { int expand = 0; size_t prev_bytes_hi = std::numeric_limits::max(); @@ -1741,7 +1756,7 @@ static std::unordered_map target_bpw_type( } } - delete_bpw_state(); // we're done, clear any checkpoint + delete_bpw_state(); return emit_overrides(); } From 3be3b1ef87f353840de25fd7bffde00330fac7b4 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Dec 2025 17:44:43 +0000 Subject: [PATCH 151/155] Update usage() --- tools/quantize/quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index a1426ea4a3..cbb8655c63 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -136,7 +136,7 @@ static void usage(const char * executable) { printf(" Advanced option to automatically select quantization types to achieve a total bits per weight (bpw) target\n"); printf(" --no-importance: distribute bpw budget equitably across all tensors\n"); printf(" Advanced option to disable assigning more bpw budget to important tensors. It may increase quality for some models\n"); - printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); + printf(" --keep-bpw-state: save the bpw computations to -.bpw_state\n"); printf(" --bpw-state: file name to use instead of default\n"); printf(" --keep-split: will generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); From 311c2c9f0ebd6e08f5b20e8827c654cebb7a41d6 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Dec 2025 17:45:05 +0000 Subject: [PATCH 152/155] Update README.md --- tools/quantize/README.md | 71 +++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/tools/quantize/README.md b/tools/quantize/README.md index 22f0710286..9b93edafec 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -58,6 +58,8 @@ Options: Advanced options: * `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times. * `--prune-layers` prune (remove) the layers in the list +* `--target-bpw` automatically choose quant types so that the overall model size matches a given bits per weight (bpw) average. +* `--no-importance` during bpw computation, treat each tensor equally instead of prioritizing some. It may yield better quality for some models. * `--override-kv` option to override model metadata by key in the quantized model. 
May be specified multiple times Examples: @@ -97,59 +99,54 @@ Examples: ./llama-quantize --imatrix imatrix.gguf --override-kv qwen3moe.expert_used_count=int:16 --prune-layers 20,21,22 input-model-f32.gguf pruned-model-f32.gguf copy 8 ``` +```bash +# quantize model targeting a specific bpw average and save the bpw computations to the default file. Model type is optional and can be omitted +./llama-quantize --target-bpw 4.567 --keep-bpw-state --imatrix imatrix.gguf input-model-f32.gguf 8 +``` + ## Memory/Disk Requirements When running the larger models, make sure you have enough disk space to store all the intermediate files. As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For exmaple (Llama 3.1): | Model | Original size | Quantized size (Q4_K_M) | -| ----: | ------------: | ----------------------: | +|------:|--------------:|------------------------:| | 8B | 32.1 GB | 4.9 GB | | 70B | 280.9 GB | 43.1 GB | | 405B | 1,625.1 GB | 249.1 GB | - ## Quantization Several quantization methods are supported. They differ in the resulting model disk size and inference speed. For example, ### [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) -| Measure | IQ1_S | IQ1_M | IQ2_XXS | IQ2_XS | IQ2_S | IQ2_M | -| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ | -| bits/weight | 2.0042 | 2.1460 | 2.3824 | 2.5882 | 2.7403 | 2.9294 | -| size (GiB) | 1.87 | 2.01 | 2.23 | 2.42 | 2.56 | 2.74 | -| prompt processing t/s @ 512 | 858.88 ±1.22 | 847.99 ±0.47 | 852.39 ±0.85 | 826.99 ±12.51 | 783.55 ±13.73 | 787.68 ±7.00 | -| text generation t/s @ 128 | 79.73 ±0.79 | 72.92 ±0.14 | 79.86 ±0.22 | 78.04 ±0.46 | 77.30 ±2.47 | 74.44 ±0.15 | - -| Measure | IQ3_XXS | IQ3_XS | IQ3_S | IQ3_M | IQ4_XS | IQ4_NL | -| --------------------------- | ------------ | ------------ | ------------ | ------------- | ------------- | ------------ | -| bits/weight | 3.2548 | 3.4977 | 3.6606 | 3.7628 | 4.4597 | 4.6818 | -| size (GiB) | 3.04 | 3.27 | 3.42 | 3.52 | 4.17 | 4.38 | -| prompt processing t/s @ 512 | 813.88 ±6.53 | 708.71 ±1.26 | 798.78 ±8.81 | 768.70 ±13.73 | 771.80 ±11.38 | 806.03 ±7.07 | -| text generation t/s @ 128 | 73.95 ±0.20 | 71.67 ±0.54 | 69.31 ±0.63 | 70.15 ±0.33 | 77.51 ±0.20 | 76.63 ±0.28 | - - -| Measure | Q2_K_S | Q2_K | Q3_K_S | Q3_K_M | Q3_K_L | Q4_K_S | -| --------------------------- | ------------ | ------------ | ------------ | ------------ | ------------ | ------------ | -| bits/weight | 2.9697 | 3.1593 | 3.6429 | 3.9960 | 4.2979 | 4.6672 | -| size (GiB) | 2.78 | 2.95 | 3.41 | 3.74 | 4.02 | 4.36 | -| prompt processing t/s @ 512 | 798.91 ±6.40 | 784.45 ±7.85 | 752.17 ±7.94 | 783.44 ±9.92 | 761.17 ±7.55 | 818.55 ±9.58 | -| text generation t/s @ 128 | 90.01 ±0.12 | 79.85 ±0.20 | 69.84 ±0.18 | 71.68 ±0.22 | 69.38 ±0.49 | 76.71 ±0.20 | - -| Measure | Q4_K_S | Q4_K_M | Q5_K_S | Q5_K_M | Q6_K | Q8_0 | -| --------------------------- | ------------ | ------------- | ------------ | ------------ | ------------- | ------------ | -| bits/weight | 4.6672 | 4.8944 | 5.5704 | 5.7036 | 6.5633 | 8.5008 | -| size (GiB) | 4.36 | 4.58 | 5.21 | 5.33 | 6.14 | 7.95 | -| prompt processing t/s @ 512 | 818.55 ±9.58 | 821.81 ±21.44 | 752.52 ±0.99 | 758.69 ±7.43 | 812.01 ±10.82 | 865.09 ±8.30 | -| text generation t/s @ 128 | 76.71 ±0.20 | 71.93 ±1.52 | 69.53 ±0.18 | 67.23 ±1.08 | 58.67 ±3.13 | 50.93 ±0.08 | - 
-| Measure | F16 | -| --------------------------- | ------------ | -| bits/weight | 16.0005 | -| size (GiB) | 14.96 | -| prompt processing t/s @ 512 | 923.49 ±0.53 | -| text generation t/s @ 128 | 29.17 ±0.04 | +| Quant Type | bits/weight | size (GiB) | prompt processing t/s @ 512 | text generation t/s @ 128 | +|:----------:|------------:|-----------:|----------------------------:|--------------------------:| +| IQ1_S | 2.0042 | 1.87 | 858.88 ±1.22 | 79.73 ±0.79 | +| IQ1_M | 2.1460 | 2.01 | 847.99 ±0.47 | 72.92 ±0.14 | +| IQ2_XXS | 2.3824 | 2.23 | 852.39 ±0.85 | 79.86 ±0.22 | +| IQ2_XS | 2.5882 | 2.42 | 826.99 ±12.51 | 78.04 ±0.46 | +| IQ2_S | 2.7403 | 2.56 | 783.55 ±13.73 | 77.30 ±2.47 | +| IQ2_M | 2.9294 | 2.74 | 787.68 ±7.00 | 74.44 ±0.15 | +| IQ3_XXS | 3.2548 | 3.04 | 813.88 ±6.53 | 73.95 ±0.20 | +| IQ3_XS | 3.4977 | 3.27 | 708.71 ±1.26 | 71.67 ±0.54 | +| IQ3_S | 3.6606 | 3.42 | 798.78 ±8.81 | 69.31 ±0.63 | +| IQ3_M | 3.7628 | 3.52 | 768.70 ±13.73 | 70.15 ±0.33 | +| IQ4_XS | 4.4597 | 4.17 | 771.80 ±11.38 | 77.51 ±0.20 | +| IQ4_NL | 4.6818 | 4.38 | 818.55 ±9.58 | 76.71 ±0.20 | +| Q2_K_S | 2.9697 | 2.78 | 798.91 ±6.40 | 90.01 ±0.12 | +| Q2_K | 3.1593 | 2.95 | 784.45 ±7.85 | 79.85 ±0.20 | +| Q3_K_S | 3.6429 | 3.41 | 752.17 ±7.94 | 71.68 ±0.22 | +| Q3_K_L | 4.2979 | 4.02 | 761.17 ±7.55 | 69.38 ±0.49 | +| Q4_K_S | 4.6672 | 4.36 | 818.55 ±9.58 | 76.71 ±0.20 | +| Q4_K_S | 4.6672 | 4.36 | 818.55 ±9.58 | 76.71 ±0.20 | +| Q4_K_M | 4.8944 | 4.58 | 821.81 ±21.44 | 71.93 ±1.52 | +| Q5_K_S | 5.5704 | 5.21 | 752.52 ±0.99 | 69.53 ±0.18 | +| Q5_K_M | 5.7036 | 5.33 | 758.69 ±7.43 | 67.23 ±1.08 | +| Q6_K | 6.5633 | 6.14 | 812.01 ±10.82 | 58.67 ±3.13 | +| Q8_0 | 8.5008 | 7.95 | 865.09 ±8.30 | 50.93 ±0.08 | +| F16 | 16.0005 | 14.96 | 923.49 ±0.53 | 29.17 ±0.04 | ## Background information on llama-quantize From 7f886128617334831b6e99dfcdff994a5cf6bf4e Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 25 Dec 2025 17:47:38 +0000 Subject: [PATCH 153/155] Update README.md --- tools/quantize/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/quantize/README.md b/tools/quantize/README.md index 9b93edafec..986ba95be5 100644 --- a/tools/quantize/README.md +++ b/tools/quantize/README.md @@ -56,10 +56,10 @@ Options: * `--keep-split` will generate the quantized model in the same shards as the input file otherwise it will produce a single quantized file Advanced options: -* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times. +* `--tensor-type` quantize specific tensor(s) to specific quant types. Supports regex syntax. May be specified multiple times * `--prune-layers` prune (remove) the layers in the list -* `--target-bpw` automatically choose quant types so that the overall model size matches a given bits per weight (bpw) average. -* `--no-importance` during bpw computation, treat each tensor equally instead of prioritizing some. It may yield better quality for some models. +* `--target-bpw` automatically choose quant types so that the overall model size matches a given bits per weight (bpw) average +* `--no-importance` during bpw computation, treat each tensor equally instead of prioritizing some. It may yield better quality for some models * `--override-kv` option to override model metadata by key in the quantized model. 
May be specified multiple times Examples: From 91846ee79b385f88fea67150f1a82a5a9058e406 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Mon, 29 Dec 2025 13:02:06 +0000 Subject: [PATCH 154/155] Change checkpoint file magic --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cab4ecaeec..f518c10781 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -695,7 +695,7 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); - constexpr uint32_t file_magic = 0x42505731; // BPW1 + constexpr uint32_t file_magic = 0x4d534531; // MSE1 constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d; const char * func = __func__; From 960ef9614178a825578c32417ed5876c367a506d Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 1 Jan 2026 13:44:59 +0000 Subject: [PATCH 155/155] Prepare for future optimization algorithms --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f518c10781..67e5aa9827 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -695,7 +695,7 @@ static std::unordered_map target_bpw_type( constexpr double epsilon = 1e-12; constexpr double infinity = std::numeric_limits::infinity(); - constexpr uint32_t file_magic = 0x4d534531; // MSE1 + constexpr uint32_t file_magic = 0x4d534531; // MSE1 constexpr uint64_t arbitrary_magic = 0xeabada55cafed00d; const char * func = __func__; @@ -785,7 +785,7 @@ static std::unordered_map target_bpw_type( std::replace(gen_name.begin(), gen_name.end(), ' ', '_'); gen_name.empty() ? checkpoint_file = ml.arch_name : checkpoint_file = gen_name; - checkpoint_file += "-" + std::string(hex) + ".bpw_state"; + checkpoint_file += "-" + std::string(hex) + "-mse.bpw_state"; if (params->keep_bpw_state && params->bpw_state) { const auto * filename = static_cast(params->bpw_state);
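Taken together, the patches in this series implement one core idea: give every quantizable tensor a small table of (bytes, error) candidates, keep only the lower convex envelope of those points, and then spend a byte budget derived from the requested bits per weight where it buys the largest error reduction per byte, with important tensors (output.weight, ffn_down, attn_output, ...) receiving a 5x boost in the final pass. Below is a minimal, standalone C++ sketch of that greedy final pass only. It is the editor's illustration, not code from the PR: the struct names, the toy candidate tables in main(), and the simplified tie-breaking are assumptions, and the real target_bpw_type() in src/llama-quant.cpp additionally uses Lagrangian relaxation, activation/imatrix-based error estimates, checkpointing, and signal handling.

```cpp
// Illustrative sketch of a greedy "error reduction per byte" allocator under a byte budget.
// Assumed simplified data model; not the PR's actual structs.
#include <cstdio>
#include <cstddef>
#include <string>
#include <vector>

struct candidate { size_t bytes; double error; };

struct tensor_entry {
    std::string name;
    bool important;                  // e.g. output.weight, *.ffn_down*, *.attn_output*
    std::vector<candidate> options;  // assumed sorted by ascending bytes (hull points)
    size_t choice;                   // index into options; start at the cheapest type
};

static void greedy_allocate(std::vector<tensor_entry> & tensors, size_t budget_bytes) {
    size_t used = 0;
    for (const auto & t : tensors) { used += t.options[t.choice].bytes; }

    while (true) {
        int    best_i     = -1;
        double best_ratio = 0.0;
        for (int i = 0; i < (int) tensors.size(); ++i) {
            const auto & t = tensors[i];
            if (t.choice + 1 >= t.options.size()) { continue; }   // already at the richest type
            const candidate & cur = t.options[t.choice];
            const candidate & nxt = t.options[t.choice + 1];
            const size_t extra = nxt.bytes - cur.bytes;
            const double gain  = cur.error - nxt.error;           // error reduction from upgrading
            if (gain <= 0.0 || extra == 0 || used + extra > budget_bytes) { continue; }
            double ratio = gain / (double) extra;                 // error reduction per extra byte
            if (t.important) { ratio *= 5.0; }                    // importance boost, cf. PATCH 146
            if (ratio > best_ratio) { best_ratio = ratio; best_i = i; }
        }
        if (best_i < 0) { break; }                                // nothing fits or no gain left
        auto & t = tensors[best_i];
        used += t.options[t.choice + 1].bytes - t.options[t.choice].bytes;
        ++t.choice;
    }
}

int main() {
    // Toy candidate tables standing in for per-tensor (bytes, error) hull points.
    std::vector<tensor_entry> tensors = {
        { "blk.0.ffn_down.weight", true,  { {100, 9.0}, {150, 4.0}, {220, 1.5} }, 0 },
        { "blk.0.attn_q.weight",   false, { { 80, 6.0}, {120, 3.5}, {200, 1.0} }, 0 },
    };
    greedy_allocate(tensors, /*budget_bytes=*/330);
    for (const auto & t : tensors) {
        std::printf("%-24s -> option %zu (%zu bytes)\n", t.name.c_str(), t.choice, t.options[t.choice].bytes);
    }
    return 0;
}
```

In the PR itself the candidates come from estimate_error() over sampled rows, the budget is computed from target_bpw and the model's element count, and ties between equal ratios are broken in favor of the largest absolute error improvement, as noted in the diffs above.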