From 844ad3e3268259b85456ebfd4d3417f9b3825c29 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 12:47:13 -0600 Subject: [PATCH 01/35] clean slate for branch --- include/llama.h | 1 + src/llama-quant.cpp | 3 ++- tools/quantize/quantize.cpp | 8 ++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/llama.h b/include/llama.h index 46c3672e98..8bcefda896 100644 --- a/include/llama.h +++ b/include/llama.h @@ -393,6 +393,7 @@ extern "C" { void * kv_overrides; // pointer to vector containing overrides void * tensor_types; // pointer to vector containing tensor types void * prune_layers; // pointer to vector containing layer indices to prune + bool dry_run; // calculate and show the final quantization size without performing quantization } llama_model_quantize_params; typedef struct llama_logit_bias { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a7891647c3..730f13e29e 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1048,7 +1048,8 @@ llama_model_quantize_params llama_model_quantize_default_params() { /*.imatrix =*/ nullptr, /*.kv_overrides =*/ nullptr, /*.tensor_type =*/ nullptr, - /*.prune_layers =*/ nullptr + /*.prune_layers =*/ nullptr, + /*.dry_run =*/ false }; return result; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index c0f49279ee..3f99d9e6a7 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -120,7 +120,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp static void usage(const char * executable) { printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable); printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n"); - printf(" [--prune-layers] [--keep-split] [--override-kv]\n"); + printf(" [--prune-layers] [--keep-split] [--override-kv] [--dry-run]\n"); printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n"); printf(" --allow-requantize\n"); printf(" allow requantizing tensors that have already been quantized\n"); @@ -156,7 +156,9 @@ static void usage(const char * executable) { printf(" generate quantized model in the same shards as input\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" override model metadata by key in the quantized model. 
may be specified multiple times.\n"); - printf(" WARNING: this is an advanced option, use with care.\n\n"); + printf(" WARNING: this is an advanced option, use with care.\n"); + printf(" --dry-run\n"); + printf(" calculate and show the final quantization size without performing quantization\n\n"); printf("note: --include-weights and --exclude-weights cannot be used together\n\n"); printf("-----------------------------------------------------------------------------\n"); printf(" allowed quantization types\n"); @@ -532,6 +534,8 @@ int main(int argc, char ** argv) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--dry-run") == 0) { + params.dry_run = true; } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { params.allow_requantize = true; } else if (strcmp(argv[arg_idx], "--pure") == 0) { From 0d22288f001163d5312a33a99ebf9db26c37e344 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 14:08:01 -0600 Subject: [PATCH 02/35] use 6 characters for tensor dims --- src/llama-impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 8e3e7b223a..60c7fcd050 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -111,7 +111,7 @@ std::string llama_format_tensor_shape(const struct ggml_tensor * t) { char buf[256]; snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]); for (int i = 1; i < GGML_MAX_DIMS; i++) { - snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]); + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]); } return buf; } From 56c27b13ad0ea970111b68c90056ed8c830d2dc2 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 14:08:17 -0600 Subject: [PATCH 03/35] add --dry-run to llama-quantize --- src/llama-quant.cpp | 259 ++++++++++++++++++++---------------- tools/quantize/quantize.cpp | 39 ++++-- 2 files changed, 169 insertions(+), 129 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 730f13e29e..2836caaf3a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -735,24 +735,31 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: }; const auto tn = LLM_TN(model.arch); - new_ofstream(0); + + // no output file for --dry-run + if (!params->dry_run) { + new_ofstream(0); + } + for (const auto * it : tensors) { const auto & weight = *it; ggml_tensor * tensor = weight.tensor; - if (weight.idx != cur_split && params->keep_split) { + if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) { close_ofstream(); new_ofstream(weight.idx); } const std::string name = ggml_get_name(tensor); - if (!ml.use_mmap) { - if (read_data.size() < ggml_nbytes(tensor)) { - read_data.resize(ggml_nbytes(tensor)); + if (!params->dry_run) { + if (!ml.use_mmap) { + if (read_data.size() < ggml_nbytes(tensor)) { + read_data.resize(ggml_nbytes(tensor)); + } + tensor->data = read_data.data(); } - tensor->data = read_data.data(); + ml.load_data_for(tensor); } - ml.load_data_for(tensor); LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", ++idx, ml.n_tensors, @@ -900,126 +907,148 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize = tensor->type != new_type; } - if (!quantize) { - new_type = tensor->type; - new_data = tensor->data; - new_size = ggml_nbytes(tensor); - LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0); - } else { - const int64_t nelements = 
ggml_nelements(tensor); - - const float * imatrix = nullptr; - if (imatrix_data) { - auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped)); - if (it == imatrix_data->end()) { - LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); - } else { - if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { - imatrix = it->second.data(); - } else { - LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, - int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name); - - // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix - // this is a significant error and it may be good idea to abort the process if this happens, - // since many people will miss the error and not realize that most of the model is being quantized without an imatrix - // tok_embd should be ignored in this case, since it always causes this warning - if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { - throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s", - int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name)); - } - } - } - } - if ((new_type == GGML_TYPE_IQ2_XXS || - new_type == GGML_TYPE_IQ2_XS || - new_type == GGML_TYPE_IQ2_S || - new_type == GGML_TYPE_IQ1_S || - (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || - (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { - LLAMA_LOG_ERROR("\n\n============================================================\n"); - LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); - LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); - LLAMA_LOG_ERROR("============================================================\n\n"); - throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name)); - } - - float * f32_data; - - if (tensor->type == GGML_TYPE_F32) { - f32_data = (float *) tensor->data; - } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { - throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); + // we have now decided on the target type for this tensor + // the --dry-run option calculates the final quantization size without quantizting + if (params->dry_run) { + if (quantize) { + new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n", + ggml_nbytes(tensor)/1024.0/1024.0, + new_size/1024.0/1024.0, + ggml_type_name(new_type)); } else { - llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread); - f32_data = (float *) f32_conv_buf.data(); + new_size = ggml_nbytes(tensor); + LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0); } + total_size_org += ggml_nbytes(tensor); + total_size_new += new_size; + continue; + } else { + // no --dry-run, perform quantization + if (!quantize) { + new_type = tensor->type; + new_data = tensor->data; + new_size = ggml_nbytes(tensor); + LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0); + } else { + const int64_t nelements = ggml_nelements(tensor); - LLAMA_LOG_INFO("converting to %s .. 
", ggml_type_name(new_type)); - fflush(stdout); + const float * imatrix = nullptr; + if (imatrix_data) { + auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + if (it == imatrix_data->end()) { + LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); + } else { + if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { + imatrix = it->second.data(); + } else { + LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, + int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name); - if (work.size() < (size_t)nelements * 4) { - work.resize(nelements * 4); // upper bound on size - } - new_data = work.data(); - - const int64_t n_per_row = tensor->ne[0]; - const int64_t nrows = tensor->ne[1]; - - static const int64_t min_chunk_size = 32 * 512; - const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)); - - const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; - const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; - const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; - - // quantize each expert separately since they have different importance matrices - new_size = 0; - for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { - const float * f32_data_03 = f32_data + i03 * nelements_matrix; - void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; - const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr; - - new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); - - // TODO: temporary sanity check that the F16 -> MXFP4 is lossless -#if 0 - if (new_type == GGML_TYPE_MXFP4) { - auto * x = f32_data_03; - - //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row); - std::vector deq(nrows*n_per_row); - const ggml_type_traits * qtype = ggml_get_type_traits(new_type); - qtype->to_float(new_data_03, deq.data(), deq.size()); - - double err = 0.0f; - for (int i = 0; i < (int) deq.size(); ++i) { - err += fabsf(deq[i] - x[i]); - //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) { - if (deq[i] != x[i]) { - LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]); + // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix + // this is a significant error and it may be good idea to abort the process if this happens, + // since many people will miss the error and not realize that most of the model is being quantized without an imatrix + // tok_embd should be ignored in this case, since it always causes this warning + if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { + throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s", + int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name)); + } } } - //LLAMA_LOG_INFO("err = %f\n", err); - GGML_ASSERT(err == 0.00000); } + if ((new_type == GGML_TYPE_IQ2_XXS || + new_type == GGML_TYPE_IQ2_XS || + new_type == GGML_TYPE_IQ2_S || + new_type == GGML_TYPE_IQ1_S || + (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || + (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { + 
LLAMA_LOG_ERROR("\n\n============================================================\n"); + LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); + LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); + LLAMA_LOG_ERROR("============================================================\n\n"); + throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name)); + } + + float * f32_data; + + if (tensor->type == GGML_TYPE_F32) { + f32_data = (float *) tensor->data; + } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { + throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); + } else { + llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread); + f32_data = (float *) f32_conv_buf.data(); + } + + LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); + fflush(stdout); + + if (work.size() < (size_t)nelements * 4) { + work.resize(nelements * 4); // upper bound on size + } + new_data = work.data(); + + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; + + static const int64_t min_chunk_size = 32 * 512; + const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)); + + const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; + const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; + const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; + + // quantize each expert separately since they have different importance matrices + new_size = 0; + for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { + const float * f32_data_03 = f32_data + i03 * nelements_matrix; + void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; + const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; + + new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); + + // TODO: temporary sanity check that the F16 -> MXFP4 is lossless +#if 0 + if (new_type == GGML_TYPE_MXFP4) { + auto * x = f32_data_03; + + //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row); + std::vector deq(nrows*n_per_row); + const ggml_type_traits * qtype = ggml_get_type_traits(new_type); + qtype->to_float(new_data_03, deq.data(), deq.size()); + + double err = 0.0f; + for (int i = 0; i < (int) deq.size(); ++i) { + err += fabsf(deq[i] - x[i]); + //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) { + if (deq[i] != x[i]) { + LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]); + } + } + //LLAMA_LOG_INFO("err = %f\n", err); + GGML_ASSERT(err == 0.00000); + } #endif + } + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); } - LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); - } - total_size_org += ggml_nbytes(tensor); - total_size_new += new_size; + total_size_org += ggml_nbytes(tensor); + total_size_new += new_size; - // update the gguf meta data as we go - gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type); - GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size); - gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data); + // update the gguf meta data as we go + gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type); + GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size); + gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data); - // write tensor data + padding - fout.write((const char *) new_data, new_size); - zeros(fout, GGML_PAD(new_size, align) - new_size); + // write tensor data + padding + fout.write((const char *) new_data, new_size); + zeros(fout, GGML_PAD(new_size, align) - new_size); + } // no --dry-run + } // iterate over tensors + + if (!params->dry_run) { + close_ofstream(); } - close_ofstream(); LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0); LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0); diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 3f99d9e6a7..91b0367742 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -626,7 +626,7 @@ int main(int argc, char ** argv) { llama_backend_init(); - // parse command line arguments +// parse command line arguments const std::string fname_inp = argv[arg_idx]; arg_idx++; std::string fname_out; @@ -634,22 +634,26 @@ int main(int argc, char ** argv) { std::string ftype_str; std::string suffix = ".gguf"; if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { - std::string fpath; - const size_t pos = fname_inp.find_last_of("/\\"); - if (pos != std::string::npos) { - fpath = fname_inp.substr(0, pos + 1); - } + // argv[arg_idx] is the ftype directly: + if (!params.dry_run) { + std::string fpath; + const size_t pos = fname_inp.find_last_of("/\\"); + if (pos != std::string::npos) { + fpath = fname_inp.substr(0, pos + 1); + } - // export as [inp path]/ggml-model-[ftype]. 
Only add extension if there is no splitting - fname_out = fpath + "ggml-model-" + ftype_str; - if (!params.keep_split) { - fname_out += suffix; + // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting + fname_out = fpath + "ggml-model-" + ftype_str; + if (!params.keep_split) { + fname_out += suffix; + } } arg_idx++; if (ftype_str == "COPY") { params.only_copy = true; } } else { + // argv[arg_idx] is not a valid ftype, so treat it as output path: fname_out = argv[arg_idx]; if (params.keep_split && fname_out.find(suffix) != std::string::npos) { fname_out = fname_out.substr(0, fname_out.length() - suffix.length()); @@ -692,14 +696,21 @@ int main(int argc, char ** argv) { return 1; } - if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) { - fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str()); - return 1; + if (!params.dry_run) { + if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) { + fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str()); + return 1; + } } print_build_info(); - fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); + if (params.dry_run) { + fprintf(stderr, "%s: calculating quantization size for '%s' as %s", __func__, fname_inp.c_str(), ftype_str.c_str()); + } else { + fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str()); + } + if (params.nthread > 0) { fprintf(stderr, " using %d threads", params.nthread); } From c3f42dedd1f446b2e7733ef12c6d93e61a0e5509 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 14:29:22 -0600 Subject: [PATCH 04/35] use 6 characters for tensor dims (cont.) 
--- src/llama-impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 60c7fcd050..710a5a1e08 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -109,7 +109,7 @@ std::string llama_format_tensor_shape(const std::vector & ne) { std::string llama_format_tensor_shape(const struct ggml_tensor * t) { char buf[256]; - snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]); + snprintf(buf, sizeof(buf), "%6" PRId64, t->ne[0]); for (int i = 1; i < GGML_MAX_DIMS; i++) { snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %6" PRId64, t->ne[i]); } From b9b32f0d2d7a8f041d97e6d6ce00f636cdd6f42b Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 14:45:44 -0600 Subject: [PATCH 05/35] no need to re-calculate ggml_nbytes for tensor --- src/llama-quant.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2836caaf3a..e65c28723f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -750,11 +750,12 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } const std::string name = ggml_get_name(tensor); + const size_t tensor_size = ggml_nbytes(tensor); if (!params->dry_run) { if (!ml.use_mmap) { - if (read_data.size() < ggml_nbytes(tensor)) { - read_data.resize(ggml_nbytes(tensor)); + if (read_data.size() < tensor_size) { + read_data.resize(tensor_size); } tensor->data = read_data.data(); } @@ -908,19 +909,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } // we have now decided on the target type for this tensor - // the --dry-run option calculates the final quantization size without quantizting if (params->dry_run) { + // the --dry-run option calculates the final quantization size without quantizting if (quantize) { new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n", - ggml_nbytes(tensor)/1024.0/1024.0, + tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); } else { - new_size = ggml_nbytes(tensor); + new_size = tensor_size; LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0); } - total_size_org += ggml_nbytes(tensor); + total_size_org += tensor_size; total_size_new += new_size; continue; } else { @@ -928,8 +929,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (!quantize) { new_type = tensor->type; new_data = tensor->data; - new_size = ggml_nbytes(tensor); - LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0); + new_size = tensor_size; + LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0); } else { const int64_t nelements = ggml_nelements(tensor); @@ -1030,9 +1031,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } #endif } - LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0); } - total_size_org += ggml_nbytes(tensor); + total_size_org += tensor_size; total_size_new += new_size; // update the gguf meta data as we go From 150e1db21d32db1eb2b19c24cd82cd23aaf52398 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 14:49:56 -0600 Subject: [PATCH 06/35] fix indent --- tools/quantize/quantize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp 
b/tools/quantize/quantize.cpp index 91b0367742..8497cb8039 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -626,7 +626,7 @@ int main(int argc, char ** argv) { llama_backend_init(); -// parse command line arguments + // parse command line arguments const std::string fname_inp = argv[arg_idx]; arg_idx++; std::string fname_out; From 966b21a981d2279358d6de76a03dc8de6b8617d4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 15:30:12 -0600 Subject: [PATCH 07/35] show model and quant BPW when quant completes --- src/llama-quant.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e65c28723f..d7b90db01f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1051,8 +1051,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: close_ofstream(); } - LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0); - LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0); + LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements); + LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements); if (qs.n_fallback > 0) { LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", From 07f882bbbb8380ad5ef1b5da845322d8dcd11b7d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 15:36:42 -0600 Subject: [PATCH 08/35] add example to --help --- tools/quantize/quantize.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 8497cb8039..7c9a7f29cc 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -158,7 +158,8 @@ static void usage(const char * executable) { printf(" override model metadata by key in the quantized model. 
may be specified multiple times.\n"); printf(" WARNING: this is an advanced option, use with care.\n"); printf(" --dry-run\n"); - printf(" calculate and show the final quantization size without performing quantization\n\n"); + printf(" calculate and show the final quantization size without performing quantization\n"); + printf(" example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n"); printf("note: --include-weights and --exclude-weights cannot be used together\n\n"); printf("-----------------------------------------------------------------------------\n"); printf(" allowed quantization types\n"); From 2769f352077c3692e3f4cf1ad1e1fa5f56a2af7b Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 20:49:05 -0600 Subject: [PATCH 09/35] new function `tensor_requires_imatrix`, add courtesy warning about imatrix --- src/llama-quant.cpp | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d7b90db01f..8a668e6b23 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -479,6 +479,22 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } +static bool tensor_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { + if (!params->imatrix) { + if ( + dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || + dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_S || ( + dst_type == GGML_TYPE_IQ1_M && strcmp(t->name, "token_embd.weight") && + strcmp(t->name, "output.weight") + ) || ( + dst_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && + strcmp(t->name, "token_embd.weight") != 0 + ) + ) return true; + } + return false; +} + static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type default_type; llama_ftype ftype = params->ftype; @@ -741,6 +757,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_ofstream(0); } + // flag for `--dry-run`, to let the user know if imatrix will be required for a real + // quantization, as a courtesy + bool will_require_imatrix = false; + for (const auto * it : tensors) { const auto & weight = *it; ggml_tensor * tensor = weight.tensor; @@ -921,6 +941,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_size = tensor_size; LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0); } + if (!will_require_imatrix && tensor_requires_imatrix(params, tensor, new_type)) { + will_require_imatrix = true; + } total_size_org += tensor_size; total_size_new += new_size; continue; @@ -957,12 +980,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if ((new_type == GGML_TYPE_IQ2_XXS || - new_type == GGML_TYPE_IQ2_XS || - new_type == GGML_TYPE_IQ2_S || - new_type == GGML_TYPE_IQ1_S || - (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || - (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { + if (tensor_requires_imatrix(params, tensor, new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be 
garbage, so bailing out\n"); @@ -1053,6 +1071,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements); LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements); + if (!params->imatrix && params->dry_run && will_require_imatrix) { + LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n"); + } if (qs.n_fallback > 0) { LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", From ea8da0503c48077b0468c15345aaf49ebf8e1a37 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 20:57:16 -0600 Subject: [PATCH 10/35] missing __func__, move imatrix flag set --- src/llama-quant.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 8a668e6b23..76581f8b4b 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -937,13 +937,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); + if (!will_require_imatrix && tensor_requires_imatrix(params, tensor, new_type)) { + will_require_imatrix = true; + } } else { new_size = tensor_size; LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0); } - if (!will_require_imatrix && tensor_requires_imatrix(params, tensor, new_type)) { - will_require_imatrix = true; - } total_size_org += tensor_size; total_size_new += new_size; continue; @@ -1072,7 +1072,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements); LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements); if (!params->imatrix && params->dry_run && will_require_imatrix) { - LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n"); + LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n", + __func__ + ); } if (qs.n_fallback > 0) { From 3211a847ef3c153fe499aeb259e2a6f996c6e75d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 20:58:52 -0600 Subject: [PATCH 11/35] logic error --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 76581f8b4b..c411d41153 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -980,7 +980,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (tensor_requires_imatrix(params, tensor, new_type)) { + if (tensor_requires_imatrix(params, tensor, new_type) && !imatrix) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); From 55dbee2bbe1059dac78eb139869c0aa189558df2 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:03:34 -0600 Subject: [PATCH 12/35] fixup tensor_requires_imatrix --- src/llama-quant.cpp | 23 
+++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c411d41153..252fbe2085 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -480,19 +480,18 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * } static bool tensor_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { - if (!params->imatrix) { - if ( - dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || - dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_S || ( - dst_type == GGML_TYPE_IQ1_M && strcmp(t->name, "token_embd.weight") && - strcmp(t->name, "output.weight") - ) || ( - dst_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && - strcmp(t->name, "token_embd.weight") != 0 - ) - ) return true; + if (dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || + dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_S || ( + dst_type == GGML_TYPE_IQ1_M && strcmp(t->name, "token_embd.weight") && + strcmp(t->name, "output.weight") + ) || ( + dst_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && + strcmp(t->name, "token_embd.weight") != 0 + )) { + return true; + } else { + return false; } - return false; } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { From 22db76409b7495835c2fac8f491423887445ad1a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:14:19 -0600 Subject: [PATCH 13/35] add missing `GGML_TYPE`s --- src/llama-quant.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 252fbe2085..3cad6bc6e7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -480,14 +480,19 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * } static bool tensor_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { - if (dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || - dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_S || ( + if ( + dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || + dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || + dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || + dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || + ( dst_type == GGML_TYPE_IQ1_M && strcmp(t->name, "token_embd.weight") && strcmp(t->name, "output.weight") ) || ( dst_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0 - )) { + ) + ) { return true; } else { return false; @@ -979,7 +984,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (tensor_requires_imatrix(params, tensor, new_type) && !imatrix) { + if (!imatrix && tensor_requires_imatrix(params, tensor, new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); From ae786b862da889a9345a7360e1c7b57c6056510f Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:21:40 -0600 Subject: [PATCH 14/35] simplify and rename `tensor_type_requires_imatrix` --- src/llama-quant.cpp | 19 +++++-------------- 1 file 
changed, 5 insertions(+), 14 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 3cad6bc6e7..5b3fec3dc5 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -479,20 +479,11 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -static bool tensor_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { - if ( - dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || +static bool tensor_type_requires_imatrix(const ggml_type dst_type) { + if (dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || - ( - dst_type == GGML_TYPE_IQ1_M && strcmp(t->name, "token_embd.weight") && - strcmp(t->name, "output.weight") - ) || ( - dst_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && - strcmp(t->name, "token_embd.weight") != 0 - ) - ) { + dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0) { return true; } else { return false; @@ -941,7 +932,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); - if (!will_require_imatrix && tensor_requires_imatrix(params, tensor, new_type)) { + if (!will_require_imatrix && tensor_type_requires_imatrix(new_type)) { will_require_imatrix = true; } } else { @@ -984,7 +975,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (!imatrix && tensor_requires_imatrix(params, tensor, new_type)) { + if (!imatrix && tensor_type_requires_imatrix(new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); From 1ccd7a49baeb5f4643bccc75008de47ba85d843c Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:41:37 -0600 Subject: [PATCH 15/35] simplify for style --- src/llama-quant.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5b3fec3dc5..31694e2834 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -480,14 +480,12 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * } static bool tensor_type_requires_imatrix(const ggml_type dst_type) { - if (dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || + return ( + dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0) { - return true; - } else { - return false; - } + dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 + ); } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { @@ -1066,6 +1064,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements); LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, 
total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements); + if (!params->imatrix && params->dry_run && will_require_imatrix) { LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n", __func__ From 1658228d6acc770c884965ff0582a7633b75f96a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:53:07 -0600 Subject: [PATCH 16/35] add back Q2_K edge case for imatrix --- src/llama-quant.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 31694e2834..543b658e56 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -479,12 +479,15 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -static bool tensor_type_requires_imatrix(const ggml_type dst_type) { +static bool tensor_type_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { return ( dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 + dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || + ( // Q2_K is the worst k-quant type - only allow it without imatrix for token embeddings + dst_type == GGML_TYPE_Q2_K && strcmp(t->name, "token_embd.weight") != 0 + ) ); } @@ -930,7 +933,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); - if (!will_require_imatrix && tensor_type_requires_imatrix(new_type)) { + if (!will_require_imatrix && tensor_type_requires_imatrix(params, tensor, new_type)) { will_require_imatrix = true; } } else { @@ -973,7 +976,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (!imatrix && tensor_type_requires_imatrix(new_type)) { + if (!imatrix && tensor_type_requires_imatrix(params, tensor, new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); From b15bb3404cf49d4be1a4d1e5cafbdb544d086d0d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 21:57:55 -0600 Subject: [PATCH 17/35] guard ftype imatrix warning --- tools/quantize/quantize.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 7c9a7f29cc..59bf9bd3fd 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -686,11 +686,12 @@ int main(int argc, char ** argv) { } } - if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || - params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) { + if (!params.dry_run && + ( + params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M + ) && imatrix_data.empty()) { fprintf(stderr, 
"\n==========================================================================================================\n"); fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); fprintf(stderr, "==========================================================================================================\n\n\n"); From 40528248fcbc212bcde26f8d25b4b411a023d5f3 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 22:18:56 -0600 Subject: [PATCH 18/35] comment ref #12557 --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 543b658e56..49a9696503 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -484,7 +484,7 @@ static bool tensor_type_requires_imatrix(const llama_model_quantize_params * par dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || + // dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || // uncomment if #12557 is merged ( // Q2_K is the worst k-quant type - only allow it without imatrix for token embeddings dst_type == GGML_TYPE_Q2_K && strcmp(t->name, "token_embd.weight") != 0 ) From 44f9fee2488858307798bae9b576541e9e887599 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 22:23:10 -0600 Subject: [PATCH 19/35] remove per @compilade --- src/llama-quant.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 49a9696503..652d93dbc9 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -484,7 +484,6 @@ static bool tensor_type_requires_imatrix(const llama_model_quantize_params * par dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - // dst_type == GGML_TYPE_TQ1_0 || dst_type == GGML_TYPE_TQ2_0 || // uncomment if #12557 is merged ( // Q2_K is the worst k-quant type - only allow it without imatrix for token embeddings dst_type == GGML_TYPE_Q2_K && strcmp(t->name, "token_embd.weight") != 0 ) From f58de63ec30f96b1e88eecd5ca659d9248b9eda8 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 11 Feb 2026 22:30:06 -0600 Subject: [PATCH 20/35] remove unused `params` parameter --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 652d93dbc9..9781202f90 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -479,7 +479,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -static bool tensor_type_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) { +static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type) { return ( dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || @@ -932,7 +932,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); - if (!will_require_imatrix && tensor_type_requires_imatrix(params, tensor, new_type)) { + if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type)) { 
will_require_imatrix = true; } } else { @@ -975,7 +975,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (!imatrix && tensor_type_requires_imatrix(params, tensor, new_type)) { + if (!imatrix && tensor_type_requires_imatrix(tensor, new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); From 5d6c92440cc773e8362f23f8afb1d6561a26a243 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 17:52:59 -0600 Subject: [PATCH 21/35] initial commit for branch --- src/llama-quant.cpp | 400 ++++++++++++++++++++---------------- tools/quantize/quantize.cpp | 12 -- 2 files changed, 226 insertions(+), 186 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9781202f90..b805641416 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -479,7 +479,8 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type) { +// based on this tensor and the destination tensor type, do we require an importance matrix? +static bool tensor_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type) { return ( dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || @@ -490,6 +491,151 @@ static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type ); } +// do we allow this tensor to be quantized? +static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) { + const std::string name = tensor->name; + + // This used to be a regex, but has an extreme cost to compile times. + bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + + // quantize only 2D and 3D tensors (experts) + quantize &= (ggml_n_dims(tensor) >= 2); + + // do not quantize norm tensors + quantize &= name.find("_norm.weight") == std::string::npos; + + quantize &= params->quantize_output_tensor || name != "output.weight"; + quantize &= !params->only_copy; + + // do not quantize expert gating tensors + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + + // these are very small (e.g. 
4x4) + quantize &= name.find("altup") == std::string::npos; + quantize &= name.find("laurel") == std::string::npos; + + // these are not too big so keep them as it is + quantize &= name.find("per_layer_model_proj") == std::string::npos; + + // do not quantize positional embeddings and token types (BERT) + quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight"); + quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); + + // do not quantize Mamba /Kimi's small conv1d weights + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ssm_conv1d") == std::string::npos; + quantize &= name.find("shortconv.conv.weight") == std::string::npos; + + // do not quantize RWKV's small yet 2D weights + quantize &= name.find("time_mix_first.weight") == std::string::npos; + quantize &= name.find("time_mix_w0.weight") == std::string::npos; + quantize &= name.find("time_mix_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_v0.weight") == std::string::npos; + quantize &= name.find("time_mix_v1.weight") == std::string::npos; + quantize &= name.find("time_mix_v2.weight") == std::string::npos; + quantize &= name.find("time_mix_a0.weight") == std::string::npos; + quantize &= name.find("time_mix_a1.weight") == std::string::npos; + quantize &= name.find("time_mix_a2.weight") == std::string::npos; + quantize &= name.find("time_mix_g1.weight") == std::string::npos; + quantize &= name.find("time_mix_g2.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + + // do not quantize relative position bias (T5) + quantize &= name.find("attn_rel_b.weight") == std::string::npos; + + // do not quantize specific multimodal tensors + quantize &= name.find(".position_embd.") == std::string::npos; + + return quantize; +} + +static ggml_type get_tensor_target_type( + quantize_state_impl & qs, + const llama_model_quantize_params * params, + const ggml_tensor * tensor, + ggml_type default_type +) { + ggml_type new_type; + // get more optimal quantization type based on the tensor shape, layer, etc. 
+ if (!params->pure && ggml_is_quantized(default_type)) { + + // if the user provided tensor types - use those + bool manual = false; + if (params->tensor_types) { + const std::vector & tensor_types = *static_cast *>(params->tensor_types); + const std::string tensor_name(tensor->name); + for (const auto & [tname, qtype] : tensor_types) { + if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { + if (qtype != new_type) { + LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); + new_type = qtype; // if two or more types are specified for the same tensor, the last match wins + manual = true; + break; + } + } + } + } + + // if not manual - use the standard logic for choosing the quantization type based on the selected mixture + if (!manual) { + new_type = llama_tensor_get_type(qs, new_type, tensor, params->ftype); + } + + // incompatible tensor shapes are handled here - fallback to a compatible type + { + bool convert_incompatible_tensor = false; + + const int64_t nx = tensor->ne[0]; + const int64_t ny = tensor->ne[1]; + const int64_t qk_k = ggml_blck_size(new_type); + + if (nx % qk_k != 0) { + LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); + convert_incompatible_tensor = true; + } else { + ++qs.n_k_quantized; + } + + if (convert_incompatible_tensor) { + switch (new_type) { + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; + case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; + case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; + case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; + default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); + } + if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { + new_type = GGML_TYPE_F16; + } + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); + ++qs.n_fallback; + } + } + } + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } + return new_type; +} + static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { ggml_type default_type; llama_ftype ftype = params->ftype; @@ -628,8 +774,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: int blk_id = 0; // make a list of weights - std::vector tensors; - tensors.reserve(ml.weights_map.size()); + std::vector weights; + weights.reserve(ml.weights_map.size()); for (const auto & it : ml.weights_map) { const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id)); if (remapped_name.empty()) { @@ -641,8 +787,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ggml_set_name(it.second.tensor, remapped_name.c_str()); LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor)); } 
- tensors.push_back(&it.second); + weights.push_back(&it.second); } + + // make a list of tensors (same pointers as from weights) + std::vector tensors; + tensors.reserve(weights.size()); + for (size_t i = 0; i < weights.size(); ++i) { + tensors.push_back(weights[i]->tensor); + } + if (!prune_list.empty()) { gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id); } @@ -657,26 +811,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: }); } - for (const auto * it : tensors) { - const struct ggml_tensor * tensor = it->tensor; - - const std::string name = ggml_get_name(tensor); - - // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos || - name.find("attn_qkv.weight") != std::string::npos || - name.find("attn_kv_b.weight")!= std::string::npos) { - ++qs.n_attention_wv; - } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { - qs.has_output = true; - } - } - - qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; - - size_t total_size_org = 0; - size_t total_size_new = 0; - std::vector workers; workers.reserve(nthread); @@ -690,23 +824,61 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // Assume split index is continuous if (params->keep_split) { - for (const auto * it : tensors) { + for (const auto * it : weights) { n_split = std::max(uint16_t(it->idx + 1), n_split); } } std::vector ctx_outs(n_split); ctx_outs[0] = std::move(ctx_out); - // populate the original tensors so we get an initial meta data - for (const auto * it : tensors) { + // flag for `--dry-run`, to let the user know if imatrix will be required for a real + // quantization, as a courtesy + bool will_require_imatrix = false; + + // this is the preliminary iteration over all weights (not the main loop) + for (const auto * it : weights) { + const ggml_tensor * tensor = it->tensor; + const std::string name = tensor->name; + + // TODO: avoid hardcoded tensor names - use the TN_* constants + if (name.find("attn_v.weight") != std::string::npos || + name.find("attn_qkv.weight") != std::string::npos || + name.find("attn_kv_b.weight")!= std::string::npos) { + ++qs.n_attention_wv; + } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { + qs.has_output = true; + } + + // populate the original tensors so we get an initial meta data uint16_t i_split = params->keep_split ? 
it->idx : 0; - ggml_tensor * tensor = it->tensor; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty()); } gguf_add_tensor(ctx_outs[i_split].get(), tensor); + + // TODO: we could save this per-tensor and correlate it with the vector of tensors so we + // don't have to call this function again later (currently twice per tensor) + ggml_type target_type = get_tensor_target_type(qs, params, tensor, default_type); + + if (!params->imatrix && + tensor_allows_quantization(params, model.arch, tensor) && + tensor_requires_imatrix(tensor, target_type) + ) { + if (params->dry_run) { + will_require_imatrix = true; // set flag for warning later, but continue with dry run + } else { + LLAMA_LOG_ERROR("\n\n============================================================================\n" + " ERROR: this quantization requires an importance matrix!\n" + " offending tensor: %s (target type: %s)\n" + "============================================================================\n\n", + name, ggml_type_name(target_type)); + throw new std::runtime_error("this quantization requires an imatrix!"); + } + } } + qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; + // Set split info if needed if (n_split > 1) { for (size_t i = 0; i < ctx_outs.size(); ++i) { @@ -752,13 +924,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: new_ofstream(0); } - // flag for `--dry-run`, to let the user know if imatrix will be required for a real - // quantization, as a courtesy - bool will_require_imatrix = false; + size_t total_size_org = 0; + size_t total_size_new = 0; - for (const auto * it : tensors) { + // iterate over all weights (main loop) + for (const auto * it : weights) { const auto & weight = *it; ggml_tensor * tensor = weight.tensor; + if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) { close_ofstream(); new_ofstream(weight.idx); @@ -778,161 +951,40 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ", - ++idx, ml.n_tensors, - ggml_get_name(tensor), - llama_format_tensor_shape(tensor).c_str(), - ggml_type_name(tensor->type)); + ++idx, ml.n_tensors, + ggml_get_name(tensor), + llama_format_tensor_shape(tensor).c_str(), + ggml_type_name(tensor->type)); - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + // will we quantize this tensor? + bool do_quantize = tensor_allows_quantization(params, model.arch, tensor); - // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); + ggml_type new_type = default_type; - // do not quantize norm tensors - quantize &= name.find("_norm.weight") == std::string::npos; + // if so, what will be the target type? + if (do_quantize) { + new_type = get_tensor_target_type(qs, params, tensor, default_type); + // If we've decided to quantize to the same type the tensor is already + // in then there's nothing to do. + do_quantize = tensor->type != new_type; + } - quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= !params->only_copy; - - // do not quantize expert gating tensors - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; - - // these are very small (e.g. 
4x4) - quantize &= name.find("altup") == std::string::npos; - quantize &= name.find("laurel") == std::string::npos; - - // these are not too big so keep them as it is - quantize &= name.find("per_layer_model_proj") == std::string::npos; - - // do not quantize positional embeddings and token types (BERT) - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - - // do not quantize Mamba /Kimi's small conv1d weights - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d") == std::string::npos; - quantize &= name.find("shortconv.conv.weight") == std::string::npos; - - // do not quantize RWKV's small yet 2D weights - quantize &= name.find("time_mix_first.weight") == std::string::npos; - quantize &= name.find("time_mix_w0.weight") == std::string::npos; - quantize &= name.find("time_mix_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_v0.weight") == std::string::npos; - quantize &= name.find("time_mix_v1.weight") == std::string::npos; - quantize &= name.find("time_mix_v2.weight") == std::string::npos; - quantize &= name.find("time_mix_a0.weight") == std::string::npos; - quantize &= name.find("time_mix_a1.weight") == std::string::npos; - quantize &= name.find("time_mix_a2.weight") == std::string::npos; - quantize &= name.find("time_mix_g1.weight") == std::string::npos; - quantize &= name.find("time_mix_g2.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - - // do not quantize relative position bias (T5) - quantize &= name.find("attn_rel_b.weight") == std::string::npos; - - // do not quantize specific multimodal tensors - quantize &= name.find(".position_embd.") == std::string::npos; - - ggml_type new_type; void * new_data; size_t new_size; - if (quantize) { - new_type = default_type; + // + // perform quantization (or dry run) + // - // get more optimal quantization type based on the tensor shape, layer, etc. 
- if (!params->pure && ggml_is_quantized(default_type)) { - // if the user provided tensor types - use those - bool manual = false; - if (params->tensor_types) { - const std::vector & tensor_types = *static_cast *>(params->tensor_types); - const std::string tensor_name(tensor->name); - for (const auto & [tname, qtype] : tensor_types) { - if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { - if (qtype != new_type) { - LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); - new_type = qtype; // if two or more types are specified for the same tensor, the last match wins - manual = true; - break; - } - } - } - } - - // if not manual - use the standard logic for choosing the quantization type based on the selected mixture - if (!manual) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - } - - // incompatible tensor shapes are handled here - fallback to a compatible type - { - bool convert_incompatible_tensor = false; - - const int64_t nx = tensor->ne[0]; - const int64_t ny = tensor->ne[1]; - const int64_t qk_k = ggml_blck_size(new_type); - - if (nx % qk_k != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); - convert_incompatible_tensor = true; - } else { - ++qs.n_k_quantized; - } - - if (convert_incompatible_tensor) { - switch (new_type) { - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; - case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; - case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; - case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; - default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); - } - if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { - new_type = GGML_TYPE_F16; - } - LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); - ++qs.n_fallback; - } - } - } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } - if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; - } - - // If we've decided to quantize to the same type the tensor is already - // in then there's nothing to do. 
- quantize = tensor->type != new_type; - } - - // we have now decided on the target type for this tensor if (params->dry_run) { // the --dry-run option calculates the final quantization size without quantizting - if (quantize) { + if (do_quantize) { new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); - if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type)) { + if (!will_require_imatrix && tensor_requires_imatrix(tensor, new_type)) { will_require_imatrix = true; } } else { @@ -944,7 +996,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: continue; } else { // no --dry-run, perform quantization - if (!quantize) { + if (!do_quantize) { new_type = tensor->type; new_data = tensor->data; new_size = tensor_size; @@ -975,7 +1027,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (!imatrix && tensor_type_requires_imatrix(tensor, new_type)) { + if (!imatrix && tensor_requires_imatrix(tensor, new_type)) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 59bf9bd3fd..e9448028da 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -686,18 +686,6 @@ int main(int argc, char ** argv) { } } - if (!params.dry_run && - ( - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M - ) && imatrix_data.empty()) { - fprintf(stderr, "\n==========================================================================================================\n"); - fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); - fprintf(stderr, "==========================================================================================================\n\n\n"); - return 1; - } - if (!params.dry_run) { if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) { fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str()); From 67e25bbae1fd675ebeacd1f2720b4bfacbe9cc23 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:02:40 -0600 Subject: [PATCH 22/35] fix compile errors --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index b805641416..467d847196 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -559,7 +559,7 @@ static ggml_type get_tensor_target_type( const ggml_tensor * tensor, ggml_type default_type ) { - ggml_type new_type; + ggml_type new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. 
if (!params->pure && ggml_is_quantized(default_type)) { @@ -803,7 +803,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // keep_split requires that the weights are sorted by split index if (params->keep_split) { - std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { + std::sort(weights.begin(), weights.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { if (a->idx == b->idx) { return a->offs < b->offs; } @@ -871,7 +871,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: " ERROR: this quantization requires an importance matrix!\n" " offending tensor: %s (target type: %s)\n" "============================================================================\n\n", - name, ggml_type_name(target_type)); + name.c_str(), ggml_type_name(target_type)); throw new std::runtime_error("this quantization requires an imatrix!"); } } From 1f25c130de615417bc2312ef6266acc32acf70dd Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:11:44 -0600 Subject: [PATCH 23/35] pretty error msg --- src/llama-quant.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 467d847196..dd81b310d8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -867,10 +867,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->dry_run) { will_require_imatrix = true; // set flag for warning later, but continue with dry run } else { - LLAMA_LOG_ERROR("\n\n============================================================================\n" - " ERROR: this quantization requires an importance matrix!\n" - " offending tensor: %s (target type: %s)\n" - "============================================================================\n\n", + LLAMA_LOG_ERROR("\n============================================================================\n" + " ERROR: this quantization requires an importance matrix!\n" + " - offending tensor: %s\n" + " - target type: %s\n" + "============================================================================\n\n", name.c_str(), ggml_type_name(target_type)); throw new std::runtime_error("this quantization requires an imatrix!"); } From 6734e77662dac4ab24f4b5cb5cc39609ea3d2b13 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:22:14 -0600 Subject: [PATCH 24/35] don't throw by pointer; unify MiB formatting --- src/llama-quant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index dd81b310d8..c9b7e5c7ce 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -873,7 +873,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: " - target type: %s\n" "============================================================================\n\n", name.c_str(), ggml_type_name(target_type)); - throw new std::runtime_error("this quantization requires an imatrix!"); + throw std::runtime_error("this quantization requires an imatrix!"); } } } @@ -981,7 +981,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // the --dry-run option calculates the final quantization size without quantizting if (do_quantize) { new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); - LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n", + LLAMA_LOG_INFO("size 
= %8.3f MiB -> %8.3f MiB (%s)\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); @@ -1097,7 +1097,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } #endif } - LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0); + LLAMA_LOG_INFO("size = %8.3f MiB -> %8.3f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0); } total_size_org += tensor_size; total_size_new += new_size; From d648629f560d7759524e6a76368b825ebf9aa652 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:24:16 -0600 Subject: [PATCH 25/35] remove unused `std::vector tensors;` --- src/llama-quant.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index c9b7e5c7ce..763c6e1baf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -790,13 +790,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: weights.push_back(&it.second); } - // make a list of tensors (same pointers as from weights) - std::vector tensors; - tensors.reserve(weights.size()); - for (size_t i = 0; i < weights.size(); ++i) { - tensors.push_back(weights[i]->tensor); - } - if (!prune_list.empty()) { gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id); } @@ -885,7 +878,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: for (size_t i = 0; i < ctx_outs.size(); ++i) { gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i); gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split); - gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size()); + gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)weights.size()); } } From fd3787ee05a78d67407d30f2fcf979b147b5bb4c Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:24:47 -0600 Subject: [PATCH 26/35] typo --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 763c6e1baf..dd6c978b94 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -971,7 +971,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // if (params->dry_run) { - // the --dry-run option calculates the final quantization size without quantizting + // the --dry-run option calculates the final quantization size without quantizing if (do_quantize) { new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); LLAMA_LOG_INFO("size = %8.3f MiB -> %8.3f MiB (%s)\n", From 053a28980b5185fdc57a66a2c9714114c39e95f1 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 18:31:59 -0600 Subject: [PATCH 27/35] don't double-count `qs` --- src/llama-quant.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index dd6c978b94..e35b4573f3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -553,11 +553,12 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param return quantize; } -static ggml_type get_tensor_target_type( +static ggml_type tensor_get_target_type( quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, - ggml_type default_type + ggml_type default_type, + bool update_stats // should we update qs or no? 
) { ggml_type new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. @@ -597,7 +598,9 @@ static ggml_type get_tensor_target_type( LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); convert_incompatible_tensor = true; } else { - ++qs.n_k_quantized; + if (update_stats) { + ++qs.n_k_quantized; + } } if (convert_incompatible_tensor) { @@ -623,7 +626,9 @@ static ggml_type get_tensor_target_type( new_type = GGML_TYPE_F16; } LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); - ++qs.n_fallback; + if (update_stats) { + ++qs.n_fallback; + } } } } @@ -851,7 +856,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // TODO: we could save this per-tensor and correlate it with the vector of tensors so we // don't have to call this function again later (currently twice per tensor) - ggml_type target_type = get_tensor_target_type(qs, params, tensor, default_type); + ggml_type target_type = tensor_get_target_type(qs, params, tensor, default_type, false); if (!params->imatrix && tensor_allows_quantization(params, model.arch, tensor) && @@ -957,7 +962,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // if so, what will be the target type? if (do_quantize) { - new_type = get_tensor_target_type(qs, params, tensor, default_type); + new_type = tensor_get_target_type(qs, params, tensor, default_type, true); // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. do_quantize = tensor->type != new_type; From 97aefac773779c8c803cea30321c95827f7b0cc4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 12 Feb 2026 20:00:23 -0600 Subject: [PATCH 28/35] update_stats guard --- src/llama-quant.cpp | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index e35b4573f3..461d7689a8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -175,7 +175,7 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { +static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, bool update_stats) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -257,7 +257,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; - ++qs.i_attention_wv; + if (update_stats) { + ++qs.i_attention_wv; + } } else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; @@ -266,7 +268,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (qs.i_ffn_down < qs.n_ffn_down/8) { new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } - ++qs.i_ffn_down; + if (update_stats) { + ++qs.i_ffn_down; + } } else if (name.find("attn_output.weight") != std::string::npos) { if (qs.model.hparams.n_expert == 8) { @@ -313,7 +317,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } - ++qs.i_attention_wv; + if (update_stats) { + ++qs.i_attention_wv; + } } else if (name.find("attn_k.weight") != std::string::npos) { if (qs.model.hparams.n_expert == 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB @@ -377,7 +383,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; } - ++qs.i_ffn_down; + if (update_stats) { + ++qs.i_ffn_down; + } } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { @@ -411,7 +419,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } - ++qs.i_ffn_gate; + if (update_stats) { + ++qs.i_ffn_gate; + } } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); @@ -419,7 +429,9 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } - ++qs.i_ffn_up; + if (update_stats) { + ++qs.i_ffn_up; + } } return new_type; @@ -583,7 +595,7 @@ static ggml_type tensor_get_target_type( // if not manual - use the standard logic for choosing the quantization type based on the selected mixture if (!manual) { - new_type = llama_tensor_get_type(qs, new_type, tensor, params->ftype); + new_type = llama_tensor_get_type(qs, new_type, tensor, params->ftype, update_stats); } // incompatible tensor shapes are handled here - fallback to a compatible type @@ -625,8 +637,8 @@ static ggml_type tensor_get_target_type( if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { new_type = GGML_TYPE_F16; } - LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); if (update_stats) { + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); ++qs.n_fallback; } } From bddc67547fdf2fe13fcb6c2db855e5466d9f3c64 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 13 Feb 2026 21:13:53 -0600 Subject: [PATCH 29/35] correct function names --- src/llama-quant.cpp | 245 ++++++++++++++++++++++---------------------- 1 file changed, 122 insertions(+), 123 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 461d7689a8..47ece2d666 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -175,7 +175,7 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, bool update_stats) { +static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, bool update_stats) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -437,6 +437,95 @@ static 
ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t return new_type; } +// determine the ggml_type that this tensor should be quantized to +static ggml_type llama_tensor_get_type( + quantize_state_impl & qs, + const llama_model_quantize_params * params, + const ggml_tensor * tensor, + ggml_type default_type, + bool update_stats // we only update qs if this flag is true +) { + ggml_type new_type = default_type; + // get more optimal quantization type based on the tensor shape, layer, etc. + if (!params->pure && ggml_is_quantized(default_type)) { + + // if the user provided tensor types - use those + bool manual = false; + if (params->tensor_types) { + const std::vector & tensor_types = *static_cast *>(params->tensor_types); + const std::string tensor_name(tensor->name); + for (const auto & [tname, qtype] : tensor_types) { + if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { + if (qtype != new_type) { + LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); + new_type = qtype; // if two or more types are specified for the same tensor, the last match wins + manual = true; + break; + } + } + } + } + + // if not manual - use the standard logic for choosing the quantization type based on the selected mixture + if (!manual) { + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, update_stats); + } + + // incompatible tensor shapes are handled here - fallback to a compatible type + { + bool convert_incompatible_tensor = false; + + const int64_t nx = tensor->ne[0]; + const int64_t ny = tensor->ne[1]; + const int64_t qk_k = ggml_blck_size(new_type); + + if (nx % qk_k != 0) { + LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); + convert_incompatible_tensor = true; + } else { + if (update_stats) { + ++qs.n_k_quantized; + } + } + + if (convert_incompatible_tensor) { + switch (new_type) { + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; + case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; + case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; + case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; + default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); + } + if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { + new_type = GGML_TYPE_F16; + } + if (update_stats) { + LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); + ++qs.n_fallback; + } + } + } + } + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } + return new_type; +} + static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread @@ -508,149 +597,61 @@ static bool 
tensor_allows_quantization(const llama_model_quantize_params * param const std::string name = tensor->name; // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + bool allowed = name.rfind("weight") == name.size() - 6; // ends with 'weight'? // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); + allowed &= (ggml_n_dims(tensor) >= 2); // do not quantize norm tensors - quantize &= name.find("_norm.weight") == std::string::npos; + allowed &= name.find("_norm.weight") == std::string::npos; - quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= !params->only_copy; + allowed &= params->quantize_output_tensor || name != "output.weight"; + allowed &= !params->only_copy; // do not quantize expert gating tensors // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + allowed &= name.find("ffn_gate_inp.weight") == std::string::npos; // these are very small (e.g. 4x4) - quantize &= name.find("altup") == std::string::npos; - quantize &= name.find("laurel") == std::string::npos; + allowed &= name.find("altup") == std::string::npos; + allowed &= name.find("laurel") == std::string::npos; // these are not too big so keep them as it is - quantize &= name.find("per_layer_model_proj") == std::string::npos; + allowed &= name.find("per_layer_model_proj") == std::string::npos; // do not quantize positional embeddings and token types (BERT) - quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight"); - quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); + allowed &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight"); + allowed &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); // do not quantize Mamba /Kimi's small conv1d weights // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d") == std::string::npos; - quantize &= name.find("shortconv.conv.weight") == std::string::npos; + allowed &= name.find("ssm_conv1d") == std::string::npos; + allowed &= name.find("shortconv.conv.weight") == std::string::npos; // do not quantize RWKV's small yet 2D weights - quantize &= name.find("time_mix_first.weight") == std::string::npos; - quantize &= name.find("time_mix_w0.weight") == std::string::npos; - quantize &= name.find("time_mix_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_v0.weight") == std::string::npos; - quantize &= name.find("time_mix_v1.weight") == std::string::npos; - quantize &= name.find("time_mix_v2.weight") == std::string::npos; - quantize &= name.find("time_mix_a0.weight") == std::string::npos; - quantize &= name.find("time_mix_a1.weight") == std::string::npos; - quantize &= name.find("time_mix_a2.weight") == std::string::npos; - quantize &= name.find("time_mix_g1.weight") == std::string::npos; - quantize &= name.find("time_mix_g2.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + allowed &= name.find("time_mix_first.weight") == std::string::npos; + allowed &= name.find("time_mix_w0.weight") == std::string::npos; + allowed &= name.find("time_mix_w1.weight") == std::string::npos; + allowed &= 
name.find("time_mix_w2.weight") == std::string::npos; + allowed &= name.find("time_mix_v0.weight") == std::string::npos; + allowed &= name.find("time_mix_v1.weight") == std::string::npos; + allowed &= name.find("time_mix_v2.weight") == std::string::npos; + allowed &= name.find("time_mix_a0.weight") == std::string::npos; + allowed &= name.find("time_mix_a1.weight") == std::string::npos; + allowed &= name.find("time_mix_a2.weight") == std::string::npos; + allowed &= name.find("time_mix_g1.weight") == std::string::npos; + allowed &= name.find("time_mix_g2.weight") == std::string::npos; + allowed &= name.find("time_mix_decay_w1.weight") == std::string::npos; + allowed &= name.find("time_mix_decay_w2.weight") == std::string::npos; + allowed &= name.find("time_mix_lerp_fused.weight") == std::string::npos; // do not quantize relative position bias (T5) - quantize &= name.find("attn_rel_b.weight") == std::string::npos; + allowed &= name.find("attn_rel_b.weight") == std::string::npos; // do not quantize specific multimodal tensors - quantize &= name.find(".position_embd.") == std::string::npos; + allowed &= name.find(".position_embd.") == std::string::npos; - return quantize; -} - -static ggml_type tensor_get_target_type( - quantize_state_impl & qs, - const llama_model_quantize_params * params, - const ggml_tensor * tensor, - ggml_type default_type, - bool update_stats // should we update qs or no? -) { - ggml_type new_type = default_type; - // get more optimal quantization type based on the tensor shape, layer, etc. - if (!params->pure && ggml_is_quantized(default_type)) { - - // if the user provided tensor types - use those - bool manual = false; - if (params->tensor_types) { - const std::vector & tensor_types = *static_cast *>(params->tensor_types); - const std::string tensor_name(tensor->name); - for (const auto & [tname, qtype] : tensor_types) { - if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { - if (qtype != new_type) { - LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); - new_type = qtype; // if two or more types are specified for the same tensor, the last match wins - manual = true; - break; - } - } - } - } - - // if not manual - use the standard logic for choosing the quantization type based on the selected mixture - if (!manual) { - new_type = llama_tensor_get_type(qs, new_type, tensor, params->ftype, update_stats); - } - - // incompatible tensor shapes are handled here - fallback to a compatible type - { - bool convert_incompatible_tensor = false; - - const int64_t nx = tensor->ne[0]; - const int64_t ny = tensor->ne[1]; - const int64_t qk_k = ggml_blck_size(new_type); - - if (nx % qk_k != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); - convert_incompatible_tensor = true; - } else { - if (update_stats) { - ++qs.n_k_quantized; - } - } - - if (convert_incompatible_tensor) { - switch (new_type) { - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; - case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; - case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; 
break; - case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; - default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); - } - if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { - new_type = GGML_TYPE_F16; - } - if (update_stats) { - LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); - ++qs.n_fallback; - } - } - } - } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } - if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; - } - return new_type; + return allowed; } static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { @@ -866,9 +867,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } gguf_add_tensor(ctx_outs[i_split].get(), tensor); - // TODO: we could save this per-tensor and correlate it with the vector of tensors so we - // don't have to call this function again later (currently twice per tensor) - ggml_type target_type = tensor_get_target_type(qs, params, tensor, default_type, false); + ggml_type target_type = llama_tensor_get_type(qs, params, tensor, default_type, false); if (!params->imatrix && tensor_allows_quantization(params, model.arch, tensor) && From 7b127e126aa6a51340695849dc9343fc053fc9a4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 13 Feb 2026 21:17:53 -0600 Subject: [PATCH 30/35] correct function names --- src/llama-quant.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 47ece2d666..4530cb1079 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -437,13 +437,15 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type return new_type; } -// determine the ggml_type that this tensor should be quantized to +// determine the ggml_type that this tensor should be quantized to. +// +// `qs` statistics will only be updated if the `update_stats` parameter is true. static ggml_type llama_tensor_get_type( quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, - bool update_stats // we only update qs if this flag is true + bool update_stats ) { ggml_type new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. @@ -973,7 +975,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // if so, what will be the target type? if (do_quantize) { - new_type = tensor_get_target_type(qs, params, tensor, default_type, true); + new_type = llama_tensor_get_type(qs, params, tensor, default_type, true); // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. 
do_quantize = tensor->type != new_type; From aaf010edeb207fa5eed03b46f89a562faa90521e Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 16 Feb 2026 12:20:16 -0600 Subject: [PATCH 31/35] new function `llama_tensor_update_stats` --- src/llama-quant.cpp | 64 ++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4530cb1079..9e7c58b167 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -175,7 +175,14 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, bool update_stats) { +// internal logic for selecting the target tensor type for a given quantization +// and model arch +static ggml_type llama_tensor_get_type_impl( + quantize_state_impl & qs, + ggml_type new_type, + const ggml_tensor * tensor, + const llama_ftype ftype +) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -257,9 +264,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; - if (update_stats) { - ++qs.i_attention_wv; - } } else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; @@ -268,9 +272,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type if (qs.i_ffn_down < qs.n_ffn_down/8) { new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } - if (update_stats) { - ++qs.i_ffn_down; - } } else if (name.find("attn_output.weight") != std::string::npos) { if (qs.model.hparams.n_expert == 8) { @@ -317,9 +318,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } - if (update_stats) { - ++qs.i_attention_wv; - } } else if (name.find("attn_k.weight") != std::string::npos) { if (qs.model.hparams.n_expert == 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB @@ -383,9 +381,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? 
GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; } - if (update_stats) { - ++qs.i_ffn_down; - } } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { @@ -419,9 +414,6 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } - if (update_stats) { - ++qs.i_ffn_gate; - } } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); @@ -429,23 +421,17 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } - if (update_stats) { - ++qs.i_ffn_up; - } } return new_type; } -// determine the ggml_type that this tensor should be quantized to. -// -// `qs` statistics will only be updated if the `update_stats` parameter is true. +// determine the ggml_type that this tensor should be quantized to static ggml_type llama_tensor_get_type( quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, - ggml_type default_type, - bool update_stats + const ggml_type default_type ) { ggml_type new_type = default_type; // get more optimal quantization type based on the tensor shape, layer, etc. @@ -470,7 +456,7 @@ static ggml_type llama_tensor_get_type( // if not manual - use the standard logic for choosing the quantization type based on the selected mixture if (!manual) { - new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, update_stats); + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype); } // incompatible tensor shapes are handled here - fallback to a compatible type @@ -484,10 +470,6 @@ static ggml_type llama_tensor_get_type( if (nx % qk_k != 0) { LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); convert_incompatible_tensor = true; - } else { - if (update_stats) { - ++qs.n_k_quantized; - } } if (convert_incompatible_tensor) { @@ -512,10 +494,6 @@ static ggml_type llama_tensor_get_type( if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { new_type = GGML_TYPE_F16; } - if (update_stats) { - LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); - ++qs.n_fallback; - } } } } @@ -528,6 +506,20 @@ static ggml_type llama_tensor_get_type( return new_type; } +// update internal quantization state statistics based on the tensor name +static void llama_tensor_update_stats(quantize_state_impl & qs, const std::string & name) { + if (name.find("attn_v.weight") != std::string::npos || + name.find("attn_kv_b.weight") != std::string::npos) { + ++qs.i_attention_wv; + } else if (name.find("ffn_down") != std::string::npos) { + ++qs.i_ffn_down; + } else if (name.find("ffn_gate") != std::string::npos) { + ++qs.i_ffn_gate; + } else if (name.find("ffn_up") != std::string::npos) { + ++qs.i_ffn_up; + } +} + static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread @@ -869,7 +861,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } 
gguf_add_tensor(ctx_outs[i_split].get(), tensor); - ggml_type target_type = llama_tensor_get_type(qs, params, tensor, default_type, false); + ggml_type target_type = llama_tensor_get_type(qs, params, tensor, default_type); if (!params->imatrix && tensor_allows_quantization(params, model.arch, tensor) && @@ -975,12 +967,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // if so, what will be the target type? if (do_quantize) { - new_type = llama_tensor_get_type(qs, params, tensor, default_type, true); + new_type = llama_tensor_get_type(qs, params, tensor, default_type); // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. do_quantize = tensor->type != new_type; } + llama_tensor_update_stats(qs, name); + void * new_data; size_t new_size; From 521a13e6c68b41120e985ad7645f03650ad9bf40 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 16 Feb 2026 12:34:51 -0600 Subject: [PATCH 32/35] correct fallback logic --- src/llama-quant.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 9e7c58b167..5bce2bf221 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -175,8 +175,8 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -// internal logic for selecting the target tensor type for a given quantization -// and model arch +// internal standard logic for selecting the target tensor type for a specific +// quantization mixture & model architecture static ggml_type llama_tensor_get_type_impl( quantize_state_impl & qs, ggml_type new_type, @@ -422,7 +422,6 @@ static ggml_type llama_tensor_get_type_impl( new_type = GGML_TYPE_IQ3_XXS; } } - return new_type; } @@ -468,7 +467,7 @@ static ggml_type llama_tensor_get_type( const int64_t qk_k = ggml_blck_size(new_type); if (nx % qk_k != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); + LLAMA_LOG_WARN("\n%s: tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); convert_incompatible_tensor = true; } @@ -489,7 +488,7 @@ static ggml_type llama_tensor_get_type( case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; - default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); + default: throw std::runtime_error("unsupported tensor size"); } if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { new_type = GGML_TYPE_F16; From 3c1f94a49d331d96ce7f2469fb901eebc10803bf Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 16 Feb 2026 13:13:44 -0600 Subject: [PATCH 33/35] correct fallback logic --- src/llama-quant.cpp | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5bce2bf221..afec667dc1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -89,7 +89,6 @@ struct quantize_state_impl { int i_ffn_gate = 0; int i_ffn_up = 0; - int n_k_quantized = 0; int n_fallback = 0; bool has_imatrix = false; @@ -493,6 +492,7 @@ static ggml_type llama_tensor_get_type( if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { new_type = GGML_TYPE_F16; } + ++qs.n_fallback; } } } @@ -505,20 +505,6 @@ static ggml_type llama_tensor_get_type( return new_type; } -// update internal 
quantization state statistics based on the tensor name -static void llama_tensor_update_stats(quantize_state_impl & qs, const std::string & name) { - if (name.find("attn_v.weight") != std::string::npos || - name.find("attn_kv_b.weight") != std::string::npos) { - ++qs.i_attention_wv; - } else if (name.find("ffn_down") != std::string::npos) { - ++qs.i_ffn_down; - } else if (name.find("ffn_gate") != std::string::npos) { - ++qs.i_ffn_gate; - } else if (name.find("ffn_up") != std::string::npos) { - ++qs.i_ffn_up; - } -} - static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread @@ -867,7 +853,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: tensor_requires_imatrix(tensor, target_type) ) { if (params->dry_run) { - will_require_imatrix = true; // set flag for warning later, but continue with dry run + // set flag for warning later, but continue with dry run + will_require_imatrix = true; } else { LLAMA_LOG_ERROR("\n============================================================================\n" " ERROR: this quantization requires an importance matrix!\n" @@ -880,6 +867,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } + qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; // Set split info if needed @@ -964,15 +952,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ggml_type new_type = default_type; - // if so, what will be the target type? + // if so, what will be the new type? if (do_quantize) { new_type = llama_tensor_get_type(qs, params, tensor, default_type); // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. 
do_quantize = tensor->type != new_type; - } - llama_tensor_update_stats(qs, name); + // count stats for this tensor based on its name + if (name.find("attn_v.weight") != std::string::npos || + name.find("attn_kv_b.weight") != std::string::npos) { + ++qs.i_attention_wv; + } else if (name.find("ffn_down") != std::string::npos) { + ++qs.i_ffn_down; + } else if (name.find("ffn_gate") != std::string::npos) { + ++qs.i_ffn_gate; + } else if (name.find("ffn_up") != std::string::npos) { + ++qs.i_ffn_up; + } + } void * new_data; size_t new_size; @@ -1131,8 +1129,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } if (qs.n_fallback > 0) { - LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", - __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); + LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n", + __func__, qs.n_fallback); } } @@ -1143,7 +1141,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: llama_model_quantize_params llama_model_quantize_default_params() { llama_model_quantize_params result = { /*.nthread =*/ 0, - /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, + /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q8_0, /*.output_tensor_type =*/ GGML_TYPE_COUNT, /*.token_embedding_type =*/ GGML_TYPE_COUNT, /*.allow_requantize =*/ false, From a26db356c9ce52ca8b770de47613cfaa3344bf99 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 16 Feb 2026 13:43:22 -0600 Subject: [PATCH 34/35] refactor --- src/llama-quant.cpp | 164 +++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 79 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index afec667dc1..5cc538caf0 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include // Quantization types. 
Changes to this struct must be replicated in quantize.cpp @@ -19,6 +20,35 @@ struct tensor_quantization { ggml_type quant = GGML_TYPE_COUNT; }; +struct quantize_state_impl { + const llama_model & model; + const llama_model_quantize_params * params; + + int n_attention_wv = 0; + int n_ffn_down = 0; + int n_ffn_gate = 0; + int n_ffn_up = 0; + int i_attention_wv = 0; + int i_ffn_down = 0; + int i_ffn_gate = 0; + int i_ffn_up = 0; + + int n_fallback = 0; + + bool has_imatrix = false; + + // used to figure out if a model shares tok_embd with the output weight + bool has_output = false; + + // if this flag is false, the code will skip updating this struct + bool do_count = false; + + quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params) + : model(model) + , params(params) + {} +}; + static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -76,32 +106,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map< return orig_name; } -struct quantize_state_impl { - const llama_model & model; - const llama_model_quantize_params * params; - - int n_attention_wv = 0; - int n_ffn_down = 0; - int n_ffn_gate = 0; - int n_ffn_up = 0; - int i_attention_wv = 0; - int i_ffn_down = 0; - int i_ffn_gate = 0; - int i_ffn_up = 0; - - int n_fallback = 0; - - bool has_imatrix = false; - - // used to figure out if a model shares tok_embd with the output weight - bool has_output = false; - - quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params) - : model(model) - , params(params) - {} -}; - static void llama_tensor_dequantize_impl( ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -177,7 +181,7 @@ static void llama_tensor_dequantize_impl( // internal standard logic for selecting the target tensor type for a specific // quantization mixture & model architecture static ggml_type llama_tensor_get_type_impl( - quantize_state_impl & qs, + quantize_state_impl * qs, ggml_type new_type, const ggml_tensor * tensor, const llama_ftype ftype @@ -185,13 +189,13 @@ static ggml_type llama_tensor_get_type_impl( const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants - const llm_arch arch = qs.model.arch; + const llm_arch arch = qs->model.arch; const auto tn = LLM_TN(arch); auto use_more_bits = [](int i_layer, int n_layers) -> bool { return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; }; - const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); + const int n_expert = std::max(1, (int)qs->model.hparams.n_expert); auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { if (n_expert > 1) { // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly @@ -210,9 +214,9 @@ static ggml_type llama_tensor_get_type_impl( // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings // with the quantization of the output tensor - if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { - if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { - new_type = qs.params->output_tensor_type; + if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs->has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { + if (qs->params->output_tensor_type < GGML_TYPE_COUNT) { + 
new_type = qs->params->output_tensor_type; } else { const int64_t nx = tensor->ne[0]; const int64_t qk_k = ggml_blck_size(new_type); @@ -241,8 +245,8 @@ static ggml_type llama_tensor_get_type_impl( new_type = GGML_TYPE_Q8_0; } } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") { - if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { - new_type = qs.params->token_embedding_type; + if (qs->params->token_embedding_type < GGML_TYPE_COUNT) { + new_type = qs->params->token_embedding_type; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { @@ -261,19 +265,19 @@ static ggml_type llama_tensor_get_type_impl( } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (name.find("attn_v.weight") != std::string::npos) { - if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; + if (qs->model.hparams.n_gqa() >= 4 || qs->model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } - else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { + else if (qs->model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; } else if (name.find("ffn_down") != std::string::npos) { - if (qs.i_ffn_down < qs.n_ffn_down/8) { + if (qs->i_ffn_down < qs->n_ffn_down/8) { new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } } else if (name.find("attn_output.weight") != std::string::npos) { - if (qs.model.hparams.n_expert == 8) { + if (qs->model.hparams.n_expert == 8) { new_type = GGML_TYPE_Q5_K; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; @@ -282,43 +286,43 @@ static ggml_type llama_tensor_get_type_impl( } } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { - new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs->model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs->has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) { + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs->model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + new_type = qs->i_attention_wv < 2 ? 
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs->model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        if (qs.model.type == LLM_TYPE_70B) {
+                use_more_bits(qs->i_attention_wv, qs->n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs->i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        if (qs->model.type == LLM_TYPE_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
             // nearly negligible increase in model size by quantizing this tensor with more bits:
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
         }
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs->model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
    } else if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert == 8) {
+        if (qs->model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
@@ -337,13 +341,13 @@ static ggml_type llama_tensor_get_type_impl(
             new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+        auto info = layer_info(qs->i_ffn_down, qs->n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs->has_imatrix) {
             new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -352,7 +356,7 @@ static ggml_type llama_tensor_get_type_impl(
                      : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
-                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+                    (qs->model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -366,7 +370,7 @@ static ggml_type llama_tensor_get_type_impl(
                 if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs->has_imatrix) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
@@ -374,7 +378,7 @@ static ggml_type llama_tensor_get_type_impl(
             new_type = GGML_TYPE_Q5_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
-                && qs.has_imatrix && i_layer < n_layer/8) {
+                && qs->has_imatrix && i_layer < n_layer/8) {
             // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
             // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
@@ -382,7 +386,7 @@ static ggml_type llama_tensor_get_type_impl(
         }
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
-            if (qs.model.hparams.n_expert == 8) {
+            if (qs->model.hparams.n_expert == 8) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
@@ -408,14 +412,14 @@ static ggml_type llama_tensor_get_type_impl(
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     } else if (name.find("ffn_gate") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
+        auto info = layer_info(qs->i_ffn_gate, qs->n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
     } else if (name.find("ffn_up") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
+        auto info = layer_info(qs->i_ffn_up, qs->n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
@@ -426,7 +430,7 @@ static ggml_type llama_tensor_get_type_impl(
 
 // determine the ggml_type that this tensor should be quantized to
 static ggml_type llama_tensor_get_type(
-    quantize_state_impl & qs,
+    quantize_state_impl * qs,
     const llama_model_quantize_params * params,
     const ggml_tensor * tensor,
     const ggml_type default_type
@@ -492,7 +496,9 @@ static ggml_type llama_tensor_get_type(
             if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
                 new_type = GGML_TYPE_F16;
             }
-            ++qs.n_fallback;
+            if (qs->do_count) {
+                ++qs->n_fallback;
+            }
         }
     }
 }
@@ -708,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);
 
-    quantize_state_impl qs(model, params);
+    auto qs = std::make_unique<quantize_state_impl>(model, params);
 
     if (params->only_copy) {
         ftype = ml.ftype;
@@ -718,7 +724,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
-            qs.has_imatrix = true;
+            qs->has_imatrix = true;
             // check imatrix for nans or infs
             for (const auto & kv : *imatrix_data) {
                 for (float f : kv.second) {
@@ -830,15 +836,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         const ggml_tensor * tensor = it->tensor;
         const std::string name = tensor->name;
 
-        // TODO: avoid hardcoded tensor names - use the TN_* constants
-        if (name.find("attn_v.weight") != std::string::npos ||
-            name.find("attn_qkv.weight") != std::string::npos ||
-            name.find("attn_kv_b.weight")!= std::string::npos) {
-            ++qs.n_attention_wv;
-        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
-            qs.has_output = true;
-        }
-
         // populate the original tensors so we get an initial meta data
         uint16_t i_split = params->keep_split ? it->idx : 0;
         if (!ctx_outs[i_split]) {
@@ -867,9 +864,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
     }
 
-    qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
-    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
-
     // Set split info if needed
     if (n_split > 1) {
         for (size_t i = 0; i < ctx_outs.size(); ++i) {
@@ -918,6 +912,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
+    qs->n_ffn_down = qs->n_ffn_gate = qs->n_ffn_up = (int)model.hparams.n_layer;
+    qs->do_count = true; // we start counting stats for the main loop
+
     // iterate over all weights (main loop)
     for (const auto * it : weights) {
         const auto & weight = *it;
@@ -931,6 +928,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         const std::string name = ggml_get_name(tensor);
         const size_t tensor_size = ggml_nbytes(tensor);
 
+        // TODO: avoid hardcoded tensor names - use the TN_* constants
+        if (name.find("attn_v.weight") != std::string::npos ||
+            name.find("attn_qkv.weight") != std::string::npos ||
+            name.find("attn_kv_b.weight")!= std::string::npos) {
+            ++qs->n_attention_wv;
+        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+            qs->has_output = true;
+        }
+
         if (!params->dry_run) {
             if (!ml.use_mmap) {
                 if (read_data.size() < tensor_size) {
@@ -962,13 +968,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             // count stats for this tensor based on its name
             if (name.find("attn_v.weight") != std::string::npos || name.find("attn_kv_b.weight") != std::string::npos) {
-                ++qs.i_attention_wv;
+                ++qs->i_attention_wv;
             } else if (name.find("ffn_down") != std::string::npos) {
-                ++qs.i_ffn_down;
+                ++qs->i_ffn_down;
             } else if (name.find("ffn_gate") != std::string::npos) {
-                ++qs.i_ffn_gate;
+                ++qs->i_ffn_gate;
             } else if (name.find("ffn_up") != std::string::npos) {
-                ++qs.i_ffn_up;
+                ++qs->i_ffn_up;
             }
         }
 
@@ -1128,9 +1134,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         );
     }
 
-    if (qs.n_fallback > 0) {
+    if (qs->n_fallback > 0) {
         LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
-                __func__, qs.n_fallback);
+                __func__, qs->n_fallback);
     }
 }

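Note (illustrative aside, not part of any patch in this series): the quantize_state_impl::do_count flag introduced above, and used more heavily in the refactor that follows, lets the type-selection logic run more than once per tensor, once while only sizing the output (as --dry-run does) and once in the main quantization loop, without double-counting the per-tensor indices that the layer-dependent rules read. A minimal, self-contained C++ sketch of that pattern, with simplified stand-in names rather than the real llama.cpp API:

    // build: g++ -std=c++17 two_pass_sketch.cpp
    #include <cstdio>
    #include <string>
    #include <vector>

    struct counters {
        int  i_ffn_down = 0;     // stand-in for qs->i_ffn_down
        bool do_count   = false; // when false, select_type() must leave the counters untouched
    };

    // stand-in for llama_tensor_get_type_impl(): picks a type label and, only when
    // counting is enabled, advances the index used by the layer-dependent rule
    static std::string select_type(counters * qs, const std::string & name) {
        std::string type = "Q4_K";
        if (name.find("ffn_down") != std::string::npos) {
            if (qs->i_ffn_down < 2) {
                type = "Q6_K"; // hypothetical "give the first layers more bits" rule
            }
            if (qs->do_count) {
                ++qs->i_ffn_down;
            }
        }
        return type;
    }

    int main() {
        const std::vector<std::string> names = {
            "blk.0.ffn_down.weight", "blk.1.ffn_down.weight", "blk.2.ffn_down.weight",
        };
        counters qs;

        // pass 1: sizing / dry-run style pass; the counters must not advance here
        for (const auto & name : names) {
            select_type(&qs, name);
        }

        // pass 2: the "real" pass; counters advance so every tensor sees the right index
        qs.do_count = true;
        for (const auto & name : names) {
            std::printf("%s -> %s\n", name.c_str(), select_type(&qs, name).c_str());
        }
        return 0;
    }

If pass 1 were allowed to count, pass 2 would start at i_ffn_down = 3 and the layer-dependent rule would never fire; gating the increments on do_count is what keeps the two passes consistent.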

From ce0ad2986b4e044b24240059cc64437387989e65 Mon Sep 17 00:00:00 2001
From: ddh0
Date: Mon, 16 Feb 2026 13:59:13 -0600
Subject: [PATCH 35/35] refactor

---
 src/llama-quant.cpp | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5cc538caf0..93fef63bd2 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -40,7 +40,7 @@ struct quantize_state_impl {
     // used to figure out if a model shares tok_embd with the output weight
     bool has_output = false;
 
-    // if this flag is false, the code will skip updating this struct
+    // if this flag is false, the code will skip updating the per-tensor counters
     bool do_count = false;
 
     quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
@@ -181,10 +181,10 @@ static void llama_tensor_dequantize_impl(
 // internal standard logic for selecting the target tensor type for a specific
 // quantization mixture & model architecture
 static ggml_type llama_tensor_get_type_impl(
-    quantize_state_impl * qs,
-    ggml_type new_type,
-    const ggml_tensor * tensor,
-    const llama_ftype ftype
+    quantize_state_impl * qs,
+    ggml_type new_type,
+    const ggml_tensor * tensor,
+    llama_ftype ftype
 ) {
     const std::string name = ggml_get_name(tensor);
 
@@ -267,6 +267,9 @@ static ggml_type llama_tensor_get_type_impl(
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs->model.hparams.n_gqa() >= 4 || qs->model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            if (qs->do_count) {
+                ++qs->i_attention_wv;
+            }
         }
         else if (qs->model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
             new_type = GGML_TYPE_Q4_K;
@@ -275,6 +278,9 @@ static ggml_type llama_tensor_get_type_impl(
             if (qs->i_ffn_down < qs->n_ffn_down/8) {
                 new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             }
+            if (qs->do_count) {
+                ++qs->i_ffn_down;
+            }
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
             if (qs->model.hparams.n_expert == 8) {
@@ -321,6 +327,9 @@ static ggml_type llama_tensor_get_type_impl(
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
+        if (qs->do_count) {
+            ++qs->i_attention_wv;
+        }
     } else if (name.find("attn_k.weight") != std::string::npos) {
         if (qs->model.hparams.n_expert == 8) {
             // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
@@ -384,6 +393,9 @@ static ggml_type llama_tensor_get_type_impl(
             // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
             new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
+        if (qs->do_count) {
+            ++qs->i_ffn_down;
+        }
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
             if (qs->model.hparams.n_expert == 8) {
@@ -417,6 +429,9 @@ static ggml_type llama_tensor_get_type_impl(
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
+        if (qs->do_count) {
+            ++qs->i_ffn_gate;
+        }
     } else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs->i_ffn_up, qs->n_ffn_up, name.c_str());
@@ -424,6 +439,9 @@ static ggml_type llama_tensor_get_type_impl(
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
+        if (qs->do_count) {
+            ++qs->i_ffn_up;
+        }
     }
     return new_type;
 }
@@ -714,6 +732,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);
 
+    // quantize_state_impl qs(model, params);
     auto qs = std::make_unique<quantize_state_impl>(model, params);
 
     if (params->only_copy) {
@@ -843,7 +862,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
 
         gguf_add_tensor(ctx_outs[i_split].get(), tensor);
-        ggml_type target_type = llama_tensor_get_type(qs, params, tensor, default_type);
+        ggml_type target_type = llama_tensor_get_type(qs.get(), params, tensor, default_type);
 
         if (!params->imatrix &&
             tensor_allows_quantization(params, model.arch, tensor) &&
@@ -960,7 +979,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // if so, what will be the new type?
         if (do_quantize) {
-            new_type = llama_tensor_get_type(qs, params, tensor, default_type);
+            new_type = llama_tensor_get_type(qs.get(), params, tensor, default_type);
 
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
             do_quantize = tensor->type != new_type;