From d4ac2106fb5b9e1a98d6aef8a0931e73e46f324e Mon Sep 17 00:00:00 2001
From: Ed Addario
Date: Sun, 24 Aug 2025 13:39:10 +0100
Subject: [PATCH] Improve logging and some minor code refactoring

---
 src/llama-quant.cpp         | 26 +++++++++++++++-----------
 tools/quantize/quantize.cpp |  7 +------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 4ed9454068..407a63d887 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -132,7 +132,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map<
         for (const auto & p : mapped) {
             if (p.second == blk) {
-                LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
                 return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
             }
         }
@@ -1257,7 +1256,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     // Build the override map
     std::unordered_map<std::string, ggml_type> overrides;
-    LLAMA_LOG_INFO("%s: - estimated tensor quantization mix to achieve %.4f bpw at lowest ppl\n", __func__, target_bpw);
+    LLAMA_LOG_INFO("%s: - estimated tensor quantization mix:\n", __func__);
     for (const auto & ti : all) {
         LLAMA_LOG_INFO("\t%s: %45s - \t%8s, \t%1.4f bpw,\terror: %.4f\n", __func__, ggml_get_name(ti.w->tensor), ggml_type_name(ti.candidate[ti.choice].type), ti.candidate[ti.choice].bpw, ti.candidate[ti.choice].error);
@@ -1352,7 +1351,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     if (params->imatrix) {
         values_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
         if (values_data) {
-            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(values_data->size()));
+            LLAMA_LOG_INFO("================================ Have weights data with %d entries",int(values_data->size()));
             qs.has_imatrix = true;
             // check imatrix for nans or infs
             for (const auto & kv : *values_data) {
@@ -1367,7 +1366,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     if (params->activations) {
         activations_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->activations);
         if (activations_data) {
-            LLAMA_LOG_INFO("================================ Have activations data with %d entries\n",int(activations_data->size()));
+            LLAMA_LOG_INFO(" and %d activations",int(activations_data->size()));
             qs.has_activations = true;
             // check activations for nans or infs
             for (const auto & kv : *activations_data) {
@@ -1379,6 +1378,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             }
         }
     }
+    LLAMA_LOG_INFO("\n");
 
     gguf_context_ptr ctx_out { gguf_init_empty() };
 
@@ -1655,12 +1655,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (!params->pure && ggml_is_quantized(default_type)) {
             int fallback = qs.n_fallback;
             new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-            // get bpw override
-            const auto override = bpw_overrides.find(name);
-            if (override != bpw_overrides.end() && override->second != new_type) {
-                LLAMA_LOG_DEBUG("(bpw overriding %s) ", ggml_type_name(new_type));
-                new_type = override->second;
+
+            // get quantization type overrides targeting a given bits per weight budget
+            if (params->target_bpw != -1.0f && !bpw_overrides.empty()) {
+                const auto override = bpw_overrides.find(name);
+                if (override != bpw_overrides.end() && override->second != new_type) {
+                    LLAMA_LOG_DEBUG("(bpw override %s) ", ggml_type_name(new_type));
+                    new_type = override->second;
+                }
             }
+            // unless the user specifies a type, and the tensor shape will not require fallback quantisation
             if (params->tensor_types && qs.n_fallback - fallback == 0) {
                 const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
@@ -1668,7 +1672,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 for (const auto & [tname, qtype] : tensor_types) {
                     if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                         if (qtype != new_type) {
-                            LLAMA_LOG_DEBUG("(type overriding %s) ", ggml_type_name(new_type));
+                            LLAMA_LOG_DEBUG("(type override %s) ", ggml_type_name(new_type));
                             new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
                         }
                     }
@@ -1699,7 +1703,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             if (values_data) {
                 auto it = values_data->find(remap_imatrix(tensor->name, mapped));
                 if (it == values_data->end()) {
-                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
+                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s, ", __func__, tensor->name);
                 } else {
                     if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
                         imatrix = it->second.data();
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp
index b907008cb4..77fa6b90ce 100644
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -399,12 +399,7 @@ static int prepare_imatrix(const std::string & imatrix_file,
         values_data = std::move(tmp_values);
         activations_data = std::move(tmp_activations);
     }
-    if (!values_data.empty()) {
-        printf("%s: have %d importance matrix value entries\n", __func__, int(values_data.size()));
-    }
-    if (!activations_data.empty()) {
-        printf("%s: have %d importance matrix activation entries\n", __func__, int(activations_data.size()));
-    }
+
     return m_last_call;
 }
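
Reviewer note: the first hunk touches remap_imatrix, whose mechanics may not be obvious from the two lines visible in the diff. Below is a minimal, self-contained C++ sketch of that regex renumbering, reconstructed only from what the hunk shows; remap_block and the example mapping are hypothetical stand-ins, not the llama.cpp implementation.

    // Sketch of the renumbering visible in the remap_imatrix hunk (assumption:
    // the surrounding regex captures the block index from "blk.<N>."); this is
    // an illustration, not the actual function body.
    #include <cstdio>
    #include <map>
    #include <regex>
    #include <string>

    static std::string remap_block(const std::string & orig, const std::map<int, int> & mapped) {
        std::string name(orig);
        std::smatch match;
        if (std::regex_search(name, match, std::regex("blk\\.(\\d+)\\."))) {
            const int blk = std::stoi(match[1].str());
            for (const auto & p : mapped) {
                if (p.second == blk) {
                    // splice the remapped index over capture group 1 only,
                    // exactly as the hunk's new_name.replace(...) call does
                    return name.replace(match.position(1), match.length(1), std::to_string(p.first));
                }
            }
        }
        return name;
    }

    int main() {
        // {3, 7} means imatrix entries for blk.7 should be looked up as blk.3
        printf("%s\n", remap_block("blk.7.attn_q.weight", {{3, 7}}).c_str()); // blk.3.attn_q.weight
    }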
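Also for review context, a sketch of the override precedence the @@ -1655 hunk establishes: start from the heuristic choice, consult the target-bpw budget override only when params->target_bpw is active and the map is non-empty, then let explicit user tensor-type rules win, with the last matching rule taking precedence. QType, TensorOverride, and resolve_type are illustrative stand-ins for ggml_type, tensor_quantization, and the inline logic in llama_model_quantize_impl; this is a sketch under those assumptions, not the implementation.

    #include <cstdio>
    #include <regex>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Hypothetical stand-in for ggml_type, for illustration only.
    enum class QType { Q4_K, Q5_K, Q6_K };

    static const char * qtype_name(QType t) {
        switch (t) {
            case QType::Q4_K: return "q4_k";
            case QType::Q5_K: return "q5_k";
            case QType::Q6_K: return "q6_k";
        }
        return "?";
    }

    // Hypothetical stand-in for tensor_quantization: a regex plus a target type.
    struct TensorOverride {
        std::string pattern;
        QType       type;
    };

    // Mirrors the resolution order in the patched block: heuristic first, then
    // the budget-driven bpw override (gated on target_bpw being active), then
    // user rules, where the last matching rule wins.
    static QType resolve_type(const std::string & name, QType heuristic, float target_bpw,
                              const std::unordered_map<std::string, QType> & bpw_overrides,
                              const std::vector<TensorOverride> & user_overrides) {
        QType new_type = heuristic;
        if (target_bpw != -1.0f && !bpw_overrides.empty()) {
            const auto it = bpw_overrides.find(name);
            if (it != bpw_overrides.end() && it->second != new_type) {
                new_type = it->second; // bpw override
            }
        }
        for (const auto & o : user_overrides) {
            if (std::regex_search(name, std::regex(o.pattern)) && o.type != new_type) {
                new_type = o.type; // type override; last match wins
            }
        }
        return new_type;
    }

    int main() {
        const std::unordered_map<std::string, QType> bpw  = { { "blk.0.ffn_down.weight", QType::Q5_K } };
        const std::vector<TensorOverride>            user = { { "attn_v", QType::Q6_K } };

        // the bpw budget upgrades the heuristic q4_k to q5_k; no user rule matches
        printf("%s\n", qtype_name(resolve_type("blk.0.ffn_down.weight", QType::Q4_K, 5.5f, bpw, user))); // q5_k
        // a user rule matches this tensor and is applied after the bpw stage
        printf("%s\n", qtype_name(resolve_type("blk.0.attn_v.weight", QType::Q4_K, 5.5f, bpw, user)));   // q6_k
    }

The gate on params->target_bpw also shows why the patch wraps the lookup: the old code probed bpw_overrides unconditionally, even when no bpw target had been requested.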