From ee05d6bc0b250a7c19b9dedf504163509ef736f8 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Tue, 19 Aug 2025 22:32:53 +0100 Subject: [PATCH] Update comments --- src/llama-quant.cpp | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5af70c1c9b..546f6b438c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,13 +596,13 @@ static std::unordered_map target_bpw_type( ggml_type type; float bpw; size_t bytes; - float error; // lower is better + float error; }; struct tensor_info { const llama_model_loader::llama_tensor_weight * w; - std::vector candidate; // sorted by bpw ascending - int choice = -1; // index into cand + std::vector candidate; + int choice = -1; float min_bpw = 0.0; float max_bpw = 0.0; size_t n_elements = 0; @@ -610,7 +610,6 @@ static std::unordered_map target_bpw_type( auto name_tn = LLM_TN(model.arch); - // The candidate types we consider; adjust as needed const ggml_type base_candidates[] = { // Model's GGML_TYPE_IQ1_S, @@ -639,8 +638,6 @@ static std::unordered_map target_bpw_type( bool q = name.rfind("weight") == name.size() - 6; q &= (ggml_n_dims(t) >= 2); q &= name.find("_norm.weight") == std::string::npos; - //q &= name != name_tn(LLM_TENSOR_TOKEN_EMBD, "weight"); - //q &= name != name_tn(LLM_TENSOR_OUTPUT, "weight"); q &= name.find("ffn_gate_inp.weight") == std::string::npos; q &= name.find("altup") == std::string::npos; q &= name.find("laurel") == std::string::npos; @@ -719,7 +716,7 @@ static std::unordered_map target_bpw_type( const ggml_type_traits * traits = ggml_get_type_traits(typ); if (!traits || !traits->to_float) { - // cannot dequantize candidate -> assign very high error + // Cannot dequantize candidate -> assign very high error return 1e35f; } @@ -842,12 +839,10 @@ static std::unordered_map target_bpw_type( info.w = tw; info.n_elements = nelem; - // Candidate build with compatibility handling and availability checks + // Build per-tensor candidate list for (ggml_type ts_type : base_candidates) { - // Skip IQ* without imatrix if (is_iq(ts_type) && !values) { continue; } ggml_type tt = make_compatible(t, ts_type); - // After fallback, if still incompatible, skip if (!is_compatible(t, tt)) { continue; } // Compute bpw and bytes @@ -861,19 +856,18 @@ static std::unordered_map target_bpw_type( } if (info.candidate.empty()) { - // as a last resort, keep original type + // As a last resort, keep original type float bpw = ggml_nbytes(t) * 8.0f / nelem; info.candidate.push_back(candidate_types{t->type, bpw, ggml_nbytes(t), 0.0}); } - // Sort by bpw ascending std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types &a, const candidate_types &b) { if (a.bpw != b.bpw) { return a.bpw < b.bpw; } if (a.error != b.error) { return a.error < b.error; } return a.bytes < b.bytes; }); - // collapse candidates with identical storage size (bytes) + // Collapse candidates with identical storage size (bytes) { std::vector uniq; uniq.reserve(info.candidate.size()); @@ -903,7 +897,6 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } // Greedy allocation from minimum bpw upward to reach target_bpw - // Start with minimal bpw assignment auto current_total_bytes = [&]() -> size_t { size_t b = 0; for (const auto & ti : all) { @@ -938,11 +931,11 @@ static std::unordered_map target_bpw_type( } struct upgrade { - int idx; // tensor index - int next; // next candidate index (strictly larger bytes) - double err; // error reduction - size_t delta_bytes; // increase in bytes - double ratio; // err per added bit + int idx; + int next; + double err; + size_t delta_bytes; + double ratio; }; // Find next strictly-larger candidate index for a tensor @@ -998,6 +991,7 @@ static std::unordered_map target_bpw_type( } // We might still be below target but taking any single upgrade overshoots. + // Try to find the best upgrade that overshoots the target_bpw by the least and has the best error-to-size ratio. { double under_gap = (double)target_bpw - bpw_now;