diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 316dd35fa8..699264553a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1219,6 +1219,18 @@ static std::unordered_map target_bpw_type( if (all.empty()) { return {}; } + // Compute total elements across all tensors and bytes for non-quantizable tensors + size_t nq_elements = 0; + size_t nq_bytes = 0; + for (const auto & it : ml.weights_map) { + const ggml_tensor * tensor = it.second.tensor; + const std::string name = it.first; + nq_elements += (size_t)ggml_nelements(tensor); + if (!is_quantizable(name, model.arch, params)) { + nq_bytes += ggml_nbytes(tensor); + } + } + auto total_bytes = [&]() -> size_t { size_t tb = 0; for (const auto & ti : all) { @@ -1228,19 +1240,20 @@ static std::unordered_map target_bpw_type( return tb; }; - size_t total_elems = 0; + size_t q_elements = 0; size_t min_bytes = 0; size_t max_bytes = 0; for (const auto & ti : all) { - total_elems += (size_t)ti.n_elements; + q_elements += (size_t)ti.n_elements; min_bytes += ti.candidate.front().bytes; // smallest candidate per tensor max_bytes += ti.candidate.back().bytes; // largest candidate per tensor } - if (total_elems == 0) { return {}; } + if (q_elements == 0) { return {}; } const double target_bpw = params->target_bpw; - size_t budget_bytes = std::llround(target_bpw * (double)total_elems / 8.0); // convert bpw to bytes + size_t target_total_bytes = std::llround(target_bpw * (double)nq_elements / 8.0); + size_t budget_bytes = target_total_bytes >= nq_bytes ? target_total_bytes - nq_bytes : min_bytes; auto emit_overrides = [&]() -> std::unordered_map { std::unordered_map overrides; @@ -1374,29 +1387,35 @@ static std::unordered_map target_bpw_type( int best_i = -1; int best_j = -1; double best_ratio = -1.0; - size_t best_delta = 0; + double best_gain = -1.0; + for (int i = 0; i < (int)all.size(); ++i) { const auto &ti = all[i]; int j = ti.choice + 1; - // skip same-bytes entries while (j < (int)ti.candidate.size() && ti.candidate[j].bytes == ti.candidate[ti.choice].bytes) { ++j; } if (j >= (int)ti.candidate.size()) { continue; } - size_t delta = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; - if (cur_bytes + delta > budget_bytes) { continue; } + size_t delta_bytes = ti.candidate[j].bytes - ti.candidate[ti.choice].bytes; + if (cur_bytes + delta_bytes > budget_bytes) { continue; } double err_gain = std::max(0.0, ti.candidate[ti.choice].error - ti.candidate[j].error); - double ratio = err_gain / (double)(delta * 8); // error reduction per bit - if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && delta < best_delta)) { + if (err_gain < epsilon) { continue; } // no real improvement + + double ratio = err_gain / (double)delta_bytes; // error reduction per byte + // For tie-breaking, prioritize the largest absolute error improvement. + if (ratio > best_ratio + epsilon || (std::abs(ratio - best_ratio) <= epsilon && err_gain > best_gain)) { best_ratio = ratio; - best_delta = delta; + best_gain = err_gain; best_i = i; best_j = j; } } - if (best_i < 0) { break; } + + if (best_i < 0) { break; } // no more upgrades within budget found + + size_t upgrade_cost = all[best_i].candidate[best_j].bytes - all[best_i].candidate[all[best_i].choice].bytes; all[best_i].choice = best_j; - cur_bytes += best_delta; + cur_bytes += upgrade_cost; } }