diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index ebacf68806..ab6601a8bf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1146,8 +1146,7 @@ static std::unordered_map target_bpw_type( for (size_t i = 0; i < base_sz; ++i) { ggml_type ts_type = base_arr[i]; if (is_iq(ts_type) && !has_valid_imatrix) { - LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", - __func__, ggml_type_name(ts_type), name.c_str()); + LLAMA_LOG_WARN("%s: skipping %s quantization for %s, no or mismatched imatrix provided\n", __func__, ggml_type_name(ts_type), name.c_str()); continue; } @@ -1214,60 +1213,54 @@ static std::unordered_map target_bpw_type( info.candidate.push_back(candidate_types{ tensor->type, bpw, ggml_nbytes(tensor), 0.0 }); } - // Keep only the pareto‑optimal candidates: if A has >= bytes and >= error than B, drop A. + // Keep only the pareto‑optimal candidates and enforce convexity in (bytes, error) curve { - std::vector pruned; - pruned.reserve(info.candidate.size()); + auto & candidates = info.candidate; + if (!candidates.empty()) { + std::sort(candidates.begin(), candidates.end(), [](const candidate_types & a, const candidate_types & b) { + if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - // Sort by bytes ascending, error ascending - std::sort(info.candidate.begin(), info.candidate.end(), [](const candidate_types & a, const candidate_types & b) { - if (a.bytes != b.bytes) { return a.bytes < b.bytes; } - return a.error < b.error; - }); + return a.error < b.error; + }); - double best_err = infinity; - size_t last_bytes = std::numeric_limits::max(); - for (const auto & c : info.candidate) { - // Only keep the best error seen so far at strictly larger byte sizes - if (c.bytes != last_bytes) { - // first time we see this byte size - last_bytes = c.bytes; - if (c.error < best_err) { - pruned.push_back(c); - best_err = c.error; + std::vector pareto; + pareto.reserve(candidates.size()); + double best_err = infinity; + size_t last_bytes = std::numeric_limits::max(); + for (const auto & c : candidates) { + if (c.bytes != last_bytes) { + last_bytes = c.bytes; + if (c.error < best_err) { + best_err = c.error; + pareto.push_back(c); + } } - } else { - // same bytes: we already sorted by error; skip - } - } - - info.candidate.swap(pruned); - } - - // Enforce convexity in (bytes, error) curve - { - const auto & c = info.candidate; - if (c.size() >= 3) { - std::vector convex; - convex.reserve(c.size()); - auto slope = [](const candidate_types & a, const candidate_types & b) -> double { - const double dx = (double)b.bytes - (double)a.bytes; - if (dx <= 0.0) { return infinity; } - - return ((double)b.error - (double)a.error) / dx; - }; - - for (const auto & p : c) { - while (convex.size() >= 2) { - double s1 = slope(convex[convex.size() - 2], convex[convex.size() - 1]); - double s2 = slope(convex[convex.size() - 1], p); - if (s2 + epsilon < s1) { convex.pop_back(); } - else { break; } - } - convex.push_back(p); } - info.candidate.swap(convex); + candidates.swap(pareto); + + if (candidates.size() >= 3) { + std::vector hull; + hull.reserve(candidates.size()); + auto slope = [](const candidate_types & a, const candidate_types & b) { + const double dx = b.bytes - a.bytes; + + return dx <= 0.0 ? infinity : (b.error - a.error) / dx; + }; + + for (const auto & p : candidates) { + while (hull.size() >= 2) { + double s1 = slope(hull[hull.size() - 2], hull[hull.size() - 1]); + double s2 = slope(hull[hull.size() - 1], p); + if (s2 + epsilon < s1) { hull.pop_back(); } + else { break; } + } + + hull.push_back(p); + } + + candidates.swap(hull); + } } }