From 3856d60328349c5b2a4e381d6fdff20d272415ab Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Sat, 23 Aug 2025 14:45:07 +0100 Subject: [PATCH] Restrict quant types per family --- src/llama-quant.cpp | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 4ce651723f..7615376e31 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -628,11 +628,7 @@ static std::unordered_map target_bpw_type( constexpr ggml_type k_quants[] = { GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, - GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_Q8_0, @@ -646,19 +642,12 @@ static std::unordered_map target_bpw_type( constexpr ggml_type iq_quants[] = { GGML_TYPE_IQ1_S, - GGML_TYPE_IQ1_M, - GGML_TYPE_IQ2_XXS, - GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, - GGML_TYPE_IQ4_NL, - // TODO: add higher-precision fallbacks for IQ mixes to improve ppl if bpw budget allows it? - GGML_TYPE_Q5_0, - GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_0 }; auto tensor_bytes = [](const ggml_tensor * t, const ggml_type typ) -> size_t { @@ -888,8 +877,8 @@ static std::unordered_map target_bpw_type( } } - // abias_lambda djusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE - // larger value favours quantisation types that produce a smaller bias even if the MSE is slightly larger + // bias_lambda adjusts the trade-off between systematic bias (introduced by block‑wise scaling) and MSE + // larger value favours quantisation types that produce smaller bias even if the MSE is slightly larger constexpr float bias_lambda = 1.5f; constexpr double epsilon = 1e-12; double err_num = weighted_mse;