Do not mix K and IQ quants

Ed Addario 2025-08-20 13:27:01 +01:00
parent 69586e212e
commit 29b2dc3ec0
1 changed file with 46 additions and 16 deletions
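In short: the target-bpw search previously drew from a single base_candidates array that mixed IQ and K quant types, so a mostly-K target could end up with IQ tensors (and vice versa). The diff below splits that array into k_candidates and iq_candidates, adds an is_iq() overload for llama_ftype, and picks exactly one family from the ftype the user requested. A minimal sketch of that dispatch, assuming the helpers and arrays from the diff are in scope; the pick_candidates wrapper is illustrative only, not part of the commit:

    // Hypothetical wrapper around the commit's selection logic
    static std::vector<ggml_type> pick_candidates(const llama_model_quantize_params * params) {
        if (is_iq(params->ftype)) {
            // user asked for an IQ file type -> search IQ candidates only
            return { std::begin(iq_candidates), std::end(iq_candidates) };
        }
        // otherwise stay within the K-quant family
        return { std::begin(k_candidates), std::end(k_candidates) };
    }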

@@ -36,6 +36,26 @@ static bool is_iq(const enum ggml_type t) {
     }
 }
+static bool is_iq(const enum llama_ftype t) {
+    switch (t) {
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:
+            return true;
+        default:
+            return false;
+    }
+}
 static enum ggml_type fallback_type(const enum ggml_type new_type) {
     switch (new_type) {
         case GGML_TYPE_TQ1_0:
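The new overload complements the existing is_iq(ggml_type): the ggml_type version classifies an individual tensor's quant type, while the llama_ftype version classifies the overall file type requested for the model. A hypothetical usage sketch, assuming llama.h and ggml.h are included:

    bool file_is_iq   = is_iq(LLAMA_FTYPE_MOSTLY_IQ3_M); // true: an IQ file type
    bool tensor_is_iq = is_iq(GGML_TYPE_Q4_K);           // false: a K-quant tensor type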
@@ -587,7 +607,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     const std::map<int, std::string> & mapped,
     const std::unordered_map<std::string, std::vector<float>> * values_data,
     const std::unordered_map<std::string, std::vector<float>> * activations_data,
-    float target_bpw,
+    const llama_model_quantize_params * params,
     int nthread,
     int sample_rows_per_expert = 128,
     float bias_lambda = 1.0
@@ -608,19 +628,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         size_t n_elements = 0;
     };
-    auto name_tn = LLM_TN(model.arch);
-    const ggml_type base_candidates[] = {
-        // Model's
-        GGML_TYPE_IQ1_S,
-        GGML_TYPE_IQ1_M,
-        GGML_TYPE_IQ2_XXS,
-        GGML_TYPE_IQ2_XS,
-        GGML_TYPE_IQ2_S,
-        GGML_TYPE_IQ3_XXS,
-        GGML_TYPE_IQ3_S,
-        GGML_TYPE_IQ4_XS,
-        GGML_TYPE_IQ4_NL,
+    const ggml_type k_candidates[] = {
         GGML_TYPE_Q2_K,
         GGML_TYPE_Q3_K,
         GGML_TYPE_Q4_0,
@@ -639,6 +647,21 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 #endif
     };
+    const ggml_type iq_candidates[] = {
+        GGML_TYPE_IQ1_S,
+        GGML_TYPE_IQ1_M,
+        GGML_TYPE_IQ2_XXS,
+        GGML_TYPE_IQ2_XS,
+        GGML_TYPE_IQ2_S,
+        GGML_TYPE_IQ3_XXS,
+        GGML_TYPE_IQ3_S,
+        GGML_TYPE_IQ4_XS,
+        GGML_TYPE_IQ4_NL,
+    };
+    auto name_tn = LLM_TN(model.arch);
+    float target_bpw = params->target_bpw;
     auto can_quantize = [&](const ggml_tensor * t) -> bool {
         const std::string name = ggml_get_name(t);
         bool q = name.rfind("weight") == name.size() - 6;
@@ -838,8 +861,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         info.w = tw;
         info.n_elements = nelem;
+        std::vector<ggml_type> quant_candidates;
+        if (is_iq(params->ftype)) {
+            quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates));
+        } else {
+            quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates));
+        }
+        // Build per-tensor candidate list
-        for (ggml_type ts_type : base_candidates) {
+        for (ggml_type ts_type : quant_candidates) {
             if (is_iq(ts_type) && !values) { continue; }
             ggml_type tt = make_compatible(t, ts_type);
             if (!is_compatible(t, tt)) { continue; }
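Worth noting in this hunk: the pre-existing values guard still applies inside the loop, so even when iq_candidates is selected, IQ types are only evaluated when importance-matrix data is available (the low-bit IQ formats generally depend on an imatrix). A condensed, comment-annotated view of the resulting per-tensor loop, assuming the surrounding locals (quant_candidates, values, t) from the diff:

    for (ggml_type ts_type : quant_candidates) {
        if (is_iq(ts_type) && !values) { continue; }  // no imatrix -> skip IQ candidates
        ggml_type tt = make_compatible(t, ts_type);   // adjust type to the tensor's shape
        if (!is_compatible(t, tt)) { continue; }      // still incompatible -> skip
        // ... evaluate candidate tt's size/error trade-off ...
    }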
@@ -1305,7 +1335,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     std::unordered_map<std::string, ggml_type> bpw_overrides = {};
     if (params->target_bpw != -1.0f) {
         LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw);
-        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params->target_bpw, nthread);
+        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
     }
     int cur_split = -1;