diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5522fe39d2..9dc903874f 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -36,6 +36,26 @@ static bool is_iq(const enum ggml_type t) {
     }
 }
 
+static bool is_iq(const enum llama_ftype t) {
+    switch (t) {
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS:
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:
+            return true;
+        default:
+            return false;
+    }
+}
+
 static enum ggml_type fallback_type(const enum ggml_type new_type) {
     switch (new_type) {
         case GGML_TYPE_TQ1_0:
@@ -587,7 +607,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
     const std::map & mapped,
     const std::unordered_map<std::string, std::vector<float>> * values_data,
     const std::unordered_map<std::string, std::vector<float>> * activations_data,
-    float target_bpw,
+    const llama_model_quantize_params * params,
     int nthread,
     int sample_rows_per_expert = 128,
     float bias_lambda = 1.0
@@ -608,19 +628,7 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
         size_t n_elements = 0;
     };
 
-    auto name_tn = LLM_TN(model.arch);
-
-    const ggml_type base_candidates[] = {
-        // Model's
-        GGML_TYPE_IQ1_S,
-        GGML_TYPE_IQ1_M,
-        GGML_TYPE_IQ2_XXS,
-        GGML_TYPE_IQ2_XS,
-        GGML_TYPE_IQ2_S,
-        GGML_TYPE_IQ3_XXS,
-        GGML_TYPE_IQ3_S,
-        GGML_TYPE_IQ4_XS,
-        GGML_TYPE_IQ4_NL,
+    const ggml_type k_candidates[] = {
         GGML_TYPE_Q2_K,
         GGML_TYPE_Q3_K,
         GGML_TYPE_Q4_0,
@@ -639,6 +647,21 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
 #endif
     };
 
+    const ggml_type iq_candidates[] = {
+        GGML_TYPE_IQ1_S,
+        GGML_TYPE_IQ1_M,
+        GGML_TYPE_IQ2_XXS,
+        GGML_TYPE_IQ2_XS,
+        GGML_TYPE_IQ2_S,
+        GGML_TYPE_IQ3_XXS,
+        GGML_TYPE_IQ3_S,
+        GGML_TYPE_IQ4_XS,
+        GGML_TYPE_IQ4_NL,
+    };
+
+    auto name_tn = LLM_TN(model.arch);
+    float target_bpw = params->target_bpw;
+
     auto can_quantize = [&](const ggml_tensor * t) -> bool {
         const std::string name = ggml_get_name(t);
         bool q = name.rfind("weight") == name.size() - 6;
@@ -838,8 +861,15 @@ static std::unordered_map<std::string, ggml_type> target_bpw_type(
             info.w = tw;
             info.n_elements = nelem;
 
+            std::vector<ggml_type> quant_candidates;
+            if (is_iq(params->ftype)) {
+                quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates));
+            } else {
+                quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates));
+            }
+
             // Build per-tensor candidate list
-            for (ggml_type ts_type : base_candidates) {
+            for (ggml_type ts_type : quant_candidates) {
                 if (is_iq(ts_type) && !values) { continue; }
                 ggml_type tt = make_compatible(t, ts_type);
                 if (!is_compatible(t, tt)) { continue; }
@@ -1305,7 +1335,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     std::unordered_map<std::string, ggml_type> bpw_overrides = {};
     if (params->target_bpw != -1.0f) {
         LLAMA_LOG_INFO("%s: computing tensor quantization mix to achieve %.3f bpw at lowest ppl - this operation may take some time\n", __func__, params->target_bpw);
-        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params->target_bpw, nthread);
+        bpw_overrides = target_bpw_type(ml, read_data, model, tensors, mapped, values_data, activations_data, params, nthread);
     }
 
     int cur_split = -1;
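
Note for reviewers: the behavioral change here is that target_bpw_type() now receives the full llama_model_quantize_params and uses the new is_iq(llama_ftype) overload to commit to exactly one candidate pool (IQ or K-quant), instead of searching the old mixed base_candidates list. Below is a minimal standalone sketch of that selection logic; the sketch_* enums are stand-ins invented for illustration, not the real llama.cpp symbols.

#include <cstdio>
#include <iterator>
#include <vector>

// Stand-in enums: NOT the real llama.cpp types, just enough to show the shape.
enum sketch_ftype { SKETCH_FTYPE_MOSTLY_IQ2_XS, SKETCH_FTYPE_MOSTLY_Q4_K_M };
enum sketch_gtype { SKETCH_TYPE_IQ2_XS, SKETCH_TYPE_IQ3_S, SKETCH_TYPE_Q2_K, SKETCH_TYPE_Q3_K };

// Mirrors the new is_iq(llama_ftype) overload: true only for IQ* file types.
static bool is_iq(const sketch_ftype t) {
    switch (t) {
        case SKETCH_FTYPE_MOSTLY_IQ2_XS: // the real overload lists every LLAMA_FTYPE_MOSTLY_IQ*
            return true;
        default:
            return false;
    }
}

int main() {
    const sketch_gtype iq_candidates[] = { SKETCH_TYPE_IQ2_XS, SKETCH_TYPE_IQ3_S };
    const sketch_gtype k_candidates[]  = { SKETCH_TYPE_Q2_K,   SKETCH_TYPE_Q3_K  };

    const sketch_ftype requested = SKETCH_FTYPE_MOSTLY_IQ2_XS;

    // Same shape as the hunk at new line 861: commit to one pool up front,
    // then build the per-tensor candidate list from it.
    std::vector<sketch_gtype> quant_candidates;
    if (is_iq(requested)) {
        quant_candidates.assign(std::begin(iq_candidates), std::end(iq_candidates));
    } else {
        quant_candidates.assign(std::begin(k_candidates), std::end(k_candidates));
    }

    std::printf("pool: %s, %zu candidates\n", is_iq(requested) ? "IQ" : "K-quant", quant_candidates.size());
    return 0;
}

Presumably the point of the split is to keep the bpw search inside the quant family the user asked for; the existing per-candidate guard (skipping IQ types when no imatrix values are present) is unchanged.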