diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 0b23eaef3a..6d67448fda 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -259,9 +259,39 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; ++qs.i_attention_wv; } - else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k") != std::string::npos) { new_type = GGML_TYPE_Q4_K; } + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_q") != std::string::npos) { + new_type = GGML_TYPE_Q4_K; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down") != std::string::npos) { + if (qs.i_ffn_down < qs.n_ffn_down/16) { + new_type = GGML_TYPE_Q4_K; + } + else if (qs.i_ffn_down < qs.n_ffn_down/8) { + new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + } + ++qs.i_ffn_down; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate") != std::string::npos) { + if (qs.i_ffn_gate < qs.n_ffn_gate/16) { + new_type = GGML_TYPE_Q4_K; + } + else if (qs.i_ffn_gate < qs.n_ffn_gate/8 || qs.i_ffn_gate >= 7*qs.n_ffn_gate/8) { + new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + } + ++qs.i_ffn_gate; + } + else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up") != std::string::npos) { + if (qs.i_ffn_up < qs.n_ffn_up/16) { + new_type = GGML_TYPE_Q4_K; + } + else if (qs.i_ffn_up < qs.n_ffn_up/8) { + new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + } + ++qs.i_ffn_up; + } else if (name.find("ffn_down") != std::string::npos) { if (qs.i_ffn_down < qs.n_ffn_down/8) { new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; @@ -269,7 +299,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { - if (qs.model.hparams.n_expert == 8) { + if (qs.model.hparams.n_expert >= 8) { new_type = GGML_TYPE_Q5_K; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; @@ -380,7 +410,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { - if (qs.model.hparams.n_expert == 8) { + if (qs.model.hparams.n_expert >= 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || @@ -420,6 +450,38 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_IQ3_XXS; } ++qs.i_ffn_up; + } else if (name.find("attn_kv_a_mqa") != std::string::npos) { + if (qs.model.hparams.n_expert >= 8) { + new_type = GGML_TYPE_Q8_0; + } + } else if (name.find("attn_kv_b.weight") != std::string::npos) { + if (qs.model.hparams.n_expert >= 8) { + new_type = GGML_TYPE_Q4_K; + if (qs.i_attention_wv < qs.n_attention_wv/16) { + new_type = GGML_TYPE_Q8_0; + } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) { + new_type = GGML_TYPE_Q6_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + } + ++qs.i_attention_wv; + } else if (name.find("attn_q_b.weight") != std::string::npos) { + if (qs.model.hparams.n_expert >= 8) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + } + } else if (name.find("attn_q_a.weight") != std::string::npos) { + if (qs.model.hparams.n_expert >= 8) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + } } // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;