From 7889d1b81b80b0b98d6458a1fb6b6dca0b61c01e Mon Sep 17 00:00:00 2001 From: Colin Kealty <3266127+bartowski1182@users.noreply.github.com> Date: Thu, 3 Apr 2025 19:45:46 -0400 Subject: [PATCH] A bit more weight to shared experts for larger sizes --- src/llama-quant.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index aa5f64e250..1b2e7632cf 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -443,6 +443,30 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } + } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_down_shexp.weight") != std::string::npos) { + new_type = GGML_TYPE_Q5_K; + if (use_more_bits(qs.i_ffn_down_shexp, qs.n_ffn_down_shexp)) { + new_type = GGML_TYPE_Q8_0; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + ++qs.i_ffn_down_shexp; + } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_gate_shexp.weight") != std::string::npos) { + new_type = GGML_TYPE_Q5_K; + if (use_more_bits(qs.i_ffn_gate_shexp, qs.n_ffn_gate_shexp)) { + new_type = GGML_TYPE_Q8_0; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + ++qs.i_ffn_gate_shexp; + } else if (qs.model.hparams.n_expert >= 8 && name.find("ffn_up_shexp.weight") != std::string::npos) { + new_type = GGML_TYPE_Q5_K; + if (use_more_bits(qs.i_ffn_up_shexp, qs.n_ffn_up_shexp)) { + new_type = GGML_TYPE_Q8_0; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q8_0; + ++qs.i_ffn_up_shexp; } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second;