ddh0 2026-02-16 13:43:22 -06:00
parent 3c1f94a49d
commit a26db356c9
1 changed file with 85 additions and 79 deletions

@@ -11,6 +11,7 @@
#include <mutex>
#include <regex>
#include <thread>
#include <memory>
#include <unordered_map>
// Quantization types. Changes to this struct must be replicated in quantize.cpp
@@ -19,6 +20,35 @@ struct tensor_quantization {
ggml_type quant = GGML_TYPE_COUNT;
};
struct quantize_state_impl {
const llama_model & model;
const llama_model_quantize_params * params;
int n_attention_wv = 0;
int n_ffn_down = 0;
int n_ffn_gate = 0;
int n_ffn_up = 0;
int i_attention_wv = 0;
int i_ffn_down = 0;
int i_ffn_gate = 0;
int i_ffn_up = 0;
int n_fallback = 0;
bool has_imatrix = false;
// used to figure out if a model shares tok_embd with the output weight
bool has_output = false;
// if this flag is false, the code will skip updating this struct
bool do_count = false;
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
{}
};
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
@@ -76,32 +106,6 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map<
return orig_name;
}
struct quantize_state_impl {
const llama_model & model;
const llama_model_quantize_params * params;
int n_attention_wv = 0;
int n_ffn_down = 0;
int n_ffn_gate = 0;
int n_ffn_up = 0;
int i_attention_wv = 0;
int i_ffn_down = 0;
int i_ffn_gate = 0;
int i_ffn_up = 0;
int n_fallback = 0;
bool has_imatrix = false;
// used to figure out if a model shares tok_embd with the output weight
bool has_output = false;
quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
{}
};
static void llama_tensor_dequantize_impl(
ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
@@ -177,7 +181,7 @@ static void llama_tensor_dequantize_impl(
// internal standard logic for selecting the target tensor type for a specific
// quantization mixture & model architecture
static ggml_type llama_tensor_get_type_impl(
quantize_state_impl & qs,
quantize_state_impl * qs,
ggml_type new_type,
const ggml_tensor * tensor,
const llama_ftype ftype
@@ -185,13 +189,13 @@ static ggml_type llama_tensor_get_type_impl(
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
const llm_arch arch = qs.model.arch;
const llm_arch arch = qs->model.arch;
const auto tn = LLM_TN(arch);
auto use_more_bits = [](int i_layer, int n_layers) -> bool {
return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
};
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
const int n_expert = std::max(1, (int)qs->model.hparams.n_expert);
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
if (n_expert > 1) {
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
@@ -210,9 +214,9 @@ static ggml_type llama_tensor_get_type_impl(
// for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
// with the quantization of the output tensor
if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
new_type = qs.params->output_tensor_type;
if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs->has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
if (qs->params->output_tensor_type < GGML_TYPE_COUNT) {
new_type = qs->params->output_tensor_type;
} else {
const int64_t nx = tensor->ne[0];
const int64_t qk_k = ggml_blck_size(new_type);
@@ -241,8 +245,8 @@ static ggml_type llama_tensor_get_type_impl(
new_type = GGML_TYPE_Q8_0;
}
} else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
new_type = qs.params->token_embedding_type;
if (qs->params->token_embedding_type < GGML_TYPE_COUNT) {
new_type = qs->params->token_embedding_type;
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
@@ -261,19 +265,19 @@ static ggml_type llama_tensor_get_type_impl(
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
if (name.find("attn_v.weight") != std::string::npos) {
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
if (qs->model.hparams.n_gqa() >= 4 || qs->model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
else if (qs->model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
new_type = GGML_TYPE_Q4_K;
}
else if (name.find("ffn_down") != std::string::npos) {
if (qs.i_ffn_down < qs.n_ffn_down/8) {
if (qs->i_ffn_down < qs->n_ffn_down/8) {
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
}
}
else if (name.find("attn_output.weight") != std::string::npos) {
if (qs.model.hparams.n_expert == 8) {
if (qs->model.hparams.n_expert == 8) {
new_type = GGML_TYPE_Q5_K;
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
@@ -282,43 +286,43 @@ static ggml_type llama_tensor_get_type_impl(
}
} else if (name.find("attn_v.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs->model.hparams.n_gqa() >= 4) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
new_type = qs->model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs->has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs->model.hparams.n_gqa() >= 4) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
new_type = qs->i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs->model.hparams.n_gqa() >= 4) {
new_type = GGML_TYPE_Q5_K;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
if (qs.model.type == LLM_TYPE_70B) {
use_more_bits(qs->i_attention_wv, qs->n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs->i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
if (qs->model.type == LLM_TYPE_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
}
if (qs.model.hparams.n_expert == 8) {
if (qs->model.hparams.n_expert == 8) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
}
} else if (name.find("attn_k.weight") != std::string::npos) {
if (qs.model.hparams.n_expert == 8) {
if (qs->model.hparams.n_expert == 8) {
// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
// TODO: explore better strategies
new_type = GGML_TYPE_Q8_0;
@@ -337,13 +341,13 @@ static ggml_type llama_tensor_get_type_impl(
new_type = GGML_TYPE_IQ2_S;
}
} else if (name.find("ffn_down") != std::string::npos) {
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
auto info = layer_info(qs->i_ffn_down, qs->n_ffn_down, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs->has_imatrix) {
new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -352,7 +356,7 @@ static ggml_type llama_tensor_get_type_impl(
: GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
(qs->model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -366,7 +370,7 @@ static ggml_type llama_tensor_get_type_impl(
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
}
}
else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs->has_imatrix) {
new_type = GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
@@ -374,7 +378,7 @@ static ggml_type llama_tensor_get_type_impl(
new_type = GGML_TYPE_Q5_K;
}
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
&& qs.has_imatrix && i_layer < n_layer/8) {
&& qs->has_imatrix && i_layer < n_layer/8) {
// Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
// We only do it when an imatrix is provided because a) we want to make sure that one can always get the
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
@@ -382,7 +386,7 @@ static ggml_type llama_tensor_get_type_impl(
}
} else if (name.find("attn_output.weight") != std::string::npos) {
if (arch != LLM_ARCH_FALCON) {
if (qs.model.hparams.n_expert == 8) {
if (qs->model.hparams.n_expert == 8) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
@@ -408,14 +412,14 @@ static ggml_type llama_tensor_get_type_impl(
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
else if (name.find("ffn_gate") != std::string::npos) {
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
auto info = layer_info(qs->i_ffn_gate, qs->n_ffn_gate, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
new_type = GGML_TYPE_IQ3_XXS;
}
}
else if (name.find("ffn_up") != std::string::npos) {
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
auto info = layer_info(qs->i_ffn_up, qs->n_ffn_up, name.c_str());
int i_layer = info.first, n_layer = info.second;
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
new_type = GGML_TYPE_IQ3_XXS;
@@ -426,7 +430,7 @@ static ggml_type llama_tensor_get_type_impl(
// determine the ggml_type that this tensor should be quantized to
static ggml_type llama_tensor_get_type(
quantize_state_impl & qs,
quantize_state_impl * qs,
const llama_model_quantize_params * params,
const ggml_tensor * tensor,
const ggml_type default_type
@@ -492,7 +496,9 @@ static ggml_type llama_tensor_get_type(
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
++qs.n_fallback;
if (qs->do_count) {
++qs->n_fallback;
}
}
}
}
@@ -708,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
model.load_hparams(ml);
model.load_stats (ml);
quantize_state_impl qs(model, params);
auto qs = std::make_unique<quantize_state_impl>(model, params);
if (params->only_copy) {
ftype = ml.ftype;
@@ -718,7 +724,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
if (imatrix_data) {
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
qs.has_imatrix = true;
qs->has_imatrix = true;
// check imatrix for nans or infs
for (const auto & kv : *imatrix_data) {
for (float f : kv.second) {
@@ -830,15 +836,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
const ggml_tensor * tensor = it->tensor;
const std::string name = tensor->name;
// TODO: avoid hardcoded tensor names - use the TN_* constants
if (name.find("attn_v.weight") != std::string::npos ||
name.find("attn_qkv.weight") != std::string::npos ||
name.find("attn_kv_b.weight")!= std::string::npos) {
++qs.n_attention_wv;
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs.has_output = true;
}
// populate the original tensors so we get an initial meta data
uint16_t i_split = params->keep_split ? it->idx : 0;
if (!ctx_outs[i_split]) {
@@ -867,9 +864,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
}
qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// Set split info if needed
if (n_split > 1) {
for (size_t i = 0; i < ctx_outs.size(); ++i) {
@@ -918,6 +912,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
size_t total_size_org = 0;
size_t total_size_new = 0;
qs->n_ffn_down = qs->n_ffn_gate = qs->n_ffn_up = (int)model.hparams.n_layer;
qs->do_count = true; // we start counting stats for the main loop
// iterate over all weights (main loop)
for (const auto * it : weights) {
const auto & weight = *it;
@@ -931,6 +928,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
const std::string name = ggml_get_name(tensor);
const size_t tensor_size = ggml_nbytes(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
if (name.find("attn_v.weight") != std::string::npos ||
name.find("attn_qkv.weight") != std::string::npos ||
name.find("attn_kv_b.weight")!= std::string::npos) {
++qs->n_attention_wv;
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs->has_output = true;
}
if (!params->dry_run) {
if (!ml.use_mmap) {
if (read_data.size() < tensor_size) {
@@ -962,13 +968,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// count stats for this tensor based on its name
if (name.find("attn_v.weight") != std::string::npos ||
name.find("attn_kv_b.weight") != std::string::npos) {
++qs.i_attention_wv;
++qs->i_attention_wv;
} else if (name.find("ffn_down") != std::string::npos) {
++qs.i_ffn_down;
++qs->i_ffn_down;
} else if (name.find("ffn_gate") != std::string::npos) {
++qs.i_ffn_gate;
++qs->i_ffn_gate;
} else if (name.find("ffn_up") != std::string::npos) {
++qs.i_ffn_up;
++qs->i_ffn_up;
}
}
@@ -1128,9 +1134,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
);
}
if (qs.n_fallback > 0) {
if (qs->n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
__func__, qs.n_fallback);
__func__, qs->n_fallback);
}
}