correct fallback logic

ddh0 2026-02-16 13:13:44 -06:00
parent 521a13e6c6
commit 3c1f94a49d
1 changed file with 20 additions and 22 deletions

@@ -89,7 +89,6 @@ struct quantize_state_impl {
int i_ffn_gate = 0;
int i_ffn_up = 0;
int n_k_quantized = 0;
int n_fallback = 0;
bool has_imatrix = false;
@@ -493,6 +492,7 @@ static ggml_type llama_tensor_get_type(
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
++qs.n_fallback;
}
}
}
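(Note: the fallback path above only triggers when a tensor's row length ne[0] is not evenly divisible by the block size of the chosen quant type; the tensor is then stored as F16 and the counter is bumped. A standalone sketch of that check follows, using invented stand-in names and a hard-coded 256-value block size rather than the real ggml type tables.)

    #include <cstdint>
    #include <cstdio>

    // Stand-ins for the real type/block-size machinery (illustrative only).
    enum class qtype { q4_k, q6_k, f16 };

    static int64_t block_size(qtype t) {
        // k-quants pack 256 values per block; f16 has no blocking constraint
        return t == qtype::f16 ? 1 : 256;
    }

    struct quant_stats { int n_fallback = 0; };

    // Pick F16 when the row length cannot be split into whole blocks.
    static qtype choose_type(quant_stats & qs, qtype wanted, int64_t ne0) {
        if (ne0 % block_size(wanted) != 0) {
            ++qs.n_fallback; // counted so a warning can be printed at the end
            return qtype::f16;
        }
        return wanted;
    }

    int main() {
        quant_stats qs;
        choose_type(qs, qtype::q6_k, 4096); // 4096 % 256 == 0 -> keeps q6_k
        choose_type(qs, qtype::q6_k, 1000); // 1000 % 256 != 0 -> falls back to f16
        std::printf("%d tensor(s) required fallback quantization\n", qs.n_fallback);
        return 0;
    }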
@@ -505,20 +505,6 @@ static ggml_type llama_tensor_get_type(
return new_type;
}
// update internal quantization state statistics based on the tensor name
static void llama_tensor_update_stats(quantize_state_impl & qs, const std::string & name) {
if (name.find("attn_v.weight") != std::string::npos ||
name.find("attn_kv_b.weight") != std::string::npos) {
++qs.i_attention_wv;
} else if (name.find("ffn_down") != std::string::npos) {
++qs.i_ffn_down;
} else if (name.find("ffn_gate") != std::string::npos) {
++qs.i_ffn_gate;
} else if (name.find("ffn_up") != std::string::npos) {
++qs.i_ffn_up;
}
}
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
if (nthread < 2) {
// single-thread
@@ -867,7 +853,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
tensor_requires_imatrix(tensor, target_type)
) {
if (params->dry_run) {
will_require_imatrix = true; // set flag for warning later, but continue with dry run
// set flag for warning later, but continue with dry run
will_require_imatrix = true;
} else {
LLAMA_LOG_ERROR("\n============================================================================\n"
" ERROR: this quantization requires an importance matrix!\n"
@@ -880,6 +867,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
}
qs.n_fallback = 0; // may have been falsely incremented by the preliminary loop over weights
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// Set split info if needed
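(Note: the reset added above matters because, per the in-line comment, the preliminary loop over weights runs the same type-selection path and so may have already incremented n_fallback for tensors that never actually get quantized in that pass. A toy sketch of the double count and the fix, with invented names:)

    #include <cstdio>
    #include <vector>

    // Toy model of the two passes: the pre-pass reuses the same type-selection
    // routine as the real pass, so its ++n_fallback side effects must be discarded.
    struct state { int n_fallback = 0; };

    static void select_type(state & s, int ne0, int blck) {
        if (ne0 % blck != 0) {
            ++s.n_fallback; // side effect: fallback counter
        }
    }

    int main() {
        const std::vector<int> row_lengths = { 4096, 1000, 512 }; // only 1000 needs fallback
        state s;

        // preliminary pass (e.g. checking imatrix requirements) -- increments once
        for (int ne0 : row_lengths) select_type(s, ne0, 256);

        s.n_fallback = 0; // the fix: drop counts accumulated by the pre-pass

        // real quantization pass -- the only counts that should be reported
        for (int ne0 : row_lengths) select_type(s, ne0, 256);

        // prints 1; without the reset it would wrongly print 2
        std::printf("%d tensor(s) required fallback quantization\n", s.n_fallback);
        return 0;
    }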
@@ -964,15 +952,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
ggml_type new_type = default_type;
// if so, what will be the target type?
// if so, what will be the new type?
if (do_quantize) {
new_type = llama_tensor_get_type(qs, params, tensor, default_type);
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
do_quantize = tensor->type != new_type;
}
llama_tensor_update_stats(qs, name);
// count stats for this tensor based on its name
if (name.find("attn_v.weight") != std::string::npos ||
name.find("attn_kv_b.weight") != std::string::npos) {
++qs.i_attention_wv;
} else if (name.find("ffn_down") != std::string::npos) {
++qs.i_ffn_down;
} else if (name.find("ffn_gate") != std::string::npos) {
++qs.i_ffn_gate;
} else if (name.find("ffn_up") != std::string::npos) {
++qs.i_ffn_up;
}
}
void * new_data;
size_t new_size;
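(Note: the counters reinstated inline above are plain substring matches on the tensor name. A self-contained, runnable version of the same dispatch; the tensor names in main() are illustrative examples, not taken from the diff.)

    #include <cstdio>
    #include <string>

    struct stats {
        int i_attention_wv = 0, i_ffn_down = 0, i_ffn_gate = 0, i_ffn_up = 0;
    };

    // Bump the counter matching the tensor name, mirroring the inline code above.
    static void update_stats(stats & qs, const std::string & name) {
        if (name.find("attn_v.weight")    != std::string::npos ||
            name.find("attn_kv_b.weight") != std::string::npos) { ++qs.i_attention_wv; }
        else if (name.find("ffn_down") != std::string::npos)    { ++qs.i_ffn_down; }
        else if (name.find("ffn_gate") != std::string::npos)    { ++qs.i_ffn_gate; }
        else if (name.find("ffn_up")   != std::string::npos)    { ++qs.i_ffn_up; }
    }

    int main() {
        stats qs;
        for (const char * n : { "blk.0.attn_v.weight", "blk.0.ffn_gate.weight",
                                "blk.0.ffn_up.weight", "blk.0.ffn_down.weight" }) {
            update_stats(qs, n);
        }
        std::printf("wv=%d down=%d gate=%d up=%d\n",
                    qs.i_attention_wv, qs.i_ffn_down, qs.i_ffn_gate, qs.i_ffn_up);
        return 0;
    }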
@@ -1131,8 +1129,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}
if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
__func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
LLAMA_LOG_WARN("%s: WARNING: %d tensor(s) required fallback quantization\n",
__func__, qs.n_fallback);
}
}
@@ -1143,7 +1141,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
llama_model_quantize_params llama_model_quantize_default_params() {
llama_model_quantize_params result = {
/*.nthread =*/ 0,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q8_0,
/*.output_tensor_type =*/ GGML_TYPE_COUNT,
/*.token_embedding_type =*/ GGML_TYPE_COUNT,
/*.allow_requantize =*/ false,