quant : manual overrides of tensor types take precedence (#18952)

This commit is contained in:
Georgi Gerganov 2026-01-22 16:17:06 +02:00 committed by GitHub
parent 8b30840703
commit 0e4ebeb057
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 53 additions and 56 deletions

View File

@ -422,57 +422,6 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
++qs.i_ffn_up;
}
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
//}
// This can be used to reduce the size of the Q5_K_S model.
// The associated PPL increase is fully in line with the size reduction
//else {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
//}
bool convert_incompatible_tensor = false;
{
const int64_t nx = tensor->ne[0];
const int64_t ny = tensor->ne[1];
const int64_t qk_k = ggml_blck_size(new_type);
if (nx % qk_k != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
}
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
}
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
++qs.n_fallback;
}
return new_type;
}
@ -875,21 +824,69 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
// get more optimal quantization type based on the tensor shape, layer, etc.
if (!params->pure && ggml_is_quantized(default_type)) {
int fallback = qs.n_fallback;
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
// unless the user specifies a type, and the tensor geometry will not require fallback quantisation
if (params->tensor_types && qs.n_fallback - fallback == 0) {
// if the user provided tensor types - use those
bool manual = false;
if (params->tensor_types) {
const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
const std::string tensor_name(tensor->name);
for (const auto & [tname, qtype] : tensor_types) {
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
if (qtype != new_type) {
LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
manual = true;
break;
}
}
}
}
// if not manual - use the standard logic for choosing the quantization type based on the selected mixture
if (!manual) {
new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
}
// incompatible tensor shapes are handled here - fallback to a compatible type
{
bool convert_incompatible_tensor = false;
const int64_t nx = tensor->ne[0];
const int64_t ny = tensor->ne[1];
const int64_t qk_k = ggml_blck_size(new_type);
if (nx % qk_k != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
if (convert_incompatible_tensor) {
switch (new_type) {
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
}
if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
new_type = GGML_TYPE_F16;
}
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
++qs.n_fallback;
}
}
}
if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
new_type = params->token_embedding_type;