add back Q2_K edge case for imatrix

2026-02-11 21:53:07 -06:00 · 2026-02-11 21:53:07 -06:00 · 1658228d6a
parent 1ccd7a49ba
commit 1658228d6a
1 changed file with 7 additions and 4 deletions
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -479,12 +479,15 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float *
    return new_size;
 }

-static bool tensor_type_requires_imatrix(const ggml_type dst_type) {
+static bool tensor_type_requires_imatrix(const llama_model_quantize_params * params, const ggml_tensor * t, const ggml_type dst_type) {
    return (
        dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS ||
        dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S  ||
        dst_type == GGML_TYPE_IQ2_S   || dst_type == GGML_TYPE_IQ1_M  ||
-        dst_type == GGML_TYPE_TQ1_0   || dst_type == GGML_TYPE_TQ2_0
+        dst_type == GGML_TYPE_TQ1_0   || dst_type == GGML_TYPE_TQ2_0  ||
+        (   // Q2_K is the worst k-quant type - only allow it without imatrix for token embeddings
+            dst_type == GGML_TYPE_Q2_K && strcmp(t->name, "token_embd.weight") != 0
+        )
    );
 }

@@ -930,7 +933,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                               tensor_size/1024.0/1024.0,
                               new_size/1024.0/1024.0,
                               ggml_type_name(new_type));
-                if (!will_require_imatrix && tensor_type_requires_imatrix(new_type)) {
+                if (!will_require_imatrix && tensor_type_requires_imatrix(params, tensor, new_type)) {
                    will_require_imatrix = true;
                }
            } else {
@@ -973,7 +976,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                        }
                    }
                }
-                if (!imatrix && tensor_type_requires_imatrix(new_type)) {
+                if (!imatrix && tensor_type_requires_imatrix(params, tensor, new_type)) {
                    LLAMA_LOG_ERROR("\n\n============================================================\n");
                    LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
                    LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");