correct function names
This commit is contained in:
parent 97aefac773
commit bddc67547f
@@ -175,7 +175,7 @@ static void llama_tensor_dequantize_impl(
     workers.clear();
 }
 
-static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, bool update_stats) {
+static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, bool update_stats) {
     const std::string name = ggml_get_name(tensor);
 
     // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -437,6 +437,95 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     return new_type;
 }
 
+// determine the ggml_type that this tensor should be quantized to
+static ggml_type llama_tensor_get_type(
+        quantize_state_impl & qs,
+        const llama_model_quantize_params * params,
+        const ggml_tensor * tensor,
+        ggml_type default_type,
+        bool update_stats // we only update qs if this flag is true
+    ) {
+    ggml_type new_type = default_type;
+    // get more optimal quantization type based on the tensor shape, layer, etc.
+    if (!params->pure && ggml_is_quantized(default_type)) {
+
+        // if the user provided tensor types - use those
+        bool manual = false;
+        if (params->tensor_types) {
+            const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+            const std::string tensor_name(tensor->name);
+            for (const auto & [tname, qtype] : tensor_types) {
+                if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                    if (qtype != new_type) {
+                        LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
+                        new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
+                        manual = true;
+                        break;
+                    }
+                }
+            }
+        }
+
+        // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
+        if (!manual) {
+            new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, update_stats);
+        }
+
+        // incompatible tensor shapes are handled here - fallback to a compatible type
+        {
+            bool convert_incompatible_tensor = false;
+
+            const int64_t nx = tensor->ne[0];
+            const int64_t ny = tensor->ne[1];
+            const int64_t qk_k = ggml_blck_size(new_type);
+
+            if (nx % qk_k != 0) {
+                LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
+                convert_incompatible_tensor = true;
+            } else {
+                if (update_stats) {
+                    ++qs.n_k_quantized;
+                }
+            }
+
+            if (convert_incompatible_tensor) {
+                switch (new_type) {
+                    case GGML_TYPE_TQ1_0:
+                    case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
+                    case GGML_TYPE_IQ2_XXS:
+                    case GGML_TYPE_IQ2_XS:
+                    case GGML_TYPE_IQ2_S:
+                    case GGML_TYPE_IQ3_XXS:
+                    case GGML_TYPE_IQ3_S:
+                    case GGML_TYPE_IQ1_S:
+                    case GGML_TYPE_IQ1_M:
+                    case GGML_TYPE_Q2_K:
+                    case GGML_TYPE_Q3_K:
+                    case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+                    case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
+                    case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
+                    case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+                    default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+                }
+                if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+                    new_type = GGML_TYPE_F16;
+                }
+                if (update_stats) {
+                    LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+                    ++qs.n_fallback;
+                }
+            }
+        }
+    }
+    if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
+        new_type = params->token_embedding_type;
+    }
+    if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
+        new_type = params->output_tensor_type;
+    }
+    return new_type;
+}
+
 static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     if (nthread < 2) {
         // single-thread
@@ -508,149 +597,61 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
     const std::string name = tensor->name;
 
     // This used to be a regex, but <regex> has an extreme cost to compile times.
-    bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
+    bool allowed = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
     // quantize only 2D and 3D tensors (experts)
-    quantize &= (ggml_n_dims(tensor) >= 2);
+    allowed &= (ggml_n_dims(tensor) >= 2);
 
     // do not quantize norm tensors
-    quantize &= name.find("_norm.weight") == std::string::npos;
+    allowed &= name.find("_norm.weight") == std::string::npos;
 
-    quantize &= params->quantize_output_tensor || name != "output.weight";
-    quantize &= !params->only_copy;
+    allowed &= params->quantize_output_tensor || name != "output.weight";
+    allowed &= !params->only_copy;
 
     // do not quantize expert gating tensors
     // NOTE: can't use LLM_TN here because the layer number is not known
-    quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+    allowed &= name.find("ffn_gate_inp.weight") == std::string::npos;
 
     // these are very small (e.g. 4x4)
-    quantize &= name.find("altup") == std::string::npos;
-    quantize &= name.find("laurel") == std::string::npos;
+    allowed &= name.find("altup") == std::string::npos;
+    allowed &= name.find("laurel") == std::string::npos;
 
     // these are not too big so keep them as it is
-    quantize &= name.find("per_layer_model_proj") == std::string::npos;
+    allowed &= name.find("per_layer_model_proj") == std::string::npos;
 
     // do not quantize positional embeddings and token types (BERT)
-    quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight");
-    quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
+    allowed &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight");
+    allowed &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
 
     // do not quantize Mamba /Kimi's small conv1d weights
    // NOTE: can't use LLM_TN here because the layer number is not known
-    quantize &= name.find("ssm_conv1d") == std::string::npos;
-    quantize &= name.find("shortconv.conv.weight") == std::string::npos;
+    allowed &= name.find("ssm_conv1d") == std::string::npos;
+    allowed &= name.find("shortconv.conv.weight") == std::string::npos;
 
     // do not quantize RWKV's small yet 2D weights
-    quantize &= name.find("time_mix_first.weight") == std::string::npos;
-    quantize &= name.find("time_mix_w0.weight") == std::string::npos;
-    quantize &= name.find("time_mix_w1.weight") == std::string::npos;
-    quantize &= name.find("time_mix_w2.weight") == std::string::npos;
-    quantize &= name.find("time_mix_v0.weight") == std::string::npos;
-    quantize &= name.find("time_mix_v1.weight") == std::string::npos;
-    quantize &= name.find("time_mix_v2.weight") == std::string::npos;
-    quantize &= name.find("time_mix_a0.weight") == std::string::npos;
-    quantize &= name.find("time_mix_a1.weight") == std::string::npos;
-    quantize &= name.find("time_mix_a2.weight") == std::string::npos;
-    quantize &= name.find("time_mix_g1.weight") == std::string::npos;
-    quantize &= name.find("time_mix_g2.weight") == std::string::npos;
-    quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
-    quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
-    quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
+    allowed &= name.find("time_mix_first.weight") == std::string::npos;
+    allowed &= name.find("time_mix_w0.weight") == std::string::npos;
+    allowed &= name.find("time_mix_w1.weight") == std::string::npos;
+    allowed &= name.find("time_mix_w2.weight") == std::string::npos;
+    allowed &= name.find("time_mix_v0.weight") == std::string::npos;
+    allowed &= name.find("time_mix_v1.weight") == std::string::npos;
+    allowed &= name.find("time_mix_v2.weight") == std::string::npos;
+    allowed &= name.find("time_mix_a0.weight") == std::string::npos;
+    allowed &= name.find("time_mix_a1.weight") == std::string::npos;
+    allowed &= name.find("time_mix_a2.weight") == std::string::npos;
+    allowed &= name.find("time_mix_g1.weight") == std::string::npos;
+    allowed &= name.find("time_mix_g2.weight") == std::string::npos;
+    allowed &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+    allowed &= name.find("time_mix_decay_w2.weight") == std::string::npos;
+    allowed &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
 
     // do not quantize relative position bias (T5)
-    quantize &= name.find("attn_rel_b.weight") == std::string::npos;
+    allowed &= name.find("attn_rel_b.weight") == std::string::npos;
 
     // do not quantize specific multimodal tensors
-    quantize &= name.find(".position_embd.") == std::string::npos;
+    allowed &= name.find(".position_embd.") == std::string::npos;
 
-    return quantize;
-}
-
-static ggml_type tensor_get_target_type(
-        quantize_state_impl & qs,
-        const llama_model_quantize_params * params,
-        const ggml_tensor * tensor,
-        ggml_type default_type,
-        bool update_stats // should we update qs or no?
-    ) {
-    ggml_type new_type = default_type;
-    // get more optimal quantization type based on the tensor shape, layer, etc.
-    if (!params->pure && ggml_is_quantized(default_type)) {
-
-        // if the user provided tensor types - use those
-        bool manual = false;
-        if (params->tensor_types) {
-            const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
-            const std::string tensor_name(tensor->name);
-            for (const auto & [tname, qtype] : tensor_types) {
-                if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
-                    if (qtype != new_type) {
-                        LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
-                        new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
-                        manual = true;
-                        break;
-                    }
-                }
-            }
-        }
-
-        // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
-        if (!manual) {
-            new_type = llama_tensor_get_type(qs, new_type, tensor, params->ftype, update_stats);
-        }
-
-        // incompatible tensor shapes are handled here - fallback to a compatible type
-        {
-            bool convert_incompatible_tensor = false;
-
-            const int64_t nx = tensor->ne[0];
-            const int64_t ny = tensor->ne[1];
-            const int64_t qk_k = ggml_blck_size(new_type);
-
-            if (nx % qk_k != 0) {
-                LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
-                convert_incompatible_tensor = true;
-            } else {
-                if (update_stats) {
-                    ++qs.n_k_quantized;
-                }
-            }
-
-            if (convert_incompatible_tensor) {
-                switch (new_type) {
-                    case GGML_TYPE_TQ1_0:
-                    case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead
-                    case GGML_TYPE_IQ2_XXS:
-                    case GGML_TYPE_IQ2_XS:
-                    case GGML_TYPE_IQ2_S:
-                    case GGML_TYPE_IQ3_XXS:
-                    case GGML_TYPE_IQ3_S:
-                    case GGML_TYPE_IQ1_S:
-                    case GGML_TYPE_IQ1_M:
-                    case GGML_TYPE_Q2_K:
-                    case GGML_TYPE_Q3_K:
-                    case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
-                    case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
-                    case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
-                    case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
-                    default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
-                }
-                if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
-                    new_type = GGML_TYPE_F16;
-                }
-                if (update_stats) {
-                    LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
-                    ++qs.n_fallback;
-                }
-            }
-        }
-    }
-    if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
-        new_type = params->token_embedding_type;
-    }
-    if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
-        new_type = params->output_tensor_type;
-    }
-    return new_type;
+    return allowed;
 }
 
 static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
@@ -866,9 +867,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         }
         gguf_add_tensor(ctx_outs[i_split].get(), tensor);
 
-        // TODO: we could save this per-tensor and correlate it with the vector of tensors so we
-        // don't have to call this function again later (currently twice per tensor)
-        ggml_type target_type = tensor_get_target_type(qs, params, tensor, default_type, false);
+        ggml_type target_type = llama_tensor_get_type(qs, params, tensor, default_type, false);
 
         if (!params->imatrix &&
             tensor_allows_quantization(params, model.arch, tensor) &&
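
For readers outside the llama.cpp tree, here is a minimal standalone sketch of the manual-override pattern used by the renamed llama_tensor_get_type() wrapper above: a list of (regex, type) rules is scanned against the tensor name, and a matching rule replaces the default type. All names in the sketch (QuantType, TensorOverride, pick_type) are illustrative stand-ins, not llama.cpp symbols.

// Standalone illustration only -- simplified stand-in types, not the llama.cpp API.
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

enum class QuantType { Q4_K, Q6_K, Q8_0, F16 };

struct TensorOverride {
    std::string pattern; // regex matched against the tensor name
    QuantType   qtype;   // type to force when the pattern matches
};

// Scan the override rules in order; the first pattern that matches the tensor
// name decides the type, otherwise fall back to the default.
static QuantType pick_type(const std::string & tensor_name,
                           const std::vector<TensorOverride> & overrides,
                           QuantType fallback) {
    for (const auto & [pattern, qtype] : overrides) {
        if (std::regex re(pattern); std::regex_search(tensor_name, re)) {
            return qtype;
        }
    }
    return fallback;
}

int main() {
    const std::vector<TensorOverride> overrides = {
        { "attn_v\\.weight", QuantType::Q6_K },
        { "token_embd",      QuantType::Q8_0 },
    };
    const QuantType t = pick_type("blk.0.attn_v.weight", overrides, QuantType::Q4_K);
    std::printf("selected type id: %d\n", static_cast<int>(t));
    return 0;
}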