Use Q8_0_C for token_embd, lm_head, and 5-bit and 6-bit quants

2025-12-29 15:27:50 +08:00 · 2025-12-29 15:27:50 +08:00 · 4e451778d3
parent 67c9720e49
commit 4e451778d3
1 changed file with 5 additions and 9 deletions
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -164,23 +164,19 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
 std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
    if (!ggml_openvino_is_npu()) {
        return std::nullopt;
    }
    // NPU requantization rules
    if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
-        return ExtraQuantType::F16;
+        return ExtraQuantType::Q8_0_C;
    }
    if (strncmp(tensor->name, "output.weight", 13) == 0) {
        return ExtraQuantType::Q8_0_C;
    }
    if (ggml_openvino_is_npu()) {
        return ExtraQuantType::Q4_0_128;
    }
    switch (tensor->type) {
    case GGML_TYPE_Q4_0:
    case GGML_TYPE_Q4_1:
    case GGML_TYPE_Q4_K:
    case GGML_TYPE_Q6_K:
    case GGML_TYPE_Q5_K:
-        return ExtraQuantType::Q4_0_128;
+        return ExtraQuantType::Q8_0_C;
    default:
        return std::nullopt;
    }