From 4e451778d32e4093e148f9ec38221ee29e6b28cd Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Mon, 29 Dec 2025 15:27:50 +0800
Subject: [PATCH] Use Q8_0_C in token embd, lm_head, and for 5 and 6 bits quant

---
 ggml/src/ggml-openvino/ggml-openvino-extra.cpp | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
index 2f24d7a1db..35d3d93cfd 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -164,23 +164,19 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
 
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
 std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
-    if (!ggml_openvino_is_npu()) {
-        return std::nullopt;
-    }
-    // NPU requantization rules
     if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
-        return ExtraQuantType::F16;
+        return ExtraQuantType::Q8_0_C;
     }
     if (strncmp(tensor->name, "output.weight", 13) == 0) {
+        return ExtraQuantType::Q8_0_C;
+    }
+    if (ggml_openvino_is_npu()) {
         return ExtraQuantType::Q4_0_128;
     }
     switch (tensor->type) {
-    case GGML_TYPE_Q4_0:
-    case GGML_TYPE_Q4_1:
-    case GGML_TYPE_Q4_K:
     case GGML_TYPE_Q6_K:
     case GGML_TYPE_Q5_K:
-        return ExtraQuantType::Q4_0_128;
+        return ExtraQuantType::Q8_0_C;
     default:
         return std::nullopt;
     }