From 4e451778d32e4093e148f9ec38221ee29e6b28cd Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Mon, 29 Dec 2025 15:27:50 +0800
Subject: [PATCH] Use Q8_0_C for token_embd, lm_head, and 5- and 6-bit quants

---
 ggml/src/ggml-openvino/ggml-openvino-extra.cpp | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
index 2f24d7a1db..35d3d93cfd 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -164,23 +164,19 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
 
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
 std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
-    if (!ggml_openvino_is_npu()) {
-        return std::nullopt;
-    }
-    // NPU requantization rules
     if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
-        return ExtraQuantType::F16;
+        return ExtraQuantType::Q8_0_C;
     }
     if (strncmp(tensor->name, "output.weight", 13) == 0) {
+        return ExtraQuantType::Q8_0_C;
+    }
+    if (ggml_openvino_is_npu()) {
         return ExtraQuantType::Q4_0_128;
     }
     switch (tensor->type) {
-    case GGML_TYPE_Q4_0:
-    case GGML_TYPE_Q4_1:
-    case GGML_TYPE_Q4_K:
     case GGML_TYPE_Q6_K:
     case GGML_TYPE_Q5_K:
-        return ExtraQuantType::Q4_0_128;
+        return ExtraQuantType::Q8_0_C;
     default:
         return std::nullopt;
     }