From c1142ddb7c060ab826aa34d57017c829028af5e9 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 26 Dec 2025 15:18:30 +0800 Subject: [PATCH] NPU always requant to q4_0_128 --- .../src/ggml-openvino/ggml-openvino-extra.cpp | 16 ++++++--- ggml/src/ggml-openvino/ggml-openvino-extra.h | 2 +- ggml/src/ggml-openvino/ggml-quants.cpp | 34 ------------------- ggml/src/ggml-openvino/ggml-quants.hpp | 4 --- 4 files changed, 12 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index eff1627cb4..26cc386dff 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include "ggml.h" +#include <cstring> #include #include @@ -162,19 +163,24 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { } // Get requantization type for a tensor type (returns nullopt if no requant needed) -std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) { +std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) { if (!ggml_openvino_is_npu()) { return std::nullopt; } // NPU requantization rules - switch (type) { + if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { + return ExtraQuantType::F16; + } + if (strncmp(tensor->name, "output.weight", 13) == 0) { + return ExtraQuantType::Q4_0_128; + } + switch (tensor->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_K: - return ExtraQuantType::Q4_0_128; case GGML_TYPE_Q6_K: case GGML_TYPE_Q5_K: - return ExtraQuantType::F16; + return ExtraQuantType::Q4_0_128; default: return std::nullopt; } @@ -200,7 +206,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten const size_t alignment = 64; // Good for SIMD // Check if requantization is needed (NPU-specific) - auto requant_type = ggml_openvino_get_requant_type(tensor->type); + auto requant_type = ggml_openvino_get_requant_type(tensor); if 
(requant_type.has_value()) { layout.is_requant = true; layout.requant_type = requant_type; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 2f9d257769..fbfe459edf 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name(); bool ggml_openvino_is_npu(); // Get requantization type for a tensor type (returns nullopt if no requant needed) -std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type); +std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor); // ===================================================== // OpenVINO Tensor Extra Types diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 6cacc7b034..1a5679cd8d 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -535,40 +535,6 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor, return result; } -std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) { - ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])}; - - // FIXME hardcoded workaround to fix the case where token_emb.weight is q4_0 (instead of q6_k) - // (In some q4_0 models which use two different weight for token_emb and output, token_emb is q4_0) - std::string device = getenv("GGML_OPENVINO_DEVICE") ? 
getenv("GGML_OPENVINO_DEVICE") : ""; - if (device == "NPU" && std::string(tensor->name) == "token_embd.weight") { - requant_type = ExtraQuantType::F16; - } - - // Determine block size - int64_t block_size = node_shape[1]; - if (requant_type == ExtraQuantType::Q4_0_128) { - block_size = 128; - } else if (requant_type == ExtraQuantType::Q8_0_32) { - block_size = 32; - } - - // Allocate tensors - ov::Tensor weights, scales, biases; - if (requant_type == ExtraQuantType::F16) { - weights = ov::Tensor(ov::element::f16, node_shape); - } else { - bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128); - ov::element::Type weight_type = is_u4 ? ov::element::u4 : ov::element::u8; - ov::Shape scales_shape = {node_shape[0], node_shape[1] / block_size}; - weights = ov::Tensor(weight_type, node_shape); - scales = ov::Tensor(ov::element::f16, scales_shape); - biases = ov::Tensor(ov::element::f16, scales_shape); - } - - return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases); -} - std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) { GGML_ASSERT(tensor != nullptr); GGML_ASSERT(data != nullptr); diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index b1d286f1b8..a1334e2408 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -52,10 +52,6 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); -// ExtraQuantType is defined in ggml-openvino-extra.h - -std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type); - // Extract quantized weights from tensor and create weight subgraph // If weights/scales/biases are provided (non-empty), uses them as output buffers // Otherwise allocates new ov::Tensors internally