NPU always requant to q4_0_128

Yu, Zijun 2025-12-26 15:18:30 +08:00 committed by Mustafa Cavus
parent 52a44012c0
commit c1142ddb7c
4 changed files with 12 additions and 44 deletions

View File

@@ -3,6 +3,7 @@
 #include "ggml-impl.h"
 #include "ggml.h"
+#include <cstring>
 #include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
 #include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
@@ -162,19 +163,24 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
 }
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
-std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
+std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
     if (!ggml_openvino_is_npu()) {
         return std::nullopt;
     }
     // NPU requantization rules
-    switch (type) {
+    if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
+        return ExtraQuantType::F16;
+    }
+    if (strncmp(tensor->name, "output.weight", 13) == 0) {
+        return ExtraQuantType::Q4_0_128;
+    }
+    switch (tensor->type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_K:
             return ExtraQuantType::Q4_0_128;
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_Q5_K:
-            return ExtraQuantType::F16;
+            return ExtraQuantType::Q4_0_128;
         default:
             return std::nullopt;
     }
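Net effect: on NPU, Q5_K and Q6_K weights are no longer requantized to F16; every quantized type handled here (Q4_0, Q4_1, Q4_K, Q5_K, Q6_K) now ends up as Q4_0_128, with two name-based exceptions (token_embd.weight stays F16, output.weight is always Q4_0_128). A minimal sketch restating the same decision table over plain C strings so it compiles without a ggml context; npu_requant_rule, RequantKind, and the string type tags are illustrative and not part of the patch:

```cpp
#include <cassert>
#include <cstring>
#include <optional>

// Illustrative restatement of the new NPU requant rules (not part of the patch).
enum class RequantKind { F16, Q4_0_128 };

std::optional<RequantKind> npu_requant_rule(const char * name, const char * ggml_type) {
    // Name-based overrides, mirroring the strncmp prefix checks in the patched function.
    if (std::strncmp(name, "token_embd.weight", 17) == 0) return RequantKind::F16;
    if (std::strncmp(name, "output.weight", 13) == 0)     return RequantKind::Q4_0_128;
    // Everything else that the NPU path requantizes now becomes Q4_0_128.
    for (const char * t : {"q4_0", "q4_1", "q4_k", "q5_k", "q6_k"}) {
        if (std::strcmp(ggml_type, t) == 0) return RequantKind::Q4_0_128;
    }
    return std::nullopt;  // e.g. f32/f16 weights: no requantization
}

int main() {
    assert(npu_requant_rule("token_embd.weight", "q4_0") == RequantKind::F16);
    assert(npu_requant_rule("blk.0.ffn_down.weight", "q6_k") == RequantKind::Q4_0_128);
    assert(!npu_requant_rule("blk.0.attn_norm.weight", "f32").has_value());
}
```

The patched function keys on the real ggml_type enum and the ggml_tensor name field, as shown in the hunk above.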
@@ -200,7 +206,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     const size_t alignment = 64; // Good for SIMD
     // Check if requantization is needed (NPU-specific)
-    auto requant_type = ggml_openvino_get_requant_type(tensor->type);
+    auto requant_type = ggml_openvino_get_requant_type(tensor);
     if (requant_type.has_value()) {
         layout.is_requant = true;
         layout.requant_type = requant_type;

View File

@@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name();
 bool ggml_openvino_is_npu();
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
-std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type);
+std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor);
 // =====================================================
 // OpenVINO Tensor Extra Types
View File

@@ -535,40 +535,6 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
     return result;
 }
-std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) {
-    ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
-    // FIXME hardcoded workaround to fix the case where token_emb.weight is q4_0 (instead of q6_k)
-    // (In some q4_0 models which use two different weight for token_emb and output, token_emb is q4_0)
-    std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
-    if (device == "NPU" && std::string(tensor->name) == "token_embd.weight") {
-        requant_type = ExtraQuantType::F16;
-    }
-    // Determine block size
-    int64_t block_size = node_shape[1];
-    if (requant_type == ExtraQuantType::Q4_0_128) {
-        block_size = 128;
-    } else if (requant_type == ExtraQuantType::Q8_0_32) {
-        block_size = 32;
-    }
-    // Allocate tensors
-    ov::Tensor weights, scales, biases;
-    if (requant_type == ExtraQuantType::F16) {
-        weights = ov::Tensor(ov::element::f16, node_shape);
-    } else {
-        bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128);
-        ov::element::Type weight_type = is_u4 ? ov::element::u4 : ov::element::u8;
-        ov::Shape scales_shape = {node_shape[0], node_shape[1] / block_size};
-        weights = ov::Tensor(weight_type, node_shape);
-        scales = ov::Tensor(ov::element::f16, scales_shape);
-        biases = ov::Tensor(ov::element::f16, scales_shape);
-    }
-    return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases);
-}
 std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
     GGML_ASSERT(tensor != nullptr);
     GGML_ASSERT(data != nullptr);
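For reference, Q4_0_128 stores weights as 128-element groups of packed 4-bit (u4) values with one f16 scale and one f16 bias per group; the removed wrapper sized its buffers accordingly. A self-contained sketch of those shapes, assuming made-up dimensions of 4096 x 11008 (rows x cols) and following the shape math of the deleted function:

```cpp
#include <openvino/openvino.hpp>
#include <iostream>

int main() {
    // Made-up weight dimensions, purely illustrative: rows = ne[1], cols = ne[0].
    const size_t rows = 4096, cols = 11008;
    const size_t block_size = 128;  // Q4_0_128 group size

    ov::Tensor weights(ov::element::u4,  ov::Shape{rows, cols});               // packed 4-bit weights
    ov::Tensor scales (ov::element::f16, ov::Shape{rows, cols / block_size});  // one f16 scale per group
    ov::Tensor biases (ov::element::f16, ov::Shape{rows, cols / block_size});  // one f16 bias per group

    std::cout << "groups per row: " << cols / block_size << "\n";              // 86 for these dimensions
    std::cout << "weights bytes:  " << weights.get_byte_size() << "\n";        // 4 bits per element
    return 0;
}
```

In the patched flow, requantize_to_buffers remains the function that fills buffers like these.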

View File

@@ -52,10 +52,6 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                        ov::Tensor& biases,
                                        size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
 // ExtraQuantType is defined in ggml-openvino-extra.h
-std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);
 // Extract quantized weights from tensor and create weight subgraph
 // If weights/scales/biases are provided (non-empty), uses them as output buffers
 // Otherwise allocates new ov::Tensors internally
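The declaration that the last three comment lines belong to is cut off in this view. Purely to illustrate the convention they describe (non-empty output buffers are used as-is, otherwise the callee allocates), a hedged sketch with an invented extract_weights helper; a default-constructed ov::Tensor is uninitialized, which is the "empty" case:

```cpp
#include <openvino/openvino.hpp>

// Hypothetical helper, only to illustrate the "non-empty buffers are used,
// otherwise allocate internally" convention described in the comments above.
ov::Tensor extract_weights(const ov::Shape & shape, ov::Tensor weights = {}) {
    if (!weights) {
        weights = ov::Tensor(ov::element::u4, shape);  // no buffer provided: allocate here
    }
    // ... fill `weights` from the source tensor ...
    return weights;
}

int main() {
    ov::Shape shape{4096, 11008};
    ov::Tensor preallocated(ov::element::u4, shape);
    auto a = extract_weights(shape, preallocated);  // caller-provided output buffer
    auto b = extract_weights(shape);                // allocated internally
    return 0;
}
```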