From c1142ddb7c060ab826aa34d57017c829028af5e9 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Fri, 26 Dec 2025 15:18:30 +0800 Subject: [PATCH] NPU always requant to q4_0_128 --- .../src/ggml-openvino/ggml-openvino-extra.cpp | 16 ++++++--- ggml/src/ggml-openvino/ggml-openvino-extra.h | 2 +- ggml/src/ggml-openvino/ggml-quants.cpp | 34 ------------------- ggml/src/ggml-openvino/ggml-quants.hpp | 4 --- 4 files changed, 12 insertions(+), 44 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index eff1627cb4..26cc386dff 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include "ggml.h" +#include <cstring> #include #include @@ -162,19 +163,24 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { } // Get requantization type for a tensor type (returns nullopt if no requant needed) -std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) { +std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) { if (!ggml_openvino_is_npu()) { return std::nullopt; } // NPU requantization rules - switch (type) { + if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { + return ExtraQuantType::F16; + } + if (strncmp(tensor->name, "output.weight", 13) == 0) { + return ExtraQuantType::Q4_0_128; + } + switch (tensor->type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_K: - return ExtraQuantType::Q4_0_128; case GGML_TYPE_Q6_K: case GGML_TYPE_Q5_K: - return ExtraQuantType::F16; + return ExtraQuantType::Q4_0_128; default: return std::nullopt; } @@ -200,7 +206,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten const size_t alignment = 64; // Good for SIMD // Check if requantization is needed (NPU-specific) - auto requant_type = ggml_openvino_get_requant_type(tensor->type); + auto requant_type = ggml_openvino_get_requant_type(tensor); if 
(requant_type.has_value()) { layout.is_requant = true; layout.requant_type = requant_type; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 2f9d257769..fbfe459edf 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name(); bool ggml_openvino_is_npu(); // Get requantization type for a tensor type (returns nullopt if no requant needed) -std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type); +std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor); // ===================================================== // OpenVINO Tensor Extra Types diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 6cacc7b034..1a5679cd8d 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -535,40 +535,6 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor, return result; } -std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) { - ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])}; - - // FIXME hardcoded workaround to fix the case where token_emb.weight is q4_0 (instead of q6_k) - // (In some q4_0 models which use two different weight for token_emb and output, token_emb is q4_0) - std::string device = getenv("GGML_OPENVINO_DEVICE") ? 
getenv("GGML_OPENVINO_DEVICE") : ""; - if (device == "NPU" && std::string(tensor->name) == "token_embd.weight") { - requant_type = ExtraQuantType::F16; - } - - // Determine block size - int64_t block_size = node_shape[1]; - if (requant_type == ExtraQuantType::Q4_0_128) { - block_size = 128; - } else if (requant_type == ExtraQuantType::Q8_0_32) { - block_size = 32; - } - - // Allocate tensors - ov::Tensor weights, scales, biases; - if (requant_type == ExtraQuantType::F16) { - weights = ov::Tensor(ov::element::f16, node_shape); - } else { - bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128); - ov::element::Type weight_type = is_u4 ? ov::element::u4 : ov::element::u8; - ov::Shape scales_shape = {node_shape[0], node_shape[1] / block_size}; - weights = ov::Tensor(weight_type, node_shape); - scales = ov::Tensor(ov::element::f16, scales_shape); - biases = ov::Tensor(ov::element::f16, scales_shape); - } - - return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases); -} - std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) { GGML_ASSERT(tensor != nullptr); GGML_ASSERT(data != nullptr); diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index b1d286f1b8..a1334e2408 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -52,10 +52,6 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); -// ExtraQuantType is defined in ggml-openvino-extra.h - -std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type); - // Extract quantized weights from tensor and create weight subgraph // If weights/scales/biases are provided (non-empty), uses them as output buffers // Otherwise allocates new ov::Tensors internally