From 9ca53c79917aa13954fddda4ae45878e6261b19d Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Tue, 19 Aug 2025 14:56:28 +0800
Subject: [PATCH] Add NPU Q4_0 support

---
 ggml/src/ggml-openvino/ggml-openvino.cpp | 28 ++++++++++++++++++----------
 ggml/src/ggml-openvino/ggml-quants.cpp   | 13 ++++++++-----
 ggml/src/ggml-openvino/ggml-quants.hpp   | 13 +++++++++++++
 3 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index 4b743be688..a6ec1c64c2 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -333,16 +333,24 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
 
 static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
     GGML_ASSERT(dev->reg != nullptr);
-    static const std::set<ggml_type> supported_types{GGML_TYPE_F32,
-                                                     GGML_TYPE_F16,
-                                                     GGML_TYPE_BF16,
-                                                     GGML_TYPE_I64,
-                                                     GGML_TYPE_I32,
-                                                     GGML_TYPE_Q4_0,
-                                                     GGML_TYPE_Q4_1,
-                                                     GGML_TYPE_Q4_K,
-                                                     GGML_TYPE_Q8_0,
-                                                     GGML_TYPE_Q6_K};
+    static std::set<ggml_type> supported_types{GGML_TYPE_F32,
+                                               GGML_TYPE_F16,
+                                               GGML_TYPE_BF16,
+                                               GGML_TYPE_I64,
+                                               GGML_TYPE_I32,
+                                               GGML_TYPE_Q4_0,
+                                               GGML_TYPE_Q4_1,
+                                               GGML_TYPE_Q4_K,
+                                               GGML_TYPE_Q8_0,
+                                               GGML_TYPE_Q6_K};
+
+    const char* device_env = getenv("GGML_OPENVINO_DEVICE");
+    bool is_npu = device_env != nullptr && std::string(device_env) == "NPU";
+    if (is_npu) {
+        // NPU has poor support for asymmetric quantization
+        supported_types.erase(GGML_TYPE_Q4_1);
+        supported_types.erase(GGML_TYPE_Q4_K);
+    }
 
     static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
                                                  GGML_OP_ADD,
diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
index e969b0b54a..97aa494ed8 100644
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@@ -230,6 +230,10 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
     }
 
     auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
+    float zp_value;
+    if (ov::op::util::get_single_value(zero_point, zp_value)) {
+        zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
+    }
 
     // Quantization operations
     auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
@@ -287,12 +291,11 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
         zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
     }
 
-    // CVS-166438: GGUF Q4_0 zp array (U4) with all same value (8) will be converted to single U4 scalar via ConvertU4WeightsZeroPointToScalar transformation.
-    // This corner case can be handled by CPU plugin properly, but will trigger compilation error on GPU plugin.
-    // Temporal WA by adding one small bias to keep zp array shape for GPU plugin, confirm no accuracy impact for final LLM generation results.
-    zero_point_data[0] += 1;
-
     auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zero_point_tensor);
+    float zp_value;
+    if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
+        zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
+    }
 
     auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp
index 9c0dd89a95..ae37b1618e 100644
--- a/ggml/src/ggml-openvino/ggml-quants.hpp
+++ b/ggml/src/ggml-openvino/ggml-quants.hpp
@@ -1,5 +1,7 @@
 #include <cstdint>
+#include <memory>
 #include <openvino/openvino.hpp>
+
 #include "ggml.h"
 
 void unpack_32_4(const uint8_t* data, uint8_t* dst);
@@ -42,3 +44,14 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                        ov::Tensor& scales,
                                        ov::Tensor& biases,
                                        size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
+
+namespace ov {
+namespace op {
+namespace util {
+// From /src/common/transformations/include/transformations/utils/utils.hpp
+bool get_single_value(const std::shared_ptr<v0::Constant>& const_node,
+                      float& value,
+                      bool check_value_range = true);
+} // namespace util
+} // namespace op
+} // namespace ov
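
For reference, a minimal standalone sketch of the zero-point collapse that both
make_int8_weights and make_int4_weights now perform. The element type (u8), the
shapes, and the transformations/utils/utils.hpp include path are illustrative
assumptions, not taken from this patch:

#include <memory>
#include <vector>
#include <openvino/openvino.hpp>
#include <transformations/utils/utils.hpp> // assumed path for get_single_value

int main() {
    // Zero-point array where every quantization group shares one value (8),
    // as produced for GGUF Q4_0 weights.
    auto zp = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{4, 1},
                                           std::vector<uint8_t>{8, 8, 8, 8});

    float zp_value = 0.0f;
    if (ov::op::util::get_single_value(zp, zp_value)) {
        // All elements are identical, so the array is replaced by a rank-0
        // scalar constant before the dequantization subgraph is built.
        zp = ov::op::v0::Constant::create(zp->get_element_type(), {}, {zp_value});
    }
    // zp->get_shape() is now ov::Shape{} instead of ov::Shape{4, 1}.
    return 0;
}

Collapsing the constant up front replaces the earlier workaround of perturbing
zero_point_data[0] to preserve the array shape for the GPU plugin, and avoids
handing the NPU plugin a degenerate all-equal zero-point array.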