Add NPU Q4_0 support

Yu, Zijun 2025-08-19 14:56:28 +08:00 committed by Mustafa Cavus
parent 9900245e0b
commit 9ca53c7991
3 changed files with 39 additions and 15 deletions


@@ -333,16 +333,24 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
 static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
     GGML_ASSERT(dev->reg != nullptr);
-    static const std::set<ggml_type> supported_types{GGML_TYPE_F32,
-                                                     GGML_TYPE_F16,
-                                                     GGML_TYPE_BF16,
-                                                     GGML_TYPE_I64,
-                                                     GGML_TYPE_I32,
-                                                     GGML_TYPE_Q4_0,
-                                                     GGML_TYPE_Q4_1,
-                                                     GGML_TYPE_Q4_K,
-                                                     GGML_TYPE_Q8_0,
-                                                     GGML_TYPE_Q6_K};
+    static std::set<ggml_type> supported_types{GGML_TYPE_F32,
+                                               GGML_TYPE_F16,
+                                               GGML_TYPE_BF16,
+                                               GGML_TYPE_I64,
+                                               GGML_TYPE_I32,
+                                               GGML_TYPE_Q4_0,
+                                               GGML_TYPE_Q4_1,
+                                               GGML_TYPE_Q4_K,
+                                               GGML_TYPE_Q8_0,
+                                               GGML_TYPE_Q6_K};
+    std::string device = std::string(getenv("GGML_OPENVINO_DEVICE"));
+    bool is_npu = device == "NPU";
+    if (is_npu) {
+        // NPU has poor support for asymmetric quantization
+        supported_types.erase(GGML_TYPE_Q4_1);
+        supported_types.erase(GGML_TYPE_Q4_K);
+    }
+
     static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
                                                  GGML_OP_ADD,
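
A note on the device check in this hunk: std::getenv returns nullptr when
GGML_OPENVINO_DEVICE is unset, and constructing a std::string from a null
pointer is undefined behavior, so the code as committed assumes the variable
is always set. A minimal null-safe sketch (the helper name get_device_env is
hypothetical, not part of this patch):

    #include <cstdlib>
    #include <string>

    // Hypothetical helper: read GGML_OPENVINO_DEVICE without invoking
    // undefined behavior when the variable is unset.
    static std::string get_device_env() {
        const char* device = std::getenv("GGML_OPENVINO_DEVICE");
        return device != nullptr ? std::string(device) : std::string();
    }

Q4_1 and Q4_K remain available on other devices; both formats store a
per-block minimum alongside the scale (asymmetric quantization), which is
what the NPU plugin handles poorly, while Q4_0, Q8_0, and Q6_K are symmetric.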


@@ -230,6 +230,10 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
     }
     auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
+    float zp_value;
+    if (ov::op::util::get_single_value(zero_point, zp_value)) {
+        zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
+    }
 
     // Quantization operations
     auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
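
The scalar collapse added above is the heart of this change: when every
element of the U8 zero-point constant holds the same value,
ov::op::util::get_single_value reports that value and the 1-D constant is
replaced by a rank-0 scalar of the same element type, which appears to be the
form the NPU plugin expects. A standalone sketch of the same pattern,
assuming OpenVINO's transformations utility header is on the include path:

    #include <memory>
    #include <vector>
    #include <openvino/op/constant.hpp>
    #include "transformations/utils/utils.hpp"  // ov::op::util::get_single_value

    int main() {
        // A 1-D zero-point constant whose elements are all identical.
        auto zp = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{4},
                                               std::vector<uint8_t>{8, 8, 8, 8});
        float zp_value = 0.0f;
        if (ov::op::util::get_single_value(zp, zp_value)) {
            // Collapse to a rank-0 (scalar) constant of the same element type.
            zp = ov::op::v0::Constant::create(zp->get_element_type(), ov::Shape{}, {zp_value});
        }
        return 0;
    }
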
@@ -287,12 +291,11 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
         zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
     }
-
-    // CVS-166438: GGUF Q4_0 zp array (U4) with all same value (8) will be converted to single U4 scalar via ConvertU4WeightsZeroPointToScalar transformation.
-    // This corner case can be handled by CPU plugin properly, but will trigger compilation error on GPU plugin.
-    // Temporal WA by adding one small bias to keep zp array shape for GPU plugin, confirm no accuracy impact for final LLM generation results.
-    zero_point_data[0] += 1;
     auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zero_point_tensor);
+    float zp_value;
+    if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
+        zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
+    }
     auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
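
For reference, the loop at the top of this hunk packs two U4 zero points per
byte: the first value in the low nibble, the second in the high nibble.
Because every GGUF Q4_0 zero point is the constant 8 (as the removed
CVS-166438 comment notes), the packed array is uniform, which is exactly the
case get_single_value now collapses to a scalar; the old
zero_point_data[0] += 1 workaround, which deliberately broke that uniformity
to sidestep a GPU plugin error, is dropped in the same stroke. An
illustrative pack/unpack pair (hypothetical helpers, not part of the patch):

    #include <cstdint>

    // Pack two 4-bit zero points into one byte: lo in bits 0-3, hi in bits 4-7.
    static uint8_t pack_u4_pair(uint8_t lo, uint8_t hi) {
        return static_cast<uint8_t>((hi << 4) | (lo & 0x0F));
    }

    // Recover both 4-bit values from a packed byte.
    static void unpack_u4_pair(uint8_t packed, uint8_t& lo, uint8_t& hi) {
        lo = packed & 0x0F;
        hi = packed >> 4;
    }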


@@ -1,5 +1,7 @@
 #include <cstdint>
+#include <openvino/op/constant.hpp>
+#include <openvino/runtime/tensor.hpp>
 
 #include "ggml.h"
 
 void unpack_32_4(const uint8_t* data, uint8_t* dst);
@@ -42,3 +44,14 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                        ov::Tensor& scales,
                                        ov::Tensor& biases,
                                        size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
+
+namespace ov {
+namespace op {
+namespace util {
+// From <openvino>/src/common/transformations/include/transformations/utils/utils.hpp
+bool get_single_value(const std::shared_ptr<ov::op::v0::Constant>& const_node,
+                      float& value,
+                      bool check_value_range = true);
+}  // namespace util
+}  // namespace op
+}  // namespace ov
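
A note on the design choice in this header: rather than pulling in OpenVINO's
internal transformations header, the patch re-declares get_single_value and
lets the linker resolve the symbol from the transformations library, keeping
the include surface small. The include-based equivalent, assuming the
OpenVINO source tree's include path is available, would be:

    // Alternative: include the header the comment above points to.
    #include "transformations/utils/utils.hpp"  // declares ov::op::util::get_single_value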