Add NPU Q4_0 support
This commit is contained in:
parent
9900245e0b
commit
9ca53c7991
|
|
@ -333,16 +333,24 @@ static bool is_op_unsupported_case(const ggml_tensor* op) {
|
|||
static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor* op) {
|
||||
GGML_ASSERT(dev->reg != nullptr);
|
||||
|
||||
static const std::set<ggml_type> supported_types{GGML_TYPE_F32,
|
||||
GGML_TYPE_F16,
|
||||
GGML_TYPE_BF16,
|
||||
GGML_TYPE_I64,
|
||||
GGML_TYPE_I32,
|
||||
GGML_TYPE_Q4_0,
|
||||
GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_Q4_K,
|
||||
GGML_TYPE_Q8_0,
|
||||
GGML_TYPE_Q6_K};
|
||||
static std::set<ggml_type> supported_types{GGML_TYPE_F32,
|
||||
GGML_TYPE_F16,
|
||||
GGML_TYPE_BF16,
|
||||
GGML_TYPE_I64,
|
||||
GGML_TYPE_I32,
|
||||
GGML_TYPE_Q4_0,
|
||||
GGML_TYPE_Q4_1,
|
||||
GGML_TYPE_Q4_K,
|
||||
GGML_TYPE_Q8_0,
|
||||
GGML_TYPE_Q6_K};
|
||||
|
||||
std::string device = std::string(getenv("GGML_OPENVINO_DEVICE"));
|
||||
bool is_npu = device == "NPU";
|
||||
if (is_npu) {
|
||||
// NPU has poor support for asymmetric quantization
|
||||
supported_types.erase(GGML_TYPE_Q4_1);
|
||||
supported_types.erase(GGML_TYPE_Q4_K);
|
||||
}
|
||||
|
||||
static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
|
||||
GGML_OP_ADD,
|
||||
|
|
|
|||
|
|
@ -230,6 +230,10 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
|
|||
}
|
||||
|
||||
auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_point, zp_value)) {
|
||||
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
|
||||
}
|
||||
|
||||
// Quantization operations
|
||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||
|
|
@ -287,12 +291,11 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
|
|||
zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
|
||||
}
|
||||
|
||||
// CVS-166438: GGUF Q4_0 zp array (U4) with all same value (8) will be converted to single U4 scalar via ConvertU4WeightsZeroPointToScalar transformation.
|
||||
// This corner case can be handled by CPU plugin properly, but will trigger compilation error on GPU plugin.
|
||||
// Temporary workaround: add one small bias to keep the zp array shape for the GPU plugin; no accuracy impact was observed in the final LLM generation results.
|
||||
zero_point_data[0] += 1;
|
||||
|
||||
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zero_point_tensor);
|
||||
float zp_value;
|
||||
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
|
||||
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
|
||||
}
|
||||
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
|
||||
|
||||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,7 @@
|
|||
#include <cstdint>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// NOTE(review): the definition is not visible from this header hunk. Judging by
// the name, this presumably unpacks 32 packed 4-bit values from `data` into
// `dst` — confirm against the implementation before relying on buffer sizes.
void unpack_32_4(const uint8_t* data, uint8_t* dst);
|
||||
|
|
@ -42,3 +44,14 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
|
|||
ov::Tensor& scales,
|
||||
ov::Tensor& biases,
|
||||
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
|
||||
|
||||
namespace ov {
|
||||
namespace op {
|
||||
namespace util {
|
||||
// From <openvino>/src/common/transformations/include/transformations/utils/utils.hpp
// Re-declared here instead of including the OpenVINO header named above.
// NOTE(review): this relies on the linked OpenVINO library exporting the symbol
// with exactly this signature — re-check on every OpenVINO upgrade.
//
// const_node:        constant node to inspect.
// value:             output parameter; receives the value when the call returns true.
// check_value_range: defaults to true; semantics are defined by the upstream
//                    implementation — TODO confirm.
// Returns a bool; presumably true when `const_node` holds one single (repeated)
// value — confirm against the upstream header.
|
||||
bool get_single_value(const std::shared_ptr<ov::op::v0::Constant>& const_node,
|
||||
float& value,
|
||||
bool check_value_range = true);
|
||||
} // namespace util
|
||||
} // namespace op
|
||||
} // namespace ov
|
||||
|
|
|
|||
Loading…
Reference in New Issue