Dequantize q4_1 q4_k q6_k for NPU

This commit is contained in:
Yu, Zijun 2025-08-29 11:39:27 +08:00 committed by Mustafa Cavus
parent 82c98335d3
commit b593428eb3
4 changed files with 26 additions and 18 deletions

View File

@@ -370,7 +370,8 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
return kv_param_res_names; return kv_param_res_names;
} }
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) { std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights; std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex; static std::mutex weights_mutex;
auto* nodes = cgraph->nodes; auto* nodes = cgraph->nodes;
@@ -395,7 +396,7 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
} }
} }
if (should_create) { if (should_create) {
auto weight_node = create_weight_node(src); auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0);
weight_node->set_friendly_name(src_name); weight_node->set_friendly_name(src_name);
{ {
std::lock_guard<std::mutex> lock(weights_mutex); std::lock_guard<std::mutex> lock(weights_mutex);
@@ -409,7 +410,7 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
return model_weights; return model_weights;
} }
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) {
std::set<ggml_type> weight_types = { std::set<ggml_type> weight_types = {
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K}; GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
if (weight_types.find(tensor->type) == weight_types.end()) { if (weight_types.find(tensor->type) == weight_types.end()) {
@@ -422,15 +423,17 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
auto ne_total = ggml_nelements(tensor); auto ne_total = ggml_nelements(tensor);
OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name); OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name);
node_shape.erase(node_shape.begin());
// F16 and F32 case // F16 and F32 case
if (node_type != ov::element::dynamic) { if (node_type != ov::element::dynamic) {
ov::Tensor weights(node_type, node_shape); ov::Tensor weights(node_type, node_shape);
memcpy(weights.data(), tensor->data, ne_total * node_type.size()); memcpy(weights.data(), tensor->data, ne_total * node_type.size());
std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights); std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
if (node_type == ov::element::f16) { // Disabled because it triggers a bug in NPUW, no performance impact on CPU GPU
weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32); // if (node_type == ov::element::f16) {
} // weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32);
// }
weight_node->set_friendly_name(tensor->name); weight_node->set_friendly_name(tensor->name);
return weight_node; return weight_node;
} }
@@ -440,7 +443,15 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
tensor->extra == nullptr, tensor->extra == nullptr,
"Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights"); "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");
node_shape.erase(node_shape.begin()); if (to_dequantize) {
std::vector<float> weights_f32(ne_total);
ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
ov::Tensor weights(ov::element::f16, node_shape);
ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
weight_node->set_friendly_name(tensor->name);
return weight_node;
}
uint64_t weights_per_byte; uint64_t weights_per_byte;
if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {

View File

@@ -117,8 +117,9 @@ public:
static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename); static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor); static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor, bool to_dequantize);
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph); static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize = {});
const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
const ggml_tensor* get_tensor_from_name(const std::string& name) const; const ggml_tensor* get_tensor_from_name(const std::string& name) const;

View File

@@ -344,14 +344,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
GGML_TYPE_Q8_0, GGML_TYPE_Q8_0,
GGML_TYPE_Q6_K}; GGML_TYPE_Q6_K};
std::string device = std::string(getenv("GGML_OPENVINO_DEVICE"));
bool is_npu = device == "NPU";
if (is_npu) {
// NPU has poor support for asymmetric quantization
supported_types.erase(GGML_TYPE_Q4_1);
supported_types.erase(GGML_TYPE_Q4_K);
}
static const std::set<ggml_op> supported_ops{GGML_OP_NONE, static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
GGML_OP_ADD, GGML_OP_ADD,
GGML_OP_MUL, GGML_OP_MUL,

View File

@@ -130,7 +130,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
compile_end_time = conversion_end_time; compile_end_time = conversion_end_time;
} else { } else {
std::shared_ptr<ov::Model> model; std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); std::set<ggml_type> types_to_dequantize;
if (is_static) {
types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
}
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize);
if (is_static) { if (is_static) {
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true); ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);