From b593428eb30cb5daf8000fde044a4f1da35f86f4 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Fri, 29 Aug 2025 11:39:27 +0800
Subject: [PATCH] Dequantize q4_1 q4_k q6_k for NPU

---
 ggml/src/ggml-openvino/ggml-decoder.cpp  | 25 ++++++++++++++++++-------
 ggml/src/ggml-openvino/ggml-decoder.h    |  5 +++--
 ggml/src/ggml-openvino/ggml-openvino.cpp |  8 --------
 ggml/src/ggml-openvino/utils.cpp         |  6 +++++-
 4 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index b20bfd0c76..fef8648ebd 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -370,7 +370,8 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
     return kv_param_res_names;
 }
 
-std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(struct ggml_cgraph* cgraph) {
+std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
+    struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize) {
     std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     static std::mutex weights_mutex;
     auto* nodes = cgraph->nodes;
@@ -395,7 +396,7 @@
                 }
             }
             if (should_create) {
-                auto weight_node = create_weight_node(src);
+                auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0);
                 weight_node->set_friendly_name(src_name);
                 {
                     std::lock_guard<std::mutex> lock(weights_mutex);
@@ -409,7 +410,7 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
     return model_weights;
 }
 
-std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
+std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) {
     std::set<ggml_type> weight_types = {
         GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
     if (weight_types.find(tensor->type) == weight_types.end()) {
@@ -422,15 +423,17 @@
     auto ne_total = ggml_nelements(tensor);
 
     OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name);
+    node_shape.erase(node_shape.begin());
 
     // F16 and F32 case
     if (node_type != ov::element::dynamic) {
         ov::Tensor weights(node_type, node_shape);
         memcpy(weights.data(), tensor->data, ne_total * node_type.size());
         std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
-        if (node_type == ov::element::f16) {
-            weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32);
-        }
+        // Disabled because it triggers a bug in NPUW; no performance impact on CPU/GPU
+        // if (node_type == ov::element::f16) {
+        //     weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32);
+        // }
         weight_node->set_friendly_name(tensor->name);
         return weight_node;
     }
@@ -440,7 +443,15 @@
         tensor->extra == nullptr,
         "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");
 
-    node_shape.erase(node_shape.begin());
+    if (to_dequantize) {
+        std::vector<float> weights_f32(ne_total);
+        ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
+        ov::Tensor weights(ov::element::f16, node_shape);
+        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
+        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
+        weight_node->set_friendly_name(tensor->name);
+        return weight_node;
+    }
 
     uint64_t weights_per_byte;
     if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
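Note on the new to_dequantize branch above: it round-trips the quantized
weights through ggml's type traits (quantized -> f32 -> f16) and hands the
result to OpenVINO as a plain f16 constant, so the NPU never sees the
quantized formats at all. Below is a minimal standalone sketch of that round
trip; it is not part of the patch, main(), the row length, and the sample
values are made up for illustration, and it assumes it is built inside the
llama.cpp tree so that "ggml.h" and the ggml library are available.

#include "ggml.h"

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t n = 64;  // Q4_1 blocks hold 32 values, so use a multiple of 32
    std::vector<float> src(n);
    for (int64_t i = 0; i < n; ++i) {
        src[i] = 0.05f * (float) i - 1.0f;
    }

    // Quantize one row, standing in for weights loaded from a GGUF file.
    std::vector<uint8_t> quantized(ggml_row_size(GGML_TYPE_Q4_1, n));
    ggml_quantize_chunk(GGML_TYPE_Q4_1, src.data(), quantized.data(), 0, 1, n, nullptr);

    // The same two trait calls the patch uses: quantized -> f32, then f32 -> f16.
    std::vector<float> as_f32(n);
    ggml_get_type_traits(GGML_TYPE_Q4_1)->to_float(quantized.data(), as_f32.data(), n);
    std::vector<ggml_fp16_t> as_f16(n);
    ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(as_f32.data(), as_f16.data(), n);

    printf("src[5] = %f, after Q4_1 round trip = %f\n", (double) src[5], (double) as_f32[5]);
    return 0;
}

The trade-off is size for compatibility: the f16 constants are several times
larger than the 4- and 6-bit blocks they replace, but dequantization now
happens once on the host at graph-build time instead of relying on the NPU's
handling of these weight formats.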
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index df23c649f4..b446841514 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -117,8 +117,9 @@ public:
 
     static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
 
-    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
-    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(struct ggml_cgraph* cgraph);
+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor, bool to_dequantize);
+    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
+        struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize = {});
 
     const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
     const ggml_tensor* get_tensor_from_name(const std::string& name) const;
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index a6ec1c64c2..60a2eb388e 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -344,14 +344,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
                                                        GGML_TYPE_Q8_0,
                                                        GGML_TYPE_Q6_K};
 
-    std::string device = std::string(getenv("GGML_OPENVINO_DEVICE"));
-    bool is_npu = device == "NPU";
-    if (is_npu) {
-        // NPU has poor support for asymmetric quantization
-        supported_types.erase(GGML_TYPE_Q4_1);
-        supported_types.erase(GGML_TYPE_Q4_K);
-    }
-
     static const std::set<ggml_op> supported_ops{GGML_OP_NONE,
                                                  GGML_OP_ADD,
                                                  GGML_OP_MUL,
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 43fa0c469d..e49d941da4 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -130,7 +130,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         compile_end_time = conversion_end_time;
     } else {
         std::shared_ptr<ov::Model> model;
-        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
+        std::set<ggml_type> types_to_dequantize;
+        if (is_static) {
+            types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
+        }
+        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize);
         if (is_static) {
             ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);
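Background on the removed supports_op carve-out: Q4_1 and Q4_K are
asymmetric formats, where each block stores a scale d and an offset m and
values reconstruct as x = d * q + m, unlike symmetric Q4_0's x = d * (q - 8).
The sketch below, which is not part of the patch, decodes one Q4_1 block by
hand to make the offset term visible. dequant_q4_1_block is a hypothetical
helper mirroring ggml's own dequantize_row_q4_1, and the 20-byte block
layout (fp16 d, fp16 m, 16 packed nibbles) is my reading of ggml-common.h.

#include "ggml.h"

#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical helper: decode one 32-value Q4_1 block
// (2-byte fp16 scale, 2-byte fp16 min, 16 bytes of packed 4-bit values).
static void dequant_q4_1_block(const uint8_t * block, float * out) {
    ggml_fp16_t dh, mh;
    memcpy(&dh, block + 0, sizeof(dh));
    memcpy(&mh, block + 2, sizeof(mh));
    const float d = ggml_fp16_to_fp32(dh);  // per-block scale
    const float m = ggml_fp16_to_fp32(mh);  // per-block offset: the asymmetric part
    const uint8_t * qs = block + 4;
    for (int i = 0; i < 16; ++i) {
        out[i]      = d * (float) (qs[i] & 0x0F) + m;  // low nibbles
        out[i + 16] = d * (float) (qs[i] >> 4) + m;    // high nibbles; Q4_0 would be d * (q - 8)
    }
}

int main() {
    float src[32], out[32];
    for (int i = 0; i < 32; ++i) {
        src[i] = 2.0f + 0.1f * (float) i;  // all-positive data, where the offset earns its keep
    }
    uint8_t block[20];  // ggml_row_size(GGML_TYPE_Q4_1, 32) == 20
    ggml_quantize_chunk(GGML_TYPE_Q4_1, src, block, 0, 1, 32, nullptr);
    dequant_q4_1_block(block, out);
    printf("src[7] = %f, reconstructed = %f\n", (double) src[7], (double) out[7]);
    return 0;
}

The nonzero offset m is what the removed comment blamed for the NPU's poor
support. Dequantizing those types on the host (per the utils.cpp hunk, only
when is_static, which corresponds to the NPU path) sidesteps the issue,
while CPU and GPU keep consuming the quantized weights directly.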