From 663a0b8cce302fcf9f56d7b5019d427ff6e60689 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Wed, 6 Aug 2025 15:54:40 +0800 Subject: [PATCH] Quant models run with accuracy issue --- ggml/src/ggml-openvino/ggml-decoder.cpp | 20 ++++++++++++++++++- ggml/src/ggml-openvino/ggml-quant.cpp | 4 +++- .../ggml-openvino/openvino/op/get_rows.cpp | 11 ++++++++-- .../openvino/translate_session.cpp | 1 - ggml/src/ggml-openvino/openvino/utils.cpp | 2 ++ 5 files changed, 33 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c2e164b808..a3e7059fa2 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -415,6 +417,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) auto node_shape = get_shape(tensor); auto ne_total = ggml_nelements(tensor); + OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name); + + // F16 and F32 case if (node_type != ov::element::dynamic) { ov::Tensor weights(node_type, node_shape); memcpy(weights.data(), tensor->data, ne_total * node_type.size()); @@ -426,6 +431,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) return weight_node; } + // Quantized case + node_shape.erase(node_shape.begin()); + uint64_t weights_per_byte; if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { weights_per_byte = 2; @@ -459,7 +467,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) ov::Output weight_node; if (tensor->type == GGML_TYPE_Q4_0) { extract_q4_0_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + weight_node = make_int4_weights(weights, scales, biases, weights_per_block); } else if (tensor->type == GGML_TYPE_Q4_1) { extract_q4_1_data(tensor, weights, scales, biases); weight_node = make_int4_weights(weights, scales, biases, weights_per_block); @@ -474,7 +482,17 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) extract_q4_k_data(tensor, weights, scales, biases); weight_node = make_int4_weights(weights, scales, biases, weights_per_block); } + + OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D"); + // weight_node = std::make_shared( + // weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0})); + weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name); + // GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n", + // tensor->name, + // ggml_type_name(tensor->type), + // weight_node.get_element_type().get_type_name().c_str(), + // weight_node.get_partial_shape().to_string().c_str()); return weight_node.get_node_shared_ptr(); } diff --git a/ggml/src/ggml-openvino/ggml-quant.cpp b/ggml/src/ggml-openvino/ggml-quant.cpp index 4311ab138e..14ef58a3f7 100644 --- a/ggml/src/ggml-openvino/ggml-quant.cpp +++ b/ggml/src/ggml-openvino/ggml-quant.cpp @@ -1,4 +1,7 @@ +#include "ggml-quant.hpp" + #include +#include #include #include #include @@ -6,7 +9,6 @@ #include #include #include -#include #include "ggml.h" diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 36795fd43e..0de77da59f 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -7,6 +6,7 @@ #include #include #include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -31,11 +31,18 @@ OutputVector translate_get_rows(const NodeContext& context) { indices = process_view_input(context, 1); } - auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); + Output axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1}); if (indices.get_partial_shape()[1].get_length() == 1) { indices = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); + if (data.get_partial_shape().rank() == 2) { + axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + } res = std::make_shared(data, indices, axis); + if (data.get_partial_shape().rank() == 2) { + res = + std::make_shared(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + } } else { indices = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 3e27a689d5..6280467041 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -212,7 +212,6 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptr(); - manager.register_pass(); if (!ggml_model_decoder->is_static()) { const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names(); diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index c4197ccc3a..ef5f51ebbc 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -17,6 +17,8 @@ #include #include +#include "ggml-impl.h" + namespace ov { namespace frontend { namespace ggml {