From 663a0b8cce302fcf9f56d7b5019d427ff6e60689 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun" <zijun.yu@intel.com>
Date: Wed, 6 Aug 2025 15:54:40 +0800
Subject: [PATCH] Quant models run, but with accuracy issues

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       | 20 ++++++++++++++++++-
 ggml/src/ggml-openvino/ggml-quant.cpp         |  4 +++-
 .../ggml-openvino/openvino/op/get_rows.cpp    | 11 ++++++++--
 .../openvino/translate_session.cpp            |  1 -
 ggml/src/ggml-openvino/openvino/utils.cpp     |  2 ++
 5 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index c2e164b808..a3e7059fa2 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -14,6 +14,7 @@
 #include <map>
 #include <memory>
 #include <openvino/core/dimension.hpp>
+#include <openvino/core/except.hpp>
 #include <openvino/core/node.hpp>
 #include <openvino/core/partial_shape.hpp>
 #include <openvino/core/type/bfloat16.hpp>
@@ -22,6 +23,7 @@
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/parameter.hpp>
+#include <openvino/op/unsqueeze.hpp>
 #include <openvino/runtime/tensor.hpp>
 #include <ostream>
 #include <set>
@@ -415,6 +417,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
     auto node_shape = get_shape(tensor);
     auto ne_total = ggml_nelements(tensor);
 
+    OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name);
+
+    // F16 and F32 case
     if (node_type != ov::element::dynamic) {
         ov::Tensor weights(node_type, node_shape);
         memcpy(weights.data(), tensor->data, ne_total * node_type.size());
@@ -426,6 +431,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
         return weight_node;
     }
 
+    // Quantized case
+    node_shape.erase(node_shape.begin());
+
     uint64_t weights_per_byte;
     if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
         weights_per_byte = 2;
@@ -459,7 +467,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
     ov::Output<ov::Node> weight_node;
     if (tensor->type == GGML_TYPE_Q4_0) {
         extract_q4_0_data(tensor, weights, scales, biases);
-        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
+        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
     } else if (tensor->type == GGML_TYPE_Q4_1) {
         extract_q4_1_data(tensor, weights, scales, biases);
         weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
@@ -474,7 +482,17 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
         extract_q4_k_data(tensor, weights, scales, biases);
         weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
     }
+
+    OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");
+    // weight_node = std::make_shared<ov::op::v0::Unsqueeze>(
+    //     weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}));
+
     weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
+    // GGML_LOG_DEBUG("Created weight node: %s   %s %s%s\n",
+    //                tensor->name,
+    //                ggml_type_name(tensor->type),
+    //                weight_node.get_element_type().get_type_name().c_str(),
+    //                weight_node.get_partial_shape().to_string().c_str());
     return weight_node.get_node_shared_ptr();
 }
 
diff --git a/ggml/src/ggml-openvino/ggml-quant.cpp b/ggml/src/ggml-openvino/ggml-quant.cpp
index 4311ab138e..14ef58a3f7 100644
--- a/ggml/src/ggml-openvino/ggml-quant.cpp
+++ b/ggml/src/ggml-openvino/ggml-quant.cpp
@@ -1,4 +1,7 @@
+#include "ggml-quant.hpp"
+
 #include <cstdint>
+#include <openvino/core/parallel.hpp>
 #include <openvino/core/type/element_type_traits.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
@@ -6,7 +9,6 @@
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/subtract.hpp>
 #include <openvino/runtime/tensor.hpp>
-#include <openvino/core/parallel.hpp>
 
 #include "ggml.h"
 
diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
index 36795fd43e..0de77da59f 100644
--- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
@@ -1,4 +1,3 @@
-#include <cstdint>
 #include <openvino/core/node.hpp>
 #include <openvino/core/node_output.hpp>
 #include <openvino/op/constant.hpp>
@@ -7,6 +6,7 @@
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/slice.hpp>
 #include <openvino/op/squeeze.hpp>
+#include <openvino/op/unsqueeze.hpp>
 
 #include "../node_context.hpp"
 #include "../op_table.hpp"
@@ -31,11 +31,18 @@ OutputVector translate_get_rows(const NodeContext& context) {
         indices = process_view_input(context, 1);
     }
 
-    auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
+    Output<Node> axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
     if (indices.get_partial_shape()[1].get_length() == 1) {
         indices =
             std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+        if (data.get_partial_shape().rank() == 2) {
+            axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
+        }
         res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
+        if (data.get_partial_shape().rank() == 2) {
+            res =
+                std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
+        }
     } else {
         indices =
             std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 3e27a689d5..6280467041 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -212,7 +212,6 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
         ov::pass::Manager manager;
         manager.set_per_pass_validation(true);
         manager.register_pass<ov::pass::MarkCompressedFloatConstants>();
-        manager.register_pass<ov::pass::ConstantFolding>();
 
         if (!ggml_model_decoder->is_static()) {
             const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp
index c4197ccc3a..ef5f51ebbc 100644
--- a/ggml/src/ggml-openvino/openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/openvino/utils.cpp
@@ -17,6 +17,8 @@
 #include <openvino/op/transpose.hpp>
 #include <string>
 
+#include "ggml-impl.h"
+
 namespace ov {
 namespace frontend {
 namespace ggml {