Quantized models run, but with an accuracy issue

2025-08-06 15:54:40 +08:00 · 2025-08-06 15:54:40 +08:00 · 663a0b8cce
parent d4ca760da8
commit 663a0b8cce
5 changed files with 33 additions and 5 deletions
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@ -14,6 +14,7 @@
 #include <map>
 #include <memory>
 #include <openvino/core/dimension.hpp>
+#include <openvino/core/except.hpp>
 #include <openvino/core/node.hpp>
 #include <openvino/core/partial_shape.hpp>
 #include <openvino/core/type/bfloat16.hpp>
@ -22,6 +23,7 @@
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/parameter.hpp>
+#include <openvino/op/unsqueeze.hpp>
 #include <openvino/runtime/tensor.hpp>
 #include <ostream>
 #include <set>
@ -415,6 +417,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
    auto node_shape = get_shape(tensor);
    auto ne_total = ggml_nelements(tensor);

+    OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name);
+
+    // F16 and F32 case
    if (node_type != ov::element::dynamic) {
        ov::Tensor weights(node_type, node_shape);
        memcpy(weights.data(), tensor->data, ne_total * node_type.size());
@ -426,6 +431,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
        return weight_node;
    }

+    // Quantized case
+    node_shape.erase(node_shape.begin());
+
    uint64_t weights_per_byte;
    if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
        weights_per_byte = 2;
@ -459,7 +467,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
    ov::Output<ov::Node> weight_node;
    if (tensor->type == GGML_TYPE_Q4_0) {
        extract_q4_0_data(tensor, weights, scales, biases);
-        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
+        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
    } else if (tensor->type == GGML_TYPE_Q4_1) {
        extract_q4_1_data(tensor, weights, scales, biases);
        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
@ -474,7 +482,17 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
        extract_q4_k_data(tensor, weights, scales, biases);
        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
    }
+
+    OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");
+    // weight_node = std::make_shared<ov::op::v0::Unsqueeze>(
+    //     weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}));
+
    weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
+    // GGML_LOG_DEBUG("Created weight node: %s   %s %s%s\n",
+    //                tensor->name,
+    //                ggml_type_name(tensor->type),
+    //                weight_node.get_element_type().get_type_name().c_str(),
+    //                weight_node.get_partial_shape().to_string().c_str());
    return weight_node.get_node_shared_ptr();
 }

--- a/ggml/src/ggml-openvino/ggml-quant.cpp
+++ b/ggml/src/ggml-openvino/ggml-quant.cpp
@ -1,4 +1,7 @@
+#include "ggml-quant.hpp"
+
 #include <cstdint>
+#include <openvino/core/parallel.hpp>
 #include <openvino/core/type/element_type_traits.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
@ -6,7 +9,6 @@
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/subtract.hpp>
 #include <openvino/runtime/tensor.hpp>
-#include <openvino/core/parallel.hpp>

 #include "ggml.h"

--- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp
@ -1,4 +1,3 @@
-#include <cstdint>
 #include <openvino/core/node.hpp>
 #include <openvino/core/node_output.hpp>
 #include <openvino/op/constant.hpp>
@ -7,6 +6,7 @@
 #include <openvino/op/reshape.hpp>
 #include <openvino/op/slice.hpp>
 #include <openvino/op/squeeze.hpp>
+#include <openvino/op/unsqueeze.hpp>

 #include "../node_context.hpp"
 #include "../op_table.hpp"
@ -31,11 +31,18 @@ OutputVector translate_get_rows(const NodeContext& context) {
        indices = process_view_input(context, 1);
    }

-    auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
+    Output<Node> axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
    if (indices.get_partial_shape()[1].get_length() == 1) {
        indices =
            std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+        if (data.get_partial_shape().rank() == 2) {
+            axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
+        }
        res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
+        if (data.get_partial_shape().rank() == 2) {
+            res =
+                std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
+        }
    } else {
        indices =
            std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@ -212,7 +212,6 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
        ov::pass::Manager manager;
        manager.set_per_pass_validation(true);
        manager.register_pass<ov::pass::MarkCompressedFloatConstants>();
-        manager.register_pass<ov::pass::ConstantFolding>();

        if (!ggml_model_decoder->is_static()) {
            const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
--- a/ggml/src/ggml-openvino/openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/openvino/utils.cpp
@ -17,6 +17,8 @@
 #include <openvino/op/transpose.hpp>
 #include <string>

+#include "ggml-impl.h"
+
 namespace ov {
 namespace frontend {
 namespace ggml {