Quantized models run, but with accuracy issues

This commit is contained in:
Yu, Zijun 2025-08-06 15:54:40 +08:00 committed by Mustafa Cavus
parent d4ca760da8
commit 663a0b8cce
5 changed files with 33 additions and 5 deletions

View File

@ -14,6 +14,7 @@
#include <map>
#include <memory>
#include <openvino/core/dimension.hpp>
#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
#include <openvino/core/partial_shape.hpp>
#include <openvino/core/type/bfloat16.hpp>
@ -22,6 +23,7 @@
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <openvino/runtime/tensor.hpp>
#include <ostream>
#include <set>
@ -415,6 +417,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
auto node_shape = get_shape(tensor);
auto ne_total = ggml_nelements(tensor);
OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name);
// F16 and F32 case
if (node_type != ov::element::dynamic) {
ov::Tensor weights(node_type, node_shape);
memcpy(weights.data(), tensor->data, ne_total * node_type.size());
@ -426,6 +431,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
return weight_node;
}
// Quantized case
node_shape.erase(node_shape.begin());
uint64_t weights_per_byte;
if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
weights_per_byte = 2;
@ -459,7 +467,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
ov::Output<ov::Node> weight_node;
if (tensor->type == GGML_TYPE_Q4_0) {
extract_q4_0_data(tensor, weights, scales, biases);
weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
} else if (tensor->type == GGML_TYPE_Q4_1) {
extract_q4_1_data(tensor, weights, scales, biases);
weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
@ -474,7 +482,17 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
extract_q4_k_data(tensor, weights, scales, biases);
weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
}
OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");
// weight_node = std::make_shared<ov::op::v0::Unsqueeze>(
// weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}));
weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
// GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n",
// tensor->name,
// ggml_type_name(tensor->type),
// weight_node.get_element_type().get_type_name().c_str(),
// weight_node.get_partial_shape().to_string().c_str());
return weight_node.get_node_shared_ptr();
}

View File

@ -1,4 +1,7 @@
#include "ggml-quant.hpp"
#include <cstdint>
#include <openvino/core/parallel.hpp>
#include <openvino/core/type/element_type_traits.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
@ -6,7 +9,6 @@
#include <openvino/op/reshape.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/runtime/tensor.hpp>
#include <openvino/core/parallel.hpp>
#include "ggml.h"

View File

@ -1,4 +1,3 @@
#include <cstdint>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/op/constant.hpp>
@ -7,6 +6,7 @@
#include <openvino/op/reshape.hpp>
#include <openvino/op/slice.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/unsqueeze.hpp>
#include "../node_context.hpp"
#include "../op_table.hpp"
@ -31,11 +31,18 @@ OutputVector translate_get_rows(const NodeContext& context) {
indices = process_view_input(context, 1);
}
auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
Output<Node> axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
if (indices.get_partial_shape()[1].get_length() == 1) {
indices =
std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
if (data.get_partial_shape().rank() == 2) {
axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
}
res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
if (data.get_partial_shape().rank() == 2) {
res =
std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
}
} else {
indices =
std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));

View File

@ -212,7 +212,6 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
ov::pass::Manager manager;
manager.set_per_pass_validation(true);
manager.register_pass<ov::pass::MarkCompressedFloatConstants>();
manager.register_pass<ov::pass::ConstantFolding>();
if (!ggml_model_decoder->is_static()) {
const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();

View File

@ -17,6 +17,8 @@
#include <openvino/op/transpose.hpp>
#include <string>
#include "ggml-impl.h"
namespace ov {
namespace frontend {
namespace ggml {