Quantized models run, but with accuracy issues
This commit is contained in:
parent
d4ca760da8
commit
663a0b8cce
|
|
@ -14,6 +14,7 @@
|
|||
#include <map>
|
||||
#include <memory>
|
||||
#include <openvino/core/dimension.hpp>
|
||||
#include <openvino/core/except.hpp>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/partial_shape.hpp>
|
||||
#include <openvino/core/type/bfloat16.hpp>
|
||||
|
|
@ -22,6 +23,7 @@
|
|||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
#include <openvino/op/parameter.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
#include <ostream>
|
||||
#include <set>
|
||||
|
|
@ -415,6 +417,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
|
|||
auto node_shape = get_shape(tensor);
|
||||
auto ne_total = ggml_nelements(tensor);
|
||||
|
||||
OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name);
|
||||
|
||||
// F16 and F32 case
|
||||
if (node_type != ov::element::dynamic) {
|
||||
ov::Tensor weights(node_type, node_shape);
|
||||
memcpy(weights.data(), tensor->data, ne_total * node_type.size());
|
||||
|
|
@ -426,6 +431,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
|
|||
return weight_node;
|
||||
}
|
||||
|
||||
// Quantized case
|
||||
node_shape.erase(node_shape.begin());
|
||||
|
||||
uint64_t weights_per_byte;
|
||||
if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
|
||||
weights_per_byte = 2;
|
||||
|
|
@ -459,7 +467,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
|
|||
ov::Output<ov::Node> weight_node;
|
||||
if (tensor->type == GGML_TYPE_Q4_0) {
|
||||
extract_q4_0_data(tensor, weights, scales, biases);
|
||||
weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
|
||||
weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
|
||||
} else if (tensor->type == GGML_TYPE_Q4_1) {
|
||||
extract_q4_1_data(tensor, weights, scales, biases);
|
||||
weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
|
||||
|
|
@ -474,7 +482,17 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
|
|||
extract_q4_k_data(tensor, weights, scales, biases);
|
||||
weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
|
||||
}
|
||||
|
||||
OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");
|
||||
// weight_node = std::make_shared<ov::op::v0::Unsqueeze>(
|
||||
// weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}));
|
||||
|
||||
weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
|
||||
// GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n",
|
||||
// tensor->name,
|
||||
// ggml_type_name(tensor->type),
|
||||
// weight_node.get_element_type().get_type_name().c_str(),
|
||||
// weight_node.get_partial_shape().to_string().c_str());
|
||||
return weight_node.get_node_shared_ptr();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,7 @@
|
|||
#include "ggml-quant.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <openvino/core/parallel.hpp>
|
||||
#include <openvino/core/type/element_type_traits.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/convert.hpp>
|
||||
|
|
@ -6,7 +9,6 @@
|
|||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/subtract.hpp>
|
||||
#include <openvino/runtime/tensor.hpp>
|
||||
#include <openvino/core/parallel.hpp>
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
#include <cstdint>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
|
|
@ -7,6 +6,7 @@
|
|||
#include <openvino/op/reshape.hpp>
|
||||
#include <openvino/op/slice.hpp>
|
||||
#include <openvino/op/squeeze.hpp>
|
||||
#include <openvino/op/unsqueeze.hpp>
|
||||
|
||||
#include "../node_context.hpp"
|
||||
#include "../op_table.hpp"
|
||||
|
|
@ -31,11 +31,18 @@ OutputVector translate_get_rows(const NodeContext& context) {
|
|||
indices = process_view_input(context, 1);
|
||||
}
|
||||
|
||||
auto axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
|
||||
Output<Node> axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {1});
|
||||
if (indices.get_partial_shape()[1].get_length() == 1) {
|
||||
indices =
|
||||
std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
|
||||
if (data.get_partial_shape().rank() == 2) {
|
||||
axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0});
|
||||
}
|
||||
res = std::make_shared<ov::op::v8::Gather>(data, indices, axis);
|
||||
if (data.get_partial_shape().rank() == 2) {
|
||||
res =
|
||||
std::make_shared<ov::op::v0::Unsqueeze>(res, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
||||
}
|
||||
} else {
|
||||
indices =
|
||||
std::make_shared<ov::op::v0::Squeeze>(indices, ov::op::v0::Constant::create(ov::element::i64, {1}, {0}));
|
||||
|
|
|
|||
|
|
@ -212,7 +212,6 @@ std::shared_ptr<Model> TranslateSession::apply_transformations(std::shared_ptr<M
|
|||
ov::pass::Manager manager;
|
||||
manager.set_per_pass_validation(true);
|
||||
manager.register_pass<ov::pass::MarkCompressedFloatConstants>();
|
||||
manager.register_pass<ov::pass::ConstantFolding>();
|
||||
|
||||
if (!ggml_model_decoder->is_static()) {
|
||||
const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
|
||||
|
|
|
|||
|
|
@ -17,6 +17,8 @@
|
|||
#include <openvino/op/transpose.hpp>
|
||||
#include <string>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
|
||||
namespace ov {
|
||||
namespace frontend {
|
||||
namespace ggml {
|
||||
|
|
|
|||
Loading…
Reference in New Issue