Use bias instead of zp in test-backend-ops

This commit is contained in:
Yu, Zijun 2026-02-13 17:33:07 +08:00
parent 2a6a95eb77
commit 5525bac078
9 changed files with 205 additions and 112 deletions

View File

@ -50,6 +50,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
m_is_static(is_static), m_is_static(is_static),
m_is_stateful(is_stateful), m_is_stateful(is_stateful),
m_is_prefill(is_prefill), m_is_prefill(is_prefill),
m_naive(false),
m_prefill_chunk_size(prefill_chunk_size), m_prefill_chunk_size(prefill_chunk_size),
m_cgraph(cgraph), m_cgraph(cgraph),
m_model_weights(model_weights), m_model_weights(model_weights),
@ -93,9 +94,10 @@ void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) { GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
m_cgraph = cgraph; m_cgraph = cgraph;
m_model_weights = model_weights; m_model_weights = model_weights;
m_naive = true;
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
auto * cur_node = cgraph->nodes[node_n]; auto * cur_node = cgraph->nodes[node_n];
set_input_output(cur_node, true); set_input_output(cur_node);
} }
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node); m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
@ -134,7 +136,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::sh
} }
} }
void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { void GgmlOvDecoder::set_input_output(ggml_tensor * node) {
NodeInfo current_node_info; NodeInfo current_node_info;
auto node_name = std::string(node->name); auto node_name = std::string(node->name);
auto node_output_name = node_name; auto node_output_name = node_name;
@ -169,7 +171,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
current_node_info.node_inputs_names.push_back(src_name); current_node_info.node_inputs_names.push_back(src_name);
// Add model inputs // Add model inputs
if (!naive && !src->view_src) { if (!m_naive && !src->view_src) {
ggml_backend_buffer * buffer = src->buffer; ggml_backend_buffer * buffer = src->buffer;
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
@ -206,7 +208,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
} }
// Add model outputs // Add model outputs
if (!naive) { if (!m_naive) {
// Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
static std::set<std::string> debug_output_names = {}; static std::set<std::string> debug_output_names = {};
// Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
@ -509,12 +511,14 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
return kv_param_res_names; return kv_param_res_names;
} }
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) { std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights; std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
// static std::mutex weights_mutex; // static std::mutex weights_mutex;
auto * nodes = cgraph->nodes; auto * nodes = cgraph->nodes;
auto n_nodes = cgraph->n_nodes; auto n_nodes = cgraph->n_nodes;
std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) { // std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
for (int node_i = 0; node_i < n_nodes; node_i++) {
auto * node = nodes[node_i];
for (int i = 0; i < GGML_MAX_SRC; i++) { for (int i = 0; i < GGML_MAX_SRC; i++) {
auto * src = node->src[i]; auto * src = node->src[i];
if (src == nullptr) { if (src == nullptr) {
@ -542,18 +546,19 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
// } // }
// } // }
if (model_weights.find(src_name) == model_weights.end()) { if (model_weights.find(src_name) == model_weights.end()) {
auto weight_node = create_weight_node(src); auto weight_node = create_weight_node(src, naive);
weight_node->set_friendly_name(src_name); weight_node->set_friendly_name(src_name);
model_weights[src_name] = weight_node; model_weights[src_name] = weight_node;
} }
} }
} }
} }
}); }
// });
return model_weights; return model_weights;
} }
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) { std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) {
const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer); const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
// Check if we have a pre-built constant from the OpenVINO backend buffer // Check if we have a pre-built constant from the OpenVINO backend buffer
@ -581,6 +586,11 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
} }
} }
// There are three cases where we need to create a new weight node:
// 1. weights are in openvino_host_buffer. Weight loading to host buffer will not trigger backend_buffer_set_tensor
// 2. weights are in cpu/cpu_mapped buffer. Only token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used
// 3. test-backend-ops. Buffers in test-backend-ops do not set USAGE_WEIGHT, so backend_buffer_set_tensor will not create a weight node
// GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name); // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
@ -592,6 +602,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
OvWeight ov_weight; OvWeight ov_weight;
if (ggml_is_quantized(tensor->type)) { if (ggml_is_quantized(tensor->type)) {
auto use_bias = naive;
if (is_ov_buffer) { if (is_ov_buffer) {
// For quantized weights, copy raw data to a temp buffer first because // For quantized weights, copy raw data to a temp buffer first because
// process_weight_tensor reads from data and writes extracted results // process_weight_tensor reads from data and writes extracted results
@ -600,9 +611,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
size_t raw_size = ggml_nbytes(tensor); size_t raw_size = ggml_nbytes(tensor);
std::vector<uint8_t> tmp(raw_size); std::vector<uint8_t> tmp(raw_size);
memcpy(tmp.data(), tensor->data, raw_size); memcpy(tmp.data(), tensor->data, raw_size);
ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data); ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias);
} else { } else {
ov_weight = process_weight_tensor(tensor, tensor->data, nullptr); ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias);
} }
} else { } else {
// For non-quantized weights (F16/F32/BF16), data is already in tensor->data. // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.

View File

@ -104,7 +104,7 @@ public:
virtual ov::PartialShape get_output_shape(int node_idx) const override; virtual ov::PartialShape get_output_shape(int node_idx) const override;
virtual ov::element::Type get_output_type(const int node_idx) const override; virtual ov::element::Type get_output_type(int node_idx) const override;
virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override; virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;
@ -184,9 +184,10 @@ public:
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor); static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor, bool naive = false);
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph); static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph,
bool naive = false);
const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const; const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;
@ -207,6 +208,7 @@ public:
bool m_is_static = false; bool m_is_static = false;
bool m_is_stateful = false; bool m_is_stateful = false;
bool m_is_prefill = false; bool m_is_prefill = false;
bool m_naive = false;
int m_prefill_chunk_size = 0; int m_prefill_chunk_size = 0;
static ov::Shape get_shape(const ggml_tensor * tensor); static ov::Shape get_shape(const ggml_tensor * tensor);
@ -265,7 +267,7 @@ public:
} }
private: private:
void set_input_output(ggml_tensor * node, bool naive = false); void set_input_output(ggml_tensor * node);
int compute_op_case(const ggml_tensor * node) const; int compute_op_case(const ggml_tensor * node) const;
void validate_cgraph() const; void validate_cgraph() const;

View File

@ -6,6 +6,7 @@
#include <cstring> #include <cstring>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp> #include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp> #include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
#include <optional>
ov::Core & ov_singleton_core() { ov::Core & ov_singleton_core() {
static ov::Core core; static ov::Core core;
@ -164,7 +165,7 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
} }
// Get requantization type for a tensor type (returns nullopt if no requant needed) // Get requantization type for a tensor type (returns nullopt if no requant needed)
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) { std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant) {
if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C); return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C);
} }
@ -174,6 +175,9 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
if (ggml_openvino_is_npu()) { if (ggml_openvino_is_npu()) {
return ExtraQuantType::Q4_0_128; return ExtraQuantType::Q4_0_128;
} }
if (no_requant) {
return std::nullopt;
}
switch (tensor->type) { switch (tensor->type) {
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
@ -187,7 +191,7 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
// Extracted Layout Calculation // Extracted Layout Calculation
// ===================================================== // =====================================================
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias) {
ggml_openvino_extracted_layout layout = {}; ggml_openvino_extracted_layout layout = {};
layout.is_symmetric = false; layout.is_symmetric = false;
@ -204,7 +208,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
const size_t alignment = 64; // Good for SIMD const size_t alignment = 64; // Good for SIMD
// Check if requantization is needed (NPU-specific) // Check if requantization is needed (NPU-specific)
auto requant_type = ggml_openvino_get_requant_type(tensor); auto requant_type = ggml_openvino_get_requant_type(tensor, use_bias);
if (requant_type.has_value()) { if (requant_type.has_value()) {
layout.is_requant = true; layout.is_requant = true;
layout.requant_type = requant_type; layout.requant_type = requant_type;

View File

@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name();
bool ggml_openvino_is_npu(); bool ggml_openvino_is_npu();
// Get requantization type for a tensor type (returns nullopt if no requant needed) // Get requantization type for a tensor type (returns nullopt if no requant needed)
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor); std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant = false);
// ===================================================== // =====================================================
// OpenVINO Tensor Extra Types // OpenVINO Tensor Extra Types
@ -160,7 +160,7 @@ struct ggml_openvino_extracted_layout {
}; };
// Calculate the buffer layout for extracted quantized data // Calculate the buffer layout for extracted quantized data
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor); ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias = false);
ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote); ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);

View File

@ -922,6 +922,13 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
default: default:
break; break;
} }
if (op->op == GGML_OP_GET_ROWS) {
if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) {
// ERR = 0.000000306 > 0.000000100 GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
// ERR = 0.000000197 > 0.000000100 GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
return true;
}
}
return false; return false;
} }

View File

@ -11,6 +11,7 @@
#include <cstdint> #include <cstdint>
#include <limits> #include <limits>
#include <memory> #include <memory>
#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp> #include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp> #include <openvino/core/node_output.hpp>
#include <openvino/core/parallel.hpp> #include <openvino/core/parallel.hpp>
@ -18,6 +19,7 @@
#include <openvino/core/type/element_type.hpp> #include <openvino/core/type/element_type.hpp>
#include <openvino/core/type/element_type_traits.hpp> #include <openvino/core/type/element_type_traits.hpp>
#include <openvino/core/type/float16.hpp> #include <openvino/core/type/float16.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/constant.hpp> #include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp> #include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp> #include <openvino/op/multiply.hpp>
@ -82,28 +84,41 @@ void extract_q4_0_data(const ggml_tensor * tensor,
void extract_q4_1_data(const ggml_tensor * tensor, void extract_q4_1_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr, ov::Tensor & weights_arr,
ov::Tensor & scales_arr, ov::Tensor & scales_arr,
ov::Tensor & zp_arr) { ov::Tensor & zp_arr,
bool use_bias) {
const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes min, 32x0.5 byte weights const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes min, 32x0.5 byte weights
auto * data = static_cast<uint8_t *>(tensor->data); auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data()); auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>(); auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());
ov::parallel_for(scales_arr.get_size(), [&](size_t i) { if (use_bias) {
float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)))); // Store bias (min) directly as f16 instead of computing u4 zero points
float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)))); auto * bias = zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
scales[i] = ov::float16(scale); ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
// zp = -min / scale (bias = min, so zp = -bias/scale) float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0; float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
// Pack two 4-bit zero points per byte scales[i] = ov::float16(scale);
if (i % 2 == 0) { bias[i] = ov::float16(min); // bias = min, dequant: w*s + bias
zp[i / 2] = zp_val & 0x0F; // Lower nibble unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
} else { });
zp[i / 2] |= (zp_val << 4); // Upper nibble } else {
} auto * zp = static_cast<uint8_t *>(zp_arr.data());
unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
}); float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
scales[i] = ov::float16(scale);
// zp = -min / scale (bias = min, so zp = -bias/scale)
uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
// Pack two 4-bit zero points per byte
if (i % 2 == 0) {
zp[i / 2] = zp_val & 0x0F; // Lower nibble
} else {
zp[i / 2] |= (zp_val << 4); // Upper nibble
}
unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
});
}
} }
// Extracts (weight, scales, zp) from Q8_0 tensors. // Extracts (weight, scales, zp) from Q8_0 tensors.
@ -164,14 +179,18 @@ void unpack_256_4(const uint8_t * data, uint8_t * dst) {
void extract_q4_k_data(const ggml_tensor * tensor, void extract_q4_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr, ov::Tensor & weights_arr,
ov::Tensor & scales_arr, ov::Tensor & scales_arr,
ov::Tensor & zp_arr) { ov::Tensor & zp_arr,
bool use_bias) {
const uint64_t bytes_per_block = 2 + 2 + 12 + 128; const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
auto * data = static_cast<uint8_t *>(tensor->data); auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data()); auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>(); auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());
// For bias path, zp_arr holds f16 bias values; for zp path, it holds packed u4 zero points
auto * zp_u4 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
ov::parallel_for(n_super_block, [&](size_t i) { ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block; uint8_t * block_data = data + i * bytes_per_block;
@ -205,17 +224,22 @@ void extract_q4_k_data(const ggml_tensor * tensor,
min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4)); min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4));
min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4)); min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4));
// Store scales and compute zero points // Store scales and compute zero points or bias
for (int j = 0; j < 8; j++) { for (int j = 0; j < 8; j++) {
scales[i * 8 + j] = ov::float16(scale_vals[j]); scales[i * 8 + j] = ov::float16(scale_vals[j]);
// zp = min / scale (since bias = -min and zp = -bias/scale) if (use_bias) {
uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0; // Store bias = -min directly as f16, dequant: w*s + bias
// Pack two 4-bit zero points per byte bias_f16[i * 8 + j] = ov::float16(-min_vals[j]);
size_t idx = i * 8 + j;
if (idx % 2 == 0) {
zp[idx / 2] = zp_val & 0x0F;
} else { } else {
zp[idx / 2] |= (zp_val << 4); // zp = min / scale (since bias = -min and zp = -bias/scale)
uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
// Pack two 4-bit zero points per byte
size_t idx = i * 8 + j;
if (idx % 2 == 0) {
zp_u4[idx / 2] = zp_val & 0x0F;
} else {
zp_u4[idx / 2] |= (zp_val << 4);
}
} }
} }
unpack_256_4(block_data + 16, weights + i * 128); unpack_256_4(block_data + 16, weights + i * 128);
@ -285,14 +309,18 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8
void extract_q5_k_data(const ggml_tensor * tensor, void extract_q5_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr, ov::Tensor & weights_arr,
ov::Tensor & scales_arr, ov::Tensor & scales_arr,
ov::Tensor & zp_arr) { ov::Tensor & zp_arr,
bool use_bias) {
const uint64_t bytes_per_block = 4 + 12 + 32 + 128; const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
auto * data = static_cast<uint8_t *>(tensor->data); auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data()); auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>(); auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());
// For bias path, zp_arr holds f16 bias values; for zp path, it holds u8 zero points
auto * zp_u8 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
ov::parallel_for(n_super_block, [&](size_t i) { ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block; uint8_t * block_data = data + i * bytes_per_block;
@ -325,9 +353,15 @@ void extract_q5_k_data(const ggml_tensor * tensor,
scales[i * 8 + is] = ov::float16(d1); scales[i * 8 + is] = ov::float16(d1);
scales[i * 8 + is + 1] = ov::float16(d2); scales[i * 8 + is + 1] = ov::float16(d2);
// zp = min / scale (since bias = -min and zp = -bias/scale) if (use_bias) {
zp[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0; // Store bias = -min directly as f16, dequant: w*s + bias
zp[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0; bias_f16[i * 8 + is] = ov::float16(-m1);
bias_f16[i * 8 + is + 1] = ov::float16(-m2);
} else {
// zp = min / scale (since bias = -min and zp = -bias/scale)
zp_u8[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
zp_u8[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;
}
// Extract weights for first 32 elements (matching deq formula exactly) // Extract weights for first 32 elements (matching deq formula exactly)
for (int l = 0; l < 32; ++l) { for (int l = 0; l < 32; ++l) {
@ -349,10 +383,14 @@ void extract_q5_k_data(const ggml_tensor * tensor,
// TODO Reorder for make_intX_weights // TODO Reorder for make_intX_weights
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) { ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
ov::Tensor & scales,
ov::Tensor & zp,
size_t group_size,
bool use_bias) {
ov::Shape orig_shape = weight.get_shape(); ov::Shape orig_shape = weight.get_shape();
// Expand dimensions for scales and zp // Expand dimensions for scales and zp/bias
auto scale_shape = scales.get_shape(); auto scale_shape = scales.get_shape();
auto zp_shape = zp.get_shape(); auto zp_shape = zp.get_shape();
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
@ -377,36 +415,45 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight, ov::Tensor & scales,
static_cast<uint8_t *>(weight.data()), nullptr); static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales); auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
// Zero point is already in U8 format from extraction
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
float zp_value;
if (ov::op::util::get_single_value(zero_point, zp_value)) {
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
}
// Quantization operations
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16); auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); ov::Output<ov::Node> result;
ov::Output<ov::Node> w_zp_s = if (use_bias && !is_scalar_zp) {
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); // Bias path: w * s + b (zp tensor holds f16 bias values)
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
} else {
// Zero point path: (w - zp) * s
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
float zp_value;
if (ov::op::util::get_single_value(zero_point, zp_value)) {
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
}
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
auto w_zp =
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
}
if (packed_shape.size() != 2) { if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape // If not requantized channel-wise case, reshape back to original shape
auto final_shape = auto final_shape =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape); std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false); result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
} }
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32); return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
} }
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) { ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
ov::Tensor & scales,
ov::Tensor & zp,
size_t group_size,
bool use_bias) {
ov::Shape orig_weight_shape = weight.get_shape(); ov::Shape orig_weight_shape = weight.get_shape();
// Expand dimensions for scales and zp // Expand dimensions for scales and zp/bias
ov::Shape scale_shape = scales.get_shape(); ov::Shape scale_shape = scales.get_shape();
auto zp_shape = zp.get_shape(); auto zp_shape = zp.get_shape();
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
@ -431,32 +478,35 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight, ov::Tensor & scales,
static_cast<uint8_t *>(weight.data()), nullptr); static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16); auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
// Zero point is already in U4 format from extraction
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
float zp_value;
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
}
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales); auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
// Perform dequantization ov::Output<ov::Node> result;
auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); if (use_bias && !is_scalar_zp) {
// Bias path: w * s + b (zp tensor holds f16 bias values)
ov::Output<ov::Node> w_zp_s = auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
} else {
// Zero point path: (w - zp) * s
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
float zp_value;
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
}
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
auto w_zp =
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
}
if (packed_shape.size() != 2) { if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape // If not requantized channel-wise case, reshape back to original shape
auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()}, auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()},
orig_weight_shape); orig_weight_shape);
result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
} }
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32); return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
} }
// Extract quantized weights from tensor and create weight subgraph // Extract quantized weights from tensor and create weight subgraph
@ -464,7 +514,8 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
const void * data, const void * data,
ov::Tensor & weights, ov::Tensor & weights,
ov::Tensor & scales, ov::Tensor & scales,
ov::Tensor & zp) { ov::Tensor & zp,
bool use_bias) {
// Create a temporary tensor for extraction functions that read from tensor->data // Create a temporary tensor for extraction functions that read from tensor->data
ggml_tensor temp_tensor = *tensor; ggml_tensor temp_tensor = *tensor;
temp_tensor.data = const_cast<void *>(data); temp_tensor.data = const_cast<void *>(data);
@ -499,10 +550,10 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
extract_q4_0_data(&temp_tensor, weights, scales, zp); extract_q4_0_data(&temp_tensor, weights, scales, zp);
break; break;
case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_1:
extract_q4_1_data(&temp_tensor, weights, scales, zp); extract_q4_1_data(&temp_tensor, weights, scales, zp, use_bias);
break; break;
case GGML_TYPE_Q4_K: case GGML_TYPE_Q4_K:
extract_q4_k_data(&temp_tensor, weights, scales, zp); extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias);
break; break;
case GGML_TYPE_Q8_0: case GGML_TYPE_Q8_0:
extract_q8_0_data(&temp_tensor, weights, scales, zp); extract_q8_0_data(&temp_tensor, weights, scales, zp);
@ -511,7 +562,7 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
extract_q6_k_data(&temp_tensor, weights, scales, zp); extract_q6_k_data(&temp_tensor, weights, scales, zp);
break; break;
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
extract_q5_k_data(&temp_tensor, weights, scales, zp); extract_q5_k_data(&temp_tensor, weights, scales, zp, use_bias);
break; break;
default: default:
throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type))); throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));
@ -520,9 +571,9 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
// Create the OpenVINO weight subgraph // Create the OpenVINO weight subgraph
ov::Output<ov::Node> weight_node; ov::Output<ov::Node> weight_node;
if (is_u4) { if (is_u4) {
weight_node = make_int4_weights(weights, scales, zp, weights_per_block); weight_node = make_int4_weights(weights, scales, zp, weights_per_block, use_bias);
} else { } else {
weight_node = make_int8_weights(weights, scales, zp, weights_per_block); weight_node = make_int8_weights(weights, scales, zp, weights_per_block, use_bias);
} }
auto result = weight_node.get_node_shared_ptr(); auto result = weight_node.get_node_shared_ptr();
@ -576,7 +627,7 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
return result; return result;
} }
OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) { OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr, bool use_bias) {
GGML_ASSERT(tensor != nullptr); GGML_ASSERT(tensor != nullptr);
GGML_ASSERT(data != nullptr); GGML_ASSERT(data != nullptr);
@ -619,12 +670,19 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type)); OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
} }
result.layout = ggml_openvino_get_extracted_layout(tensor); result.layout = ggml_openvino_get_extracted_layout(tensor, use_bias);
const auto & layout = result.layout; const auto & layout = result.layout;
if (layout.total_size == 0) { if (layout.total_size == 0) {
OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type)); OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
} }
if (use_bias) {
OPENVINO_ASSERT(!layout.is_requant,
"use_bias is only used for test-backend-ops, which should not have requantization");
// bias node will be created on the fly and not use backend buffer
output_base_ptr = nullptr;
}
// F16 requant path - no separate scales/zp needed in result // F16 requant path - no separate scales/zp needed in result
if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) { if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
if (output_base_ptr) { if (output_base_ptr) {
@ -653,14 +711,20 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
} else { } else {
result.weights = ov::Tensor(weight_type, node_shape); result.weights = ov::Tensor(weight_type, node_shape);
result.scales = ov::Tensor(ov::element::f16, scale_shape); result.scales = ov::Tensor(ov::element::f16, scale_shape);
result.zp = ov::Tensor(weight_type, zp_shape); if (use_bias && !layout.is_symmetric) {
// bias only has effect for asymmetric quant
result.zp = ov::Tensor(ov::element::f16, zp_shape);
} else {
result.zp = ov::Tensor(weight_type, zp_shape);
}
} }
if (layout.is_requant && layout.requant_type.has_value()) { if (layout.is_requant && layout.requant_type.has_value()) {
result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
result.weights, result.scales, result.zp); result.weights, result.scales, result.zp);
} else { } else {
result.weight_node = extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp); result.weight_node =
extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp, use_bias);
} }
return result; return result;

View File

@ -16,7 +16,8 @@ void extract_q4_0_data(const ggml_tensor * tensor,
void extract_q4_1_data(const ggml_tensor * tensor, void extract_q4_1_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr, ov::Tensor & weights_arr,
ov::Tensor & scales_arr, ov::Tensor & scales_arr,
ov::Tensor & zp_arr); ov::Tensor & zp_arr,
bool use_bias = false);
void extract_q8_0_data(const ggml_tensor * tensor, void extract_q8_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr, ov::Tensor & weights_arr,
@ -28,12 +29,14 @@ void unpack_256_4(const uint8_t* data, uint8_t* dst);
void extract_q4_k_data(const ggml_tensor * tensor, void extract_q4_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr, ov::Tensor & weights_arr,
ov::Tensor & scales_arr, ov::Tensor & scales_arr,
ov::Tensor & zp_arr); ov::Tensor & zp_arr,
bool use_bias = false);
void extract_q5_k_data(const ggml_tensor * tensor, void extract_q5_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr, ov::Tensor & weights_arr,
ov::Tensor & scales_arr, ov::Tensor & scales_arr,
ov::Tensor & zp_arr); ov::Tensor & zp_arr,
bool use_bias = false);
void extract_q6_k_data(const ggml_tensor * tensor, void extract_q6_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr, ov::Tensor & weights_arr,
@ -45,12 +48,14 @@ static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32;
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight, ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
ov::Tensor & scales, ov::Tensor & scales,
ov::Tensor & zp, ov::Tensor & zp,
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
bool use_bias = false);
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight, ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
ov::Tensor & scales, ov::Tensor & scales,
ov::Tensor & zp, ov::Tensor & zp,
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
bool use_bias = false);
// Extract quantized weights from tensor and create weight subgraph // Extract quantized weights from tensor and create weight subgraph
// If weights/scales/zp are provided (non-empty), uses them as output buffers // If weights/scales/zp are provided (non-empty), uses them as output buffers
@ -61,7 +66,8 @@ std::shared_ptr<ov::Node> extract_quantized_weights(
const void * data, // Source data pointer (may differ from tensor->data) const void * data, // Source data pointer (may differ from tensor->data)
ov::Tensor & weights, ov::Tensor & weights,
ov::Tensor & scales, ov::Tensor & scales,
ov::Tensor & zp); ov::Tensor & zp,
bool use_bias = false); // Use fp bias instead of quantized zero_point (for test-backend-ops)
// Requantize weights from tensor to target format, writing to provided buffers // Requantize weights from tensor to target format, writing to provided buffers
// For F16 target, only weights buffer is used (scales/zp ignored) // For F16 target, only weights buffer is used (scales/zp ignored)
@ -112,8 +118,9 @@ struct OvWeight {
// Returns OvWeight with the weight node and optional quantized tensors // Returns OvWeight with the weight node and optional quantized tensors
OvWeight process_weight_tensor( OvWeight process_weight_tensor(
const ggml_tensor * tensor, const ggml_tensor * tensor,
const void * data, // Source data pointer (may differ from tensor->data) const void * data, // Source data pointer (may differ from tensor->data)
void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation) void * output_base_ptr = nullptr, // Base pointer for output buffers (or nullptr for internal allocation)
bool use_bias = false); // Use fp bias instead of quantized zero_point, only used in test-backend-ops
void quantize_q4_0(const float * x, void quantize_q4_0(const float * x,
ov::Tensor & weights_arr, ov::Tensor & weights_arr,

View File

@ -127,7 +127,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
if (pos_data[0] == 0) { if (pos_data[0] == 0) {
infer_request->reset_state(); infer_request->reset_state();
stateful_kv_size = pos_shape[3]; stateful_kv_size = pos_shape[3];
} else if (stateful_kv_size == pos_data[0]) { } else if (stateful_kv_size == static_cast<size_t>(pos_data[0])) {
stateful_kv_size += pos_shape[3]; stateful_kv_size += pos_shape[3];
} else { } else {
auto states = infer_request->query_state(); auto states = infer_request->query_state();
@ -139,7 +139,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
state.set_state(new_state_tensor); state.set_state(new_state_tensor);
} }
stateful_kv_size = pos_data[0] + 1; stateful_kv_size = pos_data[0] + 1;
} }
} }
decoder_end_time = ggml_time_us(); decoder_end_time = ggml_time_us();
@ -467,10 +467,10 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
return GGML_STATUS_SUCCESS; return GGML_STATUS_SUCCESS;
} }
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); bool naive = true;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, naive);
auto decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights); auto decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights);
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder); auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
auto naive = true;
auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
if (getenv("GGML_OPENVINO_DUMP_IR")) { if (getenv("GGML_OPENVINO_DUMP_IR")) {
ov::serialize(model, "IR_naive.xml"); ov::serialize(model, "IR_naive.xml");

View File

@ -233,9 +233,7 @@ if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
llama_build_and_test(test-opt.cpp) llama_build_and_test(test-opt.cpp)
endif() endif()
llama_build_and_test(test-gguf.cpp) llama_build_and_test(test-gguf.cpp)
if (NOT GGML_OPENVINO) llama_build_and_test(test-backend-ops.cpp)
llama_build_and_test(test-backend-ops.cpp)
endif()
llama_build_and_test(test-model-load-cancel.cpp LABEL "model") llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
llama_build_and_test(test-autorelease.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model")