Use bias instead of zp in test-backend-ops
This commit is contained in:
parent
2a6a95eb77
commit
5525bac078
|
|
@ -50,6 +50,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
|
||||||
m_is_static(is_static),
|
m_is_static(is_static),
|
||||||
m_is_stateful(is_stateful),
|
m_is_stateful(is_stateful),
|
||||||
m_is_prefill(is_prefill),
|
m_is_prefill(is_prefill),
|
||||||
|
m_naive(false),
|
||||||
m_prefill_chunk_size(prefill_chunk_size),
|
m_prefill_chunk_size(prefill_chunk_size),
|
||||||
m_cgraph(cgraph),
|
m_cgraph(cgraph),
|
||||||
m_model_weights(model_weights),
|
m_model_weights(model_weights),
|
||||||
|
|
@ -93,9 +94,10 @@ void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
|
||||||
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
|
GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
|
||||||
m_cgraph = cgraph;
|
m_cgraph = cgraph;
|
||||||
m_model_weights = model_weights;
|
m_model_weights = model_weights;
|
||||||
|
m_naive = true;
|
||||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||||
auto * cur_node = cgraph->nodes[node_n];
|
auto * cur_node = cgraph->nodes[node_n];
|
||||||
set_input_output(cur_node, true);
|
set_input_output(cur_node);
|
||||||
}
|
}
|
||||||
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
||||||
m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
|
m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
|
||||||
|
|
@ -134,7 +136,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::sh
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
|
void GgmlOvDecoder::set_input_output(ggml_tensor * node) {
|
||||||
NodeInfo current_node_info;
|
NodeInfo current_node_info;
|
||||||
auto node_name = std::string(node->name);
|
auto node_name = std::string(node->name);
|
||||||
auto node_output_name = node_name;
|
auto node_output_name = node_name;
|
||||||
|
|
@ -169,7 +171,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
|
||||||
current_node_info.node_inputs_names.push_back(src_name);
|
current_node_info.node_inputs_names.push_back(src_name);
|
||||||
|
|
||||||
// Add model inputs
|
// Add model inputs
|
||||||
if (!naive && !src->view_src) {
|
if (!m_naive && !src->view_src) {
|
||||||
ggml_backend_buffer * buffer = src->buffer;
|
ggml_backend_buffer * buffer = src->buffer;
|
||||||
|
|
||||||
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
|
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
|
||||||
|
|
@ -206,7 +208,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add model outputs
|
// Add model outputs
|
||||||
if (!naive) {
|
if (!m_naive) {
|
||||||
// Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
|
// Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
|
||||||
static std::set<std::string> debug_output_names = {};
|
static std::set<std::string> debug_output_names = {};
|
||||||
// Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
|
// Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
|
||||||
|
|
@ -509,12 +511,14 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
|
||||||
return kv_param_res_names;
|
return kv_param_res_names;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) {
|
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
|
||||||
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
|
||||||
// static std::mutex weights_mutex;
|
// static std::mutex weights_mutex;
|
||||||
auto * nodes = cgraph->nodes;
|
auto * nodes = cgraph->nodes;
|
||||||
auto n_nodes = cgraph->n_nodes;
|
auto n_nodes = cgraph->n_nodes;
|
||||||
std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
|
// std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
|
||||||
|
for (int node_i = 0; node_i < n_nodes; node_i++) {
|
||||||
|
auto * node = nodes[node_i];
|
||||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
auto * src = node->src[i];
|
auto * src = node->src[i];
|
||||||
if (src == nullptr) {
|
if (src == nullptr) {
|
||||||
|
|
@ -542,18 +546,19 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
if (model_weights.find(src_name) == model_weights.end()) {
|
if (model_weights.find(src_name) == model_weights.end()) {
|
||||||
auto weight_node = create_weight_node(src);
|
auto weight_node = create_weight_node(src, naive);
|
||||||
weight_node->set_friendly_name(src_name);
|
weight_node->set_friendly_name(src_name);
|
||||||
model_weights[src_name] = weight_node;
|
model_weights[src_name] = weight_node;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
|
// });
|
||||||
return model_weights;
|
return model_weights;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
|
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) {
|
||||||
const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
|
const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
|
||||||
|
|
||||||
// Check if we have a pre-built constant from the OpenVINO backend buffer
|
// Check if we have a pre-built constant from the OpenVINO backend buffer
|
||||||
|
|
@ -581,6 +586,11 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// There are three cases where we need to create a new weight node:
|
||||||
|
// 1. weights are in openvino_host_buffer. Weight loading to host buffer will not trigger backend_buffer_set_tensor
|
||||||
|
// 2. weights are in cpu/cpu_mapped buffer. Only token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used
|
||||||
|
// 3. test-backend-ops. Buffers in test-backend-ops do not set USAGE_WEIGHT, so backend_buffer_set_tensor will not create a weight node
|
||||||
|
|
||||||
// GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
|
// GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
|
||||||
static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
|
static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
|
||||||
GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
|
||||||
|
|
@ -592,6 +602,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
|
||||||
|
|
||||||
OvWeight ov_weight;
|
OvWeight ov_weight;
|
||||||
if (ggml_is_quantized(tensor->type)) {
|
if (ggml_is_quantized(tensor->type)) {
|
||||||
|
auto use_bias = naive;
|
||||||
if (is_ov_buffer) {
|
if (is_ov_buffer) {
|
||||||
// For quantized weights, copy raw data to a temp buffer first because
|
// For quantized weights, copy raw data to a temp buffer first because
|
||||||
// process_weight_tensor reads from data and writes extracted results
|
// process_weight_tensor reads from data and writes extracted results
|
||||||
|
|
@ -600,9 +611,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
|
||||||
size_t raw_size = ggml_nbytes(tensor);
|
size_t raw_size = ggml_nbytes(tensor);
|
||||||
std::vector<uint8_t> tmp(raw_size);
|
std::vector<uint8_t> tmp(raw_size);
|
||||||
memcpy(tmp.data(), tensor->data, raw_size);
|
memcpy(tmp.data(), tensor->data, raw_size);
|
||||||
ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
|
ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias);
|
||||||
} else {
|
} else {
|
||||||
ov_weight = process_weight_tensor(tensor, tensor->data, nullptr);
|
ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
|
// For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
|
||||||
|
|
|
||||||
|
|
@ -104,7 +104,7 @@ public:
|
||||||
|
|
||||||
virtual ov::PartialShape get_output_shape(int node_idx) const override;
|
virtual ov::PartialShape get_output_shape(int node_idx) const override;
|
||||||
|
|
||||||
virtual ov::element::Type get_output_type(const int node_idx) const override;
|
virtual ov::element::Type get_output_type(int node_idx) const override;
|
||||||
|
|
||||||
virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;
|
virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;
|
||||||
|
|
||||||
|
|
@ -184,9 +184,10 @@ public:
|
||||||
|
|
||||||
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
|
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
|
||||||
|
|
||||||
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor);
|
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor, bool naive = false);
|
||||||
|
|
||||||
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph);
|
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph,
|
||||||
|
bool naive = false);
|
||||||
|
|
||||||
const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;
|
const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;
|
||||||
|
|
||||||
|
|
@ -207,6 +208,7 @@ public:
|
||||||
bool m_is_static = false;
|
bool m_is_static = false;
|
||||||
bool m_is_stateful = false;
|
bool m_is_stateful = false;
|
||||||
bool m_is_prefill = false;
|
bool m_is_prefill = false;
|
||||||
|
bool m_naive = false;
|
||||||
int m_prefill_chunk_size = 0;
|
int m_prefill_chunk_size = 0;
|
||||||
|
|
||||||
static ov::Shape get_shape(const ggml_tensor * tensor);
|
static ov::Shape get_shape(const ggml_tensor * tensor);
|
||||||
|
|
@ -265,7 +267,7 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void set_input_output(ggml_tensor * node, bool naive = false);
|
void set_input_output(ggml_tensor * node);
|
||||||
int compute_op_case(const ggml_tensor * node) const;
|
int compute_op_case(const ggml_tensor * node) const;
|
||||||
|
|
||||||
void validate_cgraph() const;
|
void validate_cgraph() const;
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
|
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
|
||||||
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
|
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
|
||||||
|
#include <optional>
|
||||||
|
|
||||||
ov::Core & ov_singleton_core() {
|
ov::Core & ov_singleton_core() {
|
||||||
static ov::Core core;
|
static ov::Core core;
|
||||||
|
|
@ -164,7 +165,7 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get requantization type for a tensor type (returns nullopt if no requant needed)
|
// Get requantization type for a tensor type (returns nullopt if no requant needed)
|
||||||
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
|
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant) {
|
||||||
if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
|
if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
|
||||||
return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C);
|
return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C);
|
||||||
}
|
}
|
||||||
|
|
@ -174,6 +175,9 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
|
||||||
if (ggml_openvino_is_npu()) {
|
if (ggml_openvino_is_npu()) {
|
||||||
return ExtraQuantType::Q4_0_128;
|
return ExtraQuantType::Q4_0_128;
|
||||||
}
|
}
|
||||||
|
if (no_requant) {
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
switch (tensor->type) {
|
switch (tensor->type) {
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
|
|
@ -187,7 +191,7 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
|
||||||
// Extracted Layout Calculation
|
// Extracted Layout Calculation
|
||||||
// =====================================================
|
// =====================================================
|
||||||
|
|
||||||
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
|
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias) {
|
||||||
ggml_openvino_extracted_layout layout = {};
|
ggml_openvino_extracted_layout layout = {};
|
||||||
layout.is_symmetric = false;
|
layout.is_symmetric = false;
|
||||||
|
|
||||||
|
|
@ -204,7 +208,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
|
||||||
const size_t alignment = 64; // Good for SIMD
|
const size_t alignment = 64; // Good for SIMD
|
||||||
|
|
||||||
// Check if requantization is needed (NPU-specific)
|
// Check if requantization is needed (NPU-specific)
|
||||||
auto requant_type = ggml_openvino_get_requant_type(tensor);
|
auto requant_type = ggml_openvino_get_requant_type(tensor, use_bias);
|
||||||
if (requant_type.has_value()) {
|
if (requant_type.has_value()) {
|
||||||
layout.is_requant = true;
|
layout.is_requant = true;
|
||||||
layout.requant_type = requant_type;
|
layout.requant_type = requant_type;
|
||||||
|
|
|
||||||
|
|
@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name();
|
||||||
bool ggml_openvino_is_npu();
|
bool ggml_openvino_is_npu();
|
||||||
|
|
||||||
// Get requantization type for a tensor type (returns nullopt if no requant needed)
|
// Get requantization type for a tensor type (returns nullopt if no requant needed)
|
||||||
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor);
|
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant = false);
|
||||||
|
|
||||||
// =====================================================
|
// =====================================================
|
||||||
// OpenVINO Tensor Extra Types
|
// OpenVINO Tensor Extra Types
|
||||||
|
|
@ -160,7 +160,7 @@ struct ggml_openvino_extracted_layout {
|
||||||
};
|
};
|
||||||
|
|
||||||
// Calculate the buffer layout for extracted quantized data
|
// Calculate the buffer layout for extracted quantized data
|
||||||
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);
|
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias = false);
|
||||||
|
|
||||||
ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
|
ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -922,6 +922,13 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if (op->op == GGML_OP_GET_ROWS) {
|
||||||
|
if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) {
|
||||||
|
// ERR = 0.000000306 > 0.000000100 GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
|
||||||
|
// ERR = 0.000000197 > 0.000000100 GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <openvino/core/except.hpp>
|
||||||
#include <openvino/core/node.hpp>
|
#include <openvino/core/node.hpp>
|
||||||
#include <openvino/core/node_output.hpp>
|
#include <openvino/core/node_output.hpp>
|
||||||
#include <openvino/core/parallel.hpp>
|
#include <openvino/core/parallel.hpp>
|
||||||
|
|
@ -18,6 +19,7 @@
|
||||||
#include <openvino/core/type/element_type.hpp>
|
#include <openvino/core/type/element_type.hpp>
|
||||||
#include <openvino/core/type/element_type_traits.hpp>
|
#include <openvino/core/type/element_type_traits.hpp>
|
||||||
#include <openvino/core/type/float16.hpp>
|
#include <openvino/core/type/float16.hpp>
|
||||||
|
#include <openvino/op/add.hpp>
|
||||||
#include <openvino/op/constant.hpp>
|
#include <openvino/op/constant.hpp>
|
||||||
#include <openvino/op/convert.hpp>
|
#include <openvino/op/convert.hpp>
|
||||||
#include <openvino/op/multiply.hpp>
|
#include <openvino/op/multiply.hpp>
|
||||||
|
|
@ -82,28 +84,41 @@ void extract_q4_0_data(const ggml_tensor * tensor,
|
||||||
void extract_q4_1_data(const ggml_tensor * tensor,
|
void extract_q4_1_data(const ggml_tensor * tensor,
|
||||||
ov::Tensor & weights_arr,
|
ov::Tensor & weights_arr,
|
||||||
ov::Tensor & scales_arr,
|
ov::Tensor & scales_arr,
|
||||||
ov::Tensor & zp_arr) {
|
ov::Tensor & zp_arr,
|
||||||
|
bool use_bias) {
|
||||||
const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes min, 32x0.5 byte weights
|
const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes min, 32x0.5 byte weights
|
||||||
|
|
||||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
|
||||||
|
|
||||||
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
if (use_bias) {
|
||||||
float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
|
// Store bias (min) directly as f16 instead of computing u4 zero points
|
||||||
float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
|
auto * bias = zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||||
scales[i] = ov::float16(scale);
|
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||||
// zp = -min / scale (bias = min, so zp = -bias/scale)
|
float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
|
||||||
uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
|
float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
|
||||||
// Pack two 4-bit zero points per byte
|
scales[i] = ov::float16(scale);
|
||||||
if (i % 2 == 0) {
|
bias[i] = ov::float16(min); // bias = min, dequant: w*s + bias
|
||||||
zp[i / 2] = zp_val & 0x0F; // Lower nibble
|
unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
|
||||||
} else {
|
});
|
||||||
zp[i / 2] |= (zp_val << 4); // Upper nibble
|
} else {
|
||||||
}
|
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
||||||
unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
|
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
|
||||||
});
|
float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
|
||||||
|
float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
|
||||||
|
scales[i] = ov::float16(scale);
|
||||||
|
// zp = -min / scale (bias = min, so zp = -bias/scale)
|
||||||
|
uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
|
||||||
|
// Pack two 4-bit zero points per byte
|
||||||
|
if (i % 2 == 0) {
|
||||||
|
zp[i / 2] = zp_val & 0x0F; // Lower nibble
|
||||||
|
} else {
|
||||||
|
zp[i / 2] |= (zp_val << 4); // Upper nibble
|
||||||
|
}
|
||||||
|
unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extracts (weight, scales, zp) from Q8_0 tensors.
|
// Extracts (weight, scales, zp) from Q8_0 tensors.
|
||||||
|
|
@ -164,14 +179,18 @@ void unpack_256_4(const uint8_t * data, uint8_t * dst) {
|
||||||
void extract_q4_k_data(const ggml_tensor * tensor,
|
void extract_q4_k_data(const ggml_tensor * tensor,
|
||||||
ov::Tensor & weights_arr,
|
ov::Tensor & weights_arr,
|
||||||
ov::Tensor & scales_arr,
|
ov::Tensor & scales_arr,
|
||||||
ov::Tensor & zp_arr) {
|
ov::Tensor & zp_arr,
|
||||||
|
bool use_bias) {
|
||||||
const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
|
const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
|
||||||
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
|
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
|
||||||
|
|
||||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
|
||||||
|
// For bias path, zp_arr holds f16 bias values; for zp path, it holds packed u4 zero points
|
||||||
|
auto * zp_u4 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
|
||||||
|
auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
|
||||||
|
|
||||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||||
uint8_t * block_data = data + i * bytes_per_block;
|
uint8_t * block_data = data + i * bytes_per_block;
|
||||||
|
|
@ -205,17 +224,22 @@ void extract_q4_k_data(const ggml_tensor * tensor,
|
||||||
min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4));
|
min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4));
|
||||||
min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4));
|
min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4));
|
||||||
|
|
||||||
// Store scales and compute zero points
|
// Store scales and compute zero points or bias
|
||||||
for (int j = 0; j < 8; j++) {
|
for (int j = 0; j < 8; j++) {
|
||||||
scales[i * 8 + j] = ov::float16(scale_vals[j]);
|
scales[i * 8 + j] = ov::float16(scale_vals[j]);
|
||||||
// zp = min / scale (since bias = -min and zp = -bias/scale)
|
if (use_bias) {
|
||||||
uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
|
// Store bias = -min directly as f16, dequant: w*s + bias
|
||||||
// Pack two 4-bit zero points per byte
|
bias_f16[i * 8 + j] = ov::float16(-min_vals[j]);
|
||||||
size_t idx = i * 8 + j;
|
|
||||||
if (idx % 2 == 0) {
|
|
||||||
zp[idx / 2] = zp_val & 0x0F;
|
|
||||||
} else {
|
} else {
|
||||||
zp[idx / 2] |= (zp_val << 4);
|
// zp = min / scale (since bias = -min and zp = -bias/scale)
|
||||||
|
uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
|
||||||
|
// Pack two 4-bit zero points per byte
|
||||||
|
size_t idx = i * 8 + j;
|
||||||
|
if (idx % 2 == 0) {
|
||||||
|
zp_u4[idx / 2] = zp_val & 0x0F;
|
||||||
|
} else {
|
||||||
|
zp_u4[idx / 2] |= (zp_val << 4);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
unpack_256_4(block_data + 16, weights + i * 128);
|
unpack_256_4(block_data + 16, weights + i * 128);
|
||||||
|
|
@ -285,14 +309,18 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8
|
||||||
void extract_q5_k_data(const ggml_tensor * tensor,
|
void extract_q5_k_data(const ggml_tensor * tensor,
|
||||||
ov::Tensor & weights_arr,
|
ov::Tensor & weights_arr,
|
||||||
ov::Tensor & scales_arr,
|
ov::Tensor & scales_arr,
|
||||||
ov::Tensor & zp_arr) {
|
ov::Tensor & zp_arr,
|
||||||
|
bool use_bias) {
|
||||||
const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
|
const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
|
||||||
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
|
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
|
||||||
|
|
||||||
auto * data = static_cast<uint8_t *>(tensor->data);
|
auto * data = static_cast<uint8_t *>(tensor->data);
|
||||||
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
auto * weights = static_cast<uint8_t *>(weights_arr.data());
|
||||||
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
|
||||||
auto * zp = static_cast<uint8_t *>(zp_arr.data());
|
|
||||||
|
// For bias path, zp_arr holds f16 bias values; for zp path, it holds u8 zero points
|
||||||
|
auto * zp_u8 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
|
||||||
|
auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
|
||||||
|
|
||||||
ov::parallel_for(n_super_block, [&](size_t i) {
|
ov::parallel_for(n_super_block, [&](size_t i) {
|
||||||
uint8_t * block_data = data + i * bytes_per_block;
|
uint8_t * block_data = data + i * bytes_per_block;
|
||||||
|
|
@ -325,9 +353,15 @@ void extract_q5_k_data(const ggml_tensor * tensor,
|
||||||
|
|
||||||
scales[i * 8 + is] = ov::float16(d1);
|
scales[i * 8 + is] = ov::float16(d1);
|
||||||
scales[i * 8 + is + 1] = ov::float16(d2);
|
scales[i * 8 + is + 1] = ov::float16(d2);
|
||||||
// zp = min / scale (since bias = -min and zp = -bias/scale)
|
if (use_bias) {
|
||||||
zp[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
|
// Store bias = -min directly as f16, dequant: w*s + bias
|
||||||
zp[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;
|
bias_f16[i * 8 + is] = ov::float16(-m1);
|
||||||
|
bias_f16[i * 8 + is + 1] = ov::float16(-m2);
|
||||||
|
} else {
|
||||||
|
// zp = min / scale (since bias = -min and zp = -bias/scale)
|
||||||
|
zp_u8[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
|
||||||
|
zp_u8[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
// Extract weights for first 32 elements (matching deq formula exactly)
|
// Extract weights for first 32 elements (matching deq formula exactly)
|
||||||
for (int l = 0; l < 32; ++l) {
|
for (int l = 0; l < 32; ++l) {
|
||||||
|
|
@ -349,10 +383,14 @@ void extract_q5_k_data(const ggml_tensor * tensor,
|
||||||
|
|
||||||
// TODO Reorder for make_intX_weights
|
// TODO Reorder for make_intX_weights
|
||||||
|
|
||||||
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) {
|
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
||||||
|
ov::Tensor & scales,
|
||||||
|
ov::Tensor & zp,
|
||||||
|
size_t group_size,
|
||||||
|
bool use_bias) {
|
||||||
ov::Shape orig_shape = weight.get_shape();
|
ov::Shape orig_shape = weight.get_shape();
|
||||||
|
|
||||||
// Expand dimensions for scales and zp
|
// Expand dimensions for scales and zp/bias
|
||||||
auto scale_shape = scales.get_shape();
|
auto scale_shape = scales.get_shape();
|
||||||
auto zp_shape = zp.get_shape();
|
auto zp_shape = zp.get_shape();
|
||||||
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
|
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
|
||||||
|
|
@ -377,36 +415,45 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight, ov::Tensor & scales,
|
||||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||||
|
|
||||||
// Zero point is already in U8 format from extraction
|
|
||||||
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
|
|
||||||
float zp_value;
|
|
||||||
if (ov::op::util::get_single_value(zero_point, zp_value)) {
|
|
||||||
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Quantization operations
|
|
||||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||||
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
|
|
||||||
|
|
||||||
auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
|
ov::Output<ov::Node> result;
|
||||||
ov::Output<ov::Node> w_zp_s =
|
if (use_bias && !is_scalar_zp) {
|
||||||
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||||
|
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||||
|
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||||
|
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||||
|
} else {
|
||||||
|
// Zero point path: (w - zp) * s
|
||||||
|
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
|
||||||
|
float zp_value;
|
||||||
|
if (ov::op::util::get_single_value(zero_point, zp_value)) {
|
||||||
|
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
|
||||||
|
}
|
||||||
|
auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
|
||||||
|
auto w_zp =
|
||||||
|
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||||
|
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||||
|
}
|
||||||
|
|
||||||
if (packed_shape.size() != 2) {
|
if (packed_shape.size() != 2) {
|
||||||
// If not requantized channel-wise case, reshape back to original shape
|
// If not requantized channel-wise case, reshape back to original shape
|
||||||
auto final_shape =
|
auto final_shape =
|
||||||
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
|
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
|
||||||
w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
|
result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
|
return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
|
||||||
}
|
}
|
||||||
|
|
||||||
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) {
|
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
||||||
|
ov::Tensor & scales,
|
||||||
|
ov::Tensor & zp,
|
||||||
|
size_t group_size,
|
||||||
|
bool use_bias) {
|
||||||
ov::Shape orig_weight_shape = weight.get_shape();
|
ov::Shape orig_weight_shape = weight.get_shape();
|
||||||
|
|
||||||
// Expand dimensions for scales and zp
|
// Expand dimensions for scales and zp/bias
|
||||||
ov::Shape scale_shape = scales.get_shape();
|
ov::Shape scale_shape = scales.get_shape();
|
||||||
auto zp_shape = zp.get_shape();
|
auto zp_shape = zp.get_shape();
|
||||||
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
|
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization
|
||||||
|
|
@ -431,32 +478,35 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight, ov::Tensor & scales,
|
||||||
static_cast<uint8_t *>(weight.data()), nullptr);
|
static_cast<uint8_t *>(weight.data()), nullptr);
|
||||||
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
|
||||||
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
|
||||||
|
|
||||||
// Zero point is already in U4 format from extraction
|
|
||||||
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
|
|
||||||
float zp_value;
|
|
||||||
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
|
|
||||||
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
|
|
||||||
}
|
|
||||||
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
|
|
||||||
|
|
||||||
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
|
||||||
|
|
||||||
// Perform dequantization
|
ov::Output<ov::Node> result;
|
||||||
auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
|
if (use_bias && !is_scalar_zp) {
|
||||||
|
// Bias path: w * s + b (zp tensor holds f16 bias values)
|
||||||
ov::Output<ov::Node> w_zp_s =
|
auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
|
||||||
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||||
|
result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||||
|
} else {
|
||||||
|
// Zero point path: (w - zp) * s
|
||||||
|
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
|
||||||
|
float zp_value;
|
||||||
|
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
|
||||||
|
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
|
||||||
|
}
|
||||||
|
auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
|
||||||
|
auto w_zp =
|
||||||
|
std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||||
|
result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
|
||||||
|
}
|
||||||
|
|
||||||
if (packed_shape.size() != 2) {
|
if (packed_shape.size() != 2) {
|
||||||
// If not requantized channel-wise case, reshape back to original shape
|
// If not requantized channel-wise case, reshape back to original shape
|
||||||
auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()},
|
auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()},
|
||||||
orig_weight_shape);
|
orig_weight_shape);
|
||||||
|
result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
|
||||||
w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
|
return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract quantized weights from tensor and create weight subgraph
|
// Extract quantized weights from tensor and create weight subgraph
|
||||||
|
|
@ -464,7 +514,8 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
|
||||||
const void * data,
|
const void * data,
|
||||||
ov::Tensor & weights,
|
ov::Tensor & weights,
|
||||||
ov::Tensor & scales,
|
ov::Tensor & scales,
|
||||||
ov::Tensor & zp) {
|
ov::Tensor & zp,
|
||||||
|
bool use_bias) {
|
||||||
// Create a temporary tensor for extraction functions that read from tensor->data
|
// Create a temporary tensor for extraction functions that read from tensor->data
|
||||||
ggml_tensor temp_tensor = *tensor;
|
ggml_tensor temp_tensor = *tensor;
|
||||||
temp_tensor.data = const_cast<void *>(data);
|
temp_tensor.data = const_cast<void *>(data);
|
||||||
|
|
@ -499,10 +550,10 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
|
||||||
extract_q4_0_data(&temp_tensor, weights, scales, zp);
|
extract_q4_0_data(&temp_tensor, weights, scales, zp);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
extract_q4_1_data(&temp_tensor, weights, scales, zp);
|
extract_q4_1_data(&temp_tensor, weights, scales, zp, use_bias);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
extract_q4_k_data(&temp_tensor, weights, scales, zp);
|
extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
extract_q8_0_data(&temp_tensor, weights, scales, zp);
|
extract_q8_0_data(&temp_tensor, weights, scales, zp);
|
||||||
|
|
@ -511,7 +562,7 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
|
||||||
extract_q6_k_data(&temp_tensor, weights, scales, zp);
|
extract_q6_k_data(&temp_tensor, weights, scales, zp);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
extract_q5_k_data(&temp_tensor, weights, scales, zp);
|
extract_q5_k_data(&temp_tensor, weights, scales, zp, use_bias);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));
|
throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));
|
||||||
|
|
@ -520,9 +571,9 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
|
||||||
// Create the OpenVINO weight subgraph
|
// Create the OpenVINO weight subgraph
|
||||||
ov::Output<ov::Node> weight_node;
|
ov::Output<ov::Node> weight_node;
|
||||||
if (is_u4) {
|
if (is_u4) {
|
||||||
weight_node = make_int4_weights(weights, scales, zp, weights_per_block);
|
weight_node = make_int4_weights(weights, scales, zp, weights_per_block, use_bias);
|
||||||
} else {
|
} else {
|
||||||
weight_node = make_int8_weights(weights, scales, zp, weights_per_block);
|
weight_node = make_int8_weights(weights, scales, zp, weights_per_block, use_bias);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto result = weight_node.get_node_shared_ptr();
|
auto result = weight_node.get_node_shared_ptr();
|
||||||
|
|
@ -576,7 +627,7 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
|
OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr, bool use_bias) {
|
||||||
GGML_ASSERT(tensor != nullptr);
|
GGML_ASSERT(tensor != nullptr);
|
||||||
GGML_ASSERT(data != nullptr);
|
GGML_ASSERT(data != nullptr);
|
||||||
|
|
||||||
|
|
@ -619,12 +670,19 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
|
||||||
OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
|
OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
|
||||||
}
|
}
|
||||||
|
|
||||||
result.layout = ggml_openvino_get_extracted_layout(tensor);
|
result.layout = ggml_openvino_get_extracted_layout(tensor, use_bias);
|
||||||
const auto & layout = result.layout;
|
const auto & layout = result.layout;
|
||||||
if (layout.total_size == 0) {
|
if (layout.total_size == 0) {
|
||||||
OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
|
OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (use_bias) {
|
||||||
|
OPENVINO_ASSERT(!layout.is_requant,
|
||||||
|
"use_bias is only used for test-backend-ops, which should not have requantization");
|
||||||
|
// bias node will be created on the fly and not use backend buffer
|
||||||
|
output_base_ptr = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
// F16 requant path - no separate scales/zp needed in result
|
// F16 requant path - no separate scales/zp needed in result
|
||||||
if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
|
if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
|
||||||
if (output_base_ptr) {
|
if (output_base_ptr) {
|
||||||
|
|
@ -653,14 +711,20 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
|
||||||
} else {
|
} else {
|
||||||
result.weights = ov::Tensor(weight_type, node_shape);
|
result.weights = ov::Tensor(weight_type, node_shape);
|
||||||
result.scales = ov::Tensor(ov::element::f16, scale_shape);
|
result.scales = ov::Tensor(ov::element::f16, scale_shape);
|
||||||
result.zp = ov::Tensor(weight_type, zp_shape);
|
if (use_bias && !layout.is_symmetric) {
|
||||||
|
// bias only has effect for asymmetric quant
|
||||||
|
result.zp = ov::Tensor(ov::element::f16, zp_shape);
|
||||||
|
} else {
|
||||||
|
result.zp = ov::Tensor(weight_type, zp_shape);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (layout.is_requant && layout.requant_type.has_value()) {
|
if (layout.is_requant && layout.requant_type.has_value()) {
|
||||||
result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
|
result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
|
||||||
result.weights, result.scales, result.zp);
|
result.weights, result.scales, result.zp);
|
||||||
} else {
|
} else {
|
||||||
result.weight_node = extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp);
|
result.weight_node =
|
||||||
|
extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp, use_bias);
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,8 @@ void extract_q4_0_data(const ggml_tensor * tensor,
|
||||||
void extract_q4_1_data(const ggml_tensor * tensor,
|
void extract_q4_1_data(const ggml_tensor * tensor,
|
||||||
ov::Tensor & weights_arr,
|
ov::Tensor & weights_arr,
|
||||||
ov::Tensor & scales_arr,
|
ov::Tensor & scales_arr,
|
||||||
ov::Tensor & zp_arr);
|
ov::Tensor & zp_arr,
|
||||||
|
bool use_bias = false);
|
||||||
|
|
||||||
void extract_q8_0_data(const ggml_tensor * tensor,
|
void extract_q8_0_data(const ggml_tensor * tensor,
|
||||||
ov::Tensor & weights_arr,
|
ov::Tensor & weights_arr,
|
||||||
|
|
@ -28,12 +29,14 @@ void unpack_256_4(const uint8_t* data, uint8_t* dst);
|
||||||
void extract_q4_k_data(const ggml_tensor * tensor,
|
void extract_q4_k_data(const ggml_tensor * tensor,
|
||||||
ov::Tensor & weights_arr,
|
ov::Tensor & weights_arr,
|
||||||
ov::Tensor & scales_arr,
|
ov::Tensor & scales_arr,
|
||||||
ov::Tensor & zp_arr);
|
ov::Tensor & zp_arr,
|
||||||
|
bool use_bias = false);
|
||||||
|
|
||||||
void extract_q5_k_data(const ggml_tensor * tensor,
|
void extract_q5_k_data(const ggml_tensor * tensor,
|
||||||
ov::Tensor & weights_arr,
|
ov::Tensor & weights_arr,
|
||||||
ov::Tensor & scales_arr,
|
ov::Tensor & scales_arr,
|
||||||
ov::Tensor & zp_arr);
|
ov::Tensor & zp_arr,
|
||||||
|
bool use_bias = false);
|
||||||
|
|
||||||
void extract_q6_k_data(const ggml_tensor * tensor,
|
void extract_q6_k_data(const ggml_tensor * tensor,
|
||||||
ov::Tensor & weights_arr,
|
ov::Tensor & weights_arr,
|
||||||
|
|
@ -45,12 +48,14 @@ static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32;
|
||||||
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
|
||||||
ov::Tensor & scales,
|
ov::Tensor & scales,
|
||||||
ov::Tensor & zp,
|
ov::Tensor & zp,
|
||||||
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
|
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
|
||||||
|
bool use_bias = false);
|
||||||
|
|
||||||
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
|
||||||
ov::Tensor & scales,
|
ov::Tensor & scales,
|
||||||
ov::Tensor & zp,
|
ov::Tensor & zp,
|
||||||
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
|
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
|
||||||
|
bool use_bias = false);
|
||||||
|
|
||||||
// Extract quantized weights from tensor and create weight subgraph
|
// Extract quantized weights from tensor and create weight subgraph
|
||||||
// If weights/scales/zp are provided (non-empty), uses them as output buffers
|
// If weights/scales/zp are provided (non-empty), uses them as output buffers
|
||||||
|
|
@ -61,7 +66,8 @@ std::shared_ptr<ov::Node> extract_quantized_weights(
|
||||||
const void * data, // Source data pointer (may differ from tensor->data)
|
const void * data, // Source data pointer (may differ from tensor->data)
|
||||||
ov::Tensor & weights,
|
ov::Tensor & weights,
|
||||||
ov::Tensor & scales,
|
ov::Tensor & scales,
|
||||||
ov::Tensor & zp);
|
ov::Tensor & zp,
|
||||||
|
bool use_bias = false); // Use fp bias instead of quantized zero_point (for test-backend-ops)
|
||||||
|
|
||||||
// Requantize weights from tensor to target format, writing to provided buffers
|
// Requantize weights from tensor to target format, writing to provided buffers
|
||||||
// For F16 target, only weights buffer is used (scales/zp ignored)
|
// For F16 target, only weights buffer is used (scales/zp ignored)
|
||||||
|
|
@ -112,8 +118,9 @@ struct OvWeight {
|
||||||
// Returns OvWeight with the weight node and optional quantized tensors
|
// Returns OvWeight with the weight node and optional quantized tensors
|
||||||
OvWeight process_weight_tensor(
|
OvWeight process_weight_tensor(
|
||||||
const ggml_tensor * tensor,
|
const ggml_tensor * tensor,
|
||||||
const void * data, // Source data pointer (may differ from tensor->data)
|
const void * data, // Source data pointer (may differ from tensor->data)
|
||||||
void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation)
|
void * output_base_ptr = nullptr, // Base pointer for output buffers (or nullptr for internal allocation)
|
||||||
|
bool use_bias = false); // Use fp bias instead of quantized zero_point, only used in test-backend-ops
|
||||||
|
|
||||||
void quantize_q4_0(const float * x,
|
void quantize_q4_0(const float * x,
|
||||||
ov::Tensor & weights_arr,
|
ov::Tensor & weights_arr,
|
||||||
|
|
|
||||||
|
|
@ -127,7 +127,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
|
||||||
if (pos_data[0] == 0) {
|
if (pos_data[0] == 0) {
|
||||||
infer_request->reset_state();
|
infer_request->reset_state();
|
||||||
stateful_kv_size = pos_shape[3];
|
stateful_kv_size = pos_shape[3];
|
||||||
} else if (stateful_kv_size == pos_data[0]) {
|
} else if (stateful_kv_size == static_cast<size_t>(pos_data[0])) {
|
||||||
stateful_kv_size += pos_shape[3];
|
stateful_kv_size += pos_shape[3];
|
||||||
} else {
|
} else {
|
||||||
auto states = infer_request->query_state();
|
auto states = infer_request->query_state();
|
||||||
|
|
@ -139,7 +139,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
|
||||||
state.set_state(new_state_tensor);
|
state.set_state(new_state_tensor);
|
||||||
}
|
}
|
||||||
stateful_kv_size = pos_data[0] + 1;
|
stateful_kv_size = pos_data[0] + 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
decoder_end_time = ggml_time_us();
|
decoder_end_time = ggml_time_us();
|
||||||
|
|
@ -467,10 +467,10 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
|
||||||
return GGML_STATUS_SUCCESS;
|
return GGML_STATUS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
|
bool naive = true;
|
||||||
|
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, naive);
|
||||||
auto decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights);
|
auto decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights);
|
||||||
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
|
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
|
||||||
auto naive = true;
|
|
||||||
auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
|
auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
|
||||||
if (getenv("GGML_OPENVINO_DUMP_IR")) {
|
if (getenv("GGML_OPENVINO_DUMP_IR")) {
|
||||||
ov::serialize(model, "IR_naive.xml");
|
ov::serialize(model, "IR_naive.xml");
|
||||||
|
|
|
||||||
|
|
@ -233,9 +233,7 @@ if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
|
||||||
llama_build_and_test(test-opt.cpp)
|
llama_build_and_test(test-opt.cpp)
|
||||||
endif()
|
endif()
|
||||||
llama_build_and_test(test-gguf.cpp)
|
llama_build_and_test(test-gguf.cpp)
|
||||||
if (NOT GGML_OPENVINO)
|
llama_build_and_test(test-backend-ops.cpp)
|
||||||
llama_build_and_test(test-backend-ops.cpp)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
|
llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
|
||||||
llama_build_and_test(test-autorelease.cpp LABEL "model")
|
llama_build_and_test(test-autorelease.cpp LABEL "model")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue