diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 99776e1bec..107b510f3b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -50,6 +50,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_is_static(is_static), m_is_stateful(is_stateful), m_is_prefill(is_prefill), + m_naive(false), m_prefill_chunk_size(prefill_chunk_size), m_cgraph(cgraph), m_model_weights(model_weights), @@ -93,9 +94,10 @@ void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) { GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights) { m_cgraph = cgraph; m_model_weights = model_weights; + m_naive = true; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto * cur_node = cgraph->nodes[node_n]; - set_input_output(cur_node, true); + set_input_output(cur_node); } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node); @@ -134,7 +136,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::mapname); auto node_output_name = node_name; @@ -169,7 +171,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { current_node_info.node_inputs_names.push_back(src_name); // Add model inputs - if (!naive && !src->view_src) { + if (!m_naive && !src->view_src) { ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { @@ -206,7 +208,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } // Add model outputs - if (!naive) { + if (!m_naive) { // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph @@ -509,12 +511,14 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const return kv_param_res_names; } -std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) { +std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) { std::map> model_weights; // static std::mutex weights_mutex; auto * nodes = cgraph->nodes; auto n_nodes = cgraph->n_nodes; - std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) { + // std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) { + for (int node_i = 0; node_i < n_nodes; node_i++) { + auto * node = nodes[node_i]; for (int i = 0; i < GGML_MAX_SRC; i++) { auto * src = node->src[i]; if (src == nullptr) { @@ -542,18 +546,19 @@ std::map> GgmlOvDecoder::create_weight_no // } // } if (model_weights.find(src_name) == model_weights.end()) { - auto weight_node = create_weight_node(src); + auto weight_node = create_weight_node(src, naive); weight_node->set_friendly_name(src_name); model_weights[src_name] = weight_node; } } } } - }); + } + // }); return model_weights; } -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) { +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) { const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer); // Check if we have a pre-built constant from the OpenVINO backend buffer @@ -581,6 +586,11 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor } } + // There are three cases where we need to create a new weight node: + // 1. weights are in openvino_host_buffer. 
Weight loading to host buffer will not trigger backend_buffer_set_tensor + // 2. weights are in cpu/cpu_mapped buffer. On token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used + // 3. test-backend-ops. buffers in test-backend-ops does not set USAGE_WEIGHT so backend_buffer_set_tensor will not create weight node + // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name); static const std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, @@ -592,6 +602,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor OvWeight ov_weight; if (ggml_is_quantized(tensor->type)) { + auto use_bias = naive; if (is_ov_buffer) { // For quantized weights, copy raw data to a temp buffer first because // process_weight_tensor reads from data and writes extracted results @@ -600,9 +611,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor size_t raw_size = ggml_nbytes(tensor); std::vector tmp(raw_size); memcpy(tmp.data(), tensor->data, raw_size); - ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data); + ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias); } else { - ov_weight = process_weight_tensor(tensor, tensor->data, nullptr); + ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias); } } else { // For non-quantized weights (F16/F32/BF16), data is already in tensor->data. diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 59311a6121..ec6062a166 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -104,7 +104,7 @@ public: virtual ov::PartialShape get_output_shape(int node_idx) const override; - virtual ov::element::Type get_output_type(const int node_idx) const override; + virtual ov::element::Type get_output_type(int node_idx) const override; virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override; @@ -184,9 +184,10 @@ public: static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); - static std::shared_ptr create_weight_node(ggml_tensor * tensor); + static std::shared_ptr create_weight_node(ggml_tensor * tensor, bool naive = false); - static std::map> create_weight_nodes(ggml_cgraph * cgraph); + static std::map> create_weight_nodes(ggml_cgraph * cgraph, + bool naive = false); const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const; @@ -207,6 +208,7 @@ public: bool m_is_static = false; bool m_is_stateful = false; bool m_is_prefill = false; + bool m_naive = false; int m_prefill_chunk_size = 0; static ov::Shape get_shape(const ggml_tensor * tensor); @@ -265,7 +267,7 @@ public: } private: - void set_input_output(ggml_tensor * node, bool naive = false); + void set_input_output(ggml_tensor * node); int compute_op_case(const ggml_tensor * node) const; void validate_cgraph() const; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 7a48ed1b65..0b8b2d3743 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -6,6 +6,7 @@ #include #include #include +#include ov::Core & ov_singleton_core() { static ov::Core core; @@ -164,7 +165,7 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { } // Get requantization type for a tensor type (returns nullopt if no requant needed) -std::optional 
ggml_openvino_get_requant_type(const ggml_tensor * tensor) { +std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant) { if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C); } @@ -174,6 +175,9 @@ std::optional ggml_openvino_get_requant_type(const ggml_tensor * if (ggml_openvino_is_npu()) { return ExtraQuantType::Q4_0_128; } + if (no_requant) { + return std::nullopt; + } switch (tensor->type) { case GGML_TYPE_Q6_K: case GGML_TYPE_Q5_K: @@ -187,7 +191,7 @@ std::optional ggml_openvino_get_requant_type(const ggml_tensor * // Extracted Layout Calculation // ===================================================== -ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { +ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias) { ggml_openvino_extracted_layout layout = {}; layout.is_symmetric = false; @@ -204,7 +208,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten const size_t alignment = 64; // Good for SIMD // Check if requantization is needed (NPU-specific) - auto requant_type = ggml_openvino_get_requant_type(tensor); + auto requant_type = ggml_openvino_get_requant_type(tensor, use_bias); if (requant_type.has_value()) { layout.is_requant = true; layout.requant_type = requant_type; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 9ce4667154..441a62e9d3 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name(); bool ggml_openvino_is_npu(); // Get requantization type for a tensor type (returns nullopt if no requant needed) -std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor); +std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant = false); // ===================================================== // OpenVINO Tensor Extra Types @@ -160,7 +160,7 @@ struct ggml_openvino_extracted_layout { }; // Calculate the buffer layout for extracted quantized data -ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor); +ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias = false); ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 780d17b750..948ff2cc78 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -922,6 +922,13 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { default: break; } + if (op->op == GGML_OP_GET_ROWS) { + if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) { + // ERR = 0.000000306 > 0.000000100 GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0) + // ERR = 0.000000197 > 0.000000100 GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0) + return true; + } + } return false; } diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 10909cbc1e..3628f7a959 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -11,6 +11,7 @@ #include #include 
#include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -82,28 +84,41 @@ void extract_q4_0_data(const ggml_tensor * tensor, void extract_q4_1_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr) { + ov::Tensor & zp_arr, + bool use_bias) { const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes min, 32x0.5 byte weights auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - float scale = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)))); - float min = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)))); - scales[i] = ov::float16(scale); - // zp = -min / scale (bias = min, so zp = -bias/scale) - uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0; - // Pack two 4-bit zero points per byte - if (i % 2 == 0) { - zp[i / 2] = zp_val & 0x0F; // Lower nibble - } else { - zp[i / 2] |= (zp_val << 4); // Upper nibble - } - unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); - }); + if (use_bias) { + // Store bias (min) directly as f16 instead of computing u4 zero points + auto * bias = zp_arr.data::value_type>(); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + float scale = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)))); + float min = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)))); + scales[i] = ov::float16(scale); + bias[i] = ov::float16(min); // bias = min, dequant: w*s + bias + unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); + }); + } else { + auto * zp = static_cast(zp_arr.data()); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + float scale = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)))); + float min = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)))); + scales[i] = ov::float16(scale); + // zp = -min / scale (bias = min, so zp = -bias/scale) + uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0; + // Pack two 4-bit zero points per byte + if (i % 2 == 0) { + zp[i / 2] = zp_val & 0x0F; // Lower nibble + } else { + zp[i / 2] |= (zp_val << 4); // Upper nibble + } + unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); + }); + } } // Extracts (weight, scales, zp) from Q8_0 tensors. @@ -164,14 +179,18 @@ void unpack_256_4(const uint8_t * data, uint8_t * dst) { void extract_q4_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr) { + ov::Tensor & zp_arr, + bool use_bias) { const uint64_t bytes_per_block = 2 + 2 + 12 + 128; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); + + // For bias path, zp_arr holds f16 bias values; for zp path, it holds packed u4 zero points + auto * zp_u4 = use_bias ? nullptr : static_cast(zp_arr.data()); + auto * bias_f16 = use_bias ? 
zp_arr.data::value_type>() : nullptr; ov::parallel_for(n_super_block, [&](size_t i) { uint8_t * block_data = data + i * bytes_per_block; @@ -205,17 +224,22 @@ void extract_q4_k_data(const ggml_tensor * tensor, min_vals[6] = scale_mins * static_cast((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4)); min_vals[7] = scale_mins * static_cast((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4)); - // Store scales and compute zero points + // Store scales and compute zero points or bias for (int j = 0; j < 8; j++) { scales[i * 8 + j] = ov::float16(scale_vals[j]); - // zp = min / scale (since bias = -min and zp = -bias/scale) - uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0; - // Pack two 4-bit zero points per byte - size_t idx = i * 8 + j; - if (idx % 2 == 0) { - zp[idx / 2] = zp_val & 0x0F; + if (use_bias) { + // Store bias = -min directly as f16, dequant: w*s + bias + bias_f16[i * 8 + j] = ov::float16(-min_vals[j]); } else { - zp[idx / 2] |= (zp_val << 4); + // zp = min / scale (since bias = -min and zp = -bias/scale) + uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0; + // Pack two 4-bit zero points per byte + size_t idx = i * 8 + j; + if (idx % 2 == 0) { + zp_u4[idx / 2] = zp_val & 0x0F; + } else { + zp_u4[idx / 2] |= (zp_val << 4); + } } } unpack_256_4(block_data + 16, weights + i * 128); @@ -285,14 +309,18 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8 void extract_q5_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr) { + ov::Tensor & zp_arr, + bool use_bias) { const uint64_t bytes_per_block = 4 + 12 + 32 + 128; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); + + // For bias path, zp_arr holds f16 bias values; for zp path, it holds u8 zero points + auto * zp_u8 = use_bias ? nullptr : static_cast(zp_arr.data()); + auto * bias_f16 = use_bias ? zp_arr.data::value_type>() : nullptr; ov::parallel_for(n_super_block, [&](size_t i) { uint8_t * block_data = data + i * bytes_per_block; @@ -325,9 +353,15 @@ void extract_q5_k_data(const ggml_tensor * tensor, scales[i * 8 + is] = ov::float16(d1); scales[i * 8 + is + 1] = ov::float16(d2); - // zp = min / scale (since bias = -min and zp = -bias/scale) - zp[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0; - zp[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0; + if (use_bias) { + // Store bias = -min directly as f16, dequant: w*s + bias + bias_f16[i * 8 + is] = ov::float16(-m1); + bias_f16[i * 8 + is + 1] = ov::float16(-m2); + } else { + // zp = min / scale (since bias = -min and zp = -bias/scale) + zp_u8[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0; + zp_u8[i * 8 + is + 1] = (d2 != 0.0f) ? 
(uint8_t) std::round(m2 / d2) : 0; + } // Extract weights for first 32 elements (matching deq formula exactly) for (int l = 0; l < 32; ++l) { @@ -349,10 +383,14 @@ void extract_q5_k_data(const ggml_tensor * tensor, // TODO Reorder for make_intX_weights -ov::Output make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) { +ov::Output make_int8_weights(ov::Tensor & weight, + ov::Tensor & scales, + ov::Tensor & zp, + size_t group_size, + bool use_bias) { ov::Shape orig_shape = weight.get_shape(); - // Expand dimensions for scales and zp + // Expand dimensions for scales and zp/bias auto scale_shape = scales.get_shape(); auto zp_shape = zp.get_shape(); bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization @@ -377,36 +415,45 @@ ov::Output make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, static_cast(weight.data()), nullptr); weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto scales_f16 = std::make_shared(scales); - - // Zero point is already in U8 format from extraction - auto zero_point = std::make_shared(zp); - float zp_value; - if (ov::op::util::get_single_value(zero_point, zp_value)) { - zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); - } - - // Quantization operations auto weights_f16 = std::make_shared(weights_node, ov::element::f16); - auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); - auto w_zp = std::make_shared(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); - ov::Output w_zp_s = - std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + ov::Output result; + if (use_bias && !is_scalar_zp) { + // Bias path: w * s + b (zp tensor holds f16 bias values) + auto bias_f16 = std::make_shared(zp); + auto w_s = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + } else { + // Zero point path: (w - zp) * s + auto zero_point = std::make_shared(zp); + float zp_value; + if (ov::op::util::get_single_value(zero_point, zp_value)) { + zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); + } + auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); + auto w_zp = + std::make_shared(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + } if (packed_shape.size() != 2) { // If not requantized channel-wise case, reshape back to original shape auto final_shape = std::make_shared(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape); - w_zp_s = std::make_shared(w_zp_s, final_shape, false); + result = std::make_shared(result, final_shape, false); } - return std::make_shared(w_zp_s, ov::element::f32); + return std::make_shared(result, ov::element::f32); } -ov::Output make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) { +ov::Output make_int4_weights(ov::Tensor & weight, + ov::Tensor & scales, + ov::Tensor & zp, + size_t group_size, + bool use_bias) { ov::Shape orig_weight_shape = weight.get_shape(); - // Expand dimensions for scales and zp + // Expand dimensions for scales and zp/bias ov::Shape scale_shape = scales.get_shape(); auto zp_shape = zp.get_shape(); bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization @@ -431,32 +478,35 @@ ov::Output make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, static_cast(weight.data()), nullptr); 
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto weights_f16 = std::make_shared(weights_node, ov::element::f16); - - // Zero point is already in U4 format from extraction - auto zero_points_node = std::make_shared(zp); - float zp_value; - if (ov::op::util::get_single_value(zero_points_node, zp_value)) { - zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); - } - auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); - auto scales_f16 = std::make_shared(scales); - // Perform dequantization - auto w_zp = std::make_shared(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); - - ov::Output w_zp_s = - std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + ov::Output result; + if (use_bias && !is_scalar_zp) { + // Bias path: w * s + b (zp tensor holds f16 bias values) + auto bias_f16 = std::make_shared(zp); + auto w_s = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + } else { + // Zero point path: (w - zp) * s + auto zero_points_node = std::make_shared(zp); + float zp_value; + if (ov::op::util::get_single_value(zero_points_node, zp_value)) { + zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); + } + auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); + auto w_zp = + std::make_shared(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + } if (packed_shape.size() != 2) { // If not requantized channel-wise case, reshape back to original shape auto final_shape = std::make_shared(ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape); - - w_zp_s = std::make_shared(w_zp_s, final_shape, false); + result = std::make_shared(result, final_shape, false); } - return std::make_shared(w_zp_s, ov::element::f32); + return std::make_shared(result, ov::element::f32); } // Extract quantized weights from tensor and create weight subgraph @@ -464,7 +514,8 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, const void * data, ov::Tensor & weights, ov::Tensor & scales, - ov::Tensor & zp) { + ov::Tensor & zp, + bool use_bias) { // Create a temporary tensor for extraction functions that read from tensor->data ggml_tensor temp_tensor = *tensor; temp_tensor.data = const_cast(data); @@ -499,10 +550,10 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, extract_q4_0_data(&temp_tensor, weights, scales, zp); break; case GGML_TYPE_Q4_1: - extract_q4_1_data(&temp_tensor, weights, scales, zp); + extract_q4_1_data(&temp_tensor, weights, scales, zp, use_bias); break; case GGML_TYPE_Q4_K: - extract_q4_k_data(&temp_tensor, weights, scales, zp); + extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias); break; case GGML_TYPE_Q8_0: extract_q8_0_data(&temp_tensor, weights, scales, zp); @@ -511,7 +562,7 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, extract_q6_k_data(&temp_tensor, weights, scales, zp); break; case GGML_TYPE_Q5_K: - extract_q5_k_data(&temp_tensor, weights, scales, zp); + extract_q5_k_data(&temp_tensor, weights, scales, zp, use_bias); break; default: throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type))); @@ -520,9 +571,9 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * 
tensor, // Create the OpenVINO weight subgraph ov::Output weight_node; if (is_u4) { - weight_node = make_int4_weights(weights, scales, zp, weights_per_block); + weight_node = make_int4_weights(weights, scales, zp, weights_per_block, use_bias); } else { - weight_node = make_int8_weights(weights, scales, zp, weights_per_block); + weight_node = make_int8_weights(weights, scales, zp, weights_per_block, use_bias); } auto result = weight_node.get_node_shared_ptr(); @@ -576,7 +627,7 @@ std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, return result; } -OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) { +OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr, bool use_bias) { GGML_ASSERT(tensor != nullptr); GGML_ASSERT(data != nullptr); @@ -619,12 +670,19 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type)); } - result.layout = ggml_openvino_get_extracted_layout(tensor); + result.layout = ggml_openvino_get_extracted_layout(tensor, use_bias); const auto & layout = result.layout; if (layout.total_size == 0) { OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type)); } + if (use_bias) { + OPENVINO_ASSERT(!layout.is_requant, + "use_bias is only used for test-backend-ops, which should not have requantization"); + // bias node will be created on the fly and not use backend buffer + output_base_ptr = nullptr; + } + // F16 requant path - no separate scales/zp needed in result if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) { if (output_base_ptr) { @@ -653,14 +711,20 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo } else { result.weights = ov::Tensor(weight_type, node_shape); result.scales = ov::Tensor(ov::element::f16, scale_shape); - result.zp = ov::Tensor(weight_type, zp_shape); + if (use_bias && !layout.is_symmetric) { + // bias only has effect for asymmetric quant + result.zp = ov::Tensor(ov::element::f16, zp_shape); + } else { + result.zp = ov::Tensor(weight_type, zp_shape); + } } if (layout.is_requant && layout.requant_type.has_value()) { result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, result.weights, result.scales, result.zp); } else { - result.weight_node = extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp); + result.weight_node = + extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp, use_bias); } return result; diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 600b9c9f29..e4a02297ca 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -16,7 +16,8 @@ void extract_q4_0_data(const ggml_tensor * tensor, void extract_q4_1_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr); + ov::Tensor & zp_arr, + bool use_bias = false); void extract_q8_0_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, @@ -28,12 +29,14 @@ void unpack_256_4(const uint8_t* data, uint8_t* dst); void extract_q4_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr); + ov::Tensor & zp_arr, + bool use_bias = false); void extract_q5_k_data(const ggml_tensor * tensor, 
ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr); + ov::Tensor & zp_arr, + bool use_bias = false); void extract_q6_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, @@ -45,12 +48,14 @@ static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32; ov::Output make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, - size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); + size_t group_size = GGML_QUANTIZATION_GROUP_SIZE, + bool use_bias = false); ov::Output make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, - size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); + size_t group_size = GGML_QUANTIZATION_GROUP_SIZE, + bool use_bias = false); // Extract quantized weights from tensor and create weight subgraph // If weights/scales/zp are provided (non-empty), uses them as output buffers @@ -61,7 +66,8 @@ std::shared_ptr extract_quantized_weights( const void * data, // Source data pointer (may differ from tensor->data) ov::Tensor & weights, ov::Tensor & scales, - ov::Tensor & zp); + ov::Tensor & zp, + bool use_bias = false); // Use fp bias instead of quantized zero_point (for test-backend-ops) // Requantize weights from tensor to target format, writing to provided buffers // For F16 target, only weights buffer is used (scales/zp ignored) @@ -112,8 +118,9 @@ struct OvWeight { // Returns OvWeight with the weight node and optional quantized tensors OvWeight process_weight_tensor( const ggml_tensor * tensor, - const void * data, // Source data pointer (may differ from tensor->data) - void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation) + const void * data, // Source data pointer (may differ from tensor->data) + void * output_base_ptr = nullptr, // Base pointer for output buffers (or nullptr for internal allocation) + bool use_bias = false); // Use fp bias instead of quantized zero_point, only used in test-backend-ops void quantize_q4_0(const float * x, ov::Tensor & weights_arr, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 69fcb0eda4..41fbf27383 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -127,7 +127,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin if (pos_data[0] == 0) { infer_request->reset_state(); stateful_kv_size = pos_shape[3]; - } else if (stateful_kv_size == pos_data[0]) { + } else if (stateful_kv_size == static_cast(pos_data[0])) { stateful_kv_size += pos_shape[3]; } else { auto states = infer_request->query_state(); @@ -139,7 +139,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin state.set_state(new_state_tensor); } stateful_kv_size = pos_data[0] + 1; - } + } } decoder_end_time = ggml_time_us(); @@ -467,10 +467,10 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, return GGML_STATUS_SUCCESS; } - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); + bool naive = true; + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, naive); auto decoder = std::make_shared(cgraph, model_weights); auto input_model = std::make_shared(decoder); - auto naive = true; auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); if (getenv("GGML_OPENVINO_DUMP_IR")) { ov::serialize(model, "IR_naive.xml"); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6cdf6ae818..350bffc315 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -233,9 +233,7 @@ if (NOT 
LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC) llama_build_and_test(test-opt.cpp) endif() llama_build_and_test(test-gguf.cpp) -if (NOT GGML_OPENVINO) - llama_build_and_test(test-backend-ops.cpp) -endif() +llama_build_and_test(test-backend-ops.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model")
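
Notes on the bias path added in ggml-quants.cpp above: for the asymmetric formats (Q4_1, Q4_K, Q5_K) the existing zero-point path has to round -min/scale (or min/scale for the K-quants) to a 4- or 8-bit integer, whereas the new use_bias path keeps the f16 min and emits w*s + bias, matching ggml's reference dequantization of the block exactly. The standalone sketch below (plain C++, illustrative scale/min values, no OpenVINO dependency) shows the residual that zero-point rounding introduces for one Q4_1-style block; presumably this is the accuracy gap the bias path closes for test-backend-ops.

// Standalone comparison of the two dequantization forms for an asymmetric
// Q4_1-style block (illustrative scale/min, not real model data).
// Reference (ggml):   w*d + m
// Zero-point path:    (w - zp)*d, zp = round(-m/d) clamped to u4
// Bias path:          w*d + b,    b = m kept as f16
#include <cmath>
#include <cstdio>

int main() {
    const float scale = 0.013f;  // block scale d
    const float min   = -0.10f;  // block min m

    int zp = (int) std::lround(-min / scale);  // what the zero-point path stores
    if (zp < 0)  zp = 0;
    if (zp > 15) zp = 15;

    float max_err = 0.0f;
    for (int q = 0; q < 16; ++q) {             // every possible 4-bit weight
        float reference = q * scale + min;     // ggml dequantization
        float via_zp    = (q - zp) * scale;    // zero-point reconstruction
        float via_bias  = q * scale + min;     // bias reconstruction (exact by construction)
        max_err = std::fmax(max_err, std::fabs(via_zp - reference));
        (void) via_bias;
    }
    // The residual is |m + zp*d|, i.e. the rounding error of zp spread over the block.
    std::printf("zp = %d, max zero-point error = %g\n", zp, max_err);
    return 0;
}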
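
A related note on the `use_bias && !layout.is_symmetric` check in process_weight_tensor: for a symmetric format such as Q4_0 the ggml dequantization is (q - 8)*d, so its zero point is the constant 8 (and Q8_0 has none at all); a per-block f16 bias would only restate -8*d, which is why the scalar zero-point path is kept even in bias mode. A small sketch with illustrative values:

#include <cmath>
#include <cstdio>

int main() {
    const float d = 0.02f;                      // Q4_0 block scale (illustrative)
    float max_diff = 0.0f;
    for (int q = 0; q < 16; ++q) {
        float zp_form   = (q - 8) * d;          // symmetric zero-point form
        float bias_form = q * d + (-8.0f * d);  // the same value written as w*s + b, b = -8*d
        max_diff = std::fmax(max_diff, std::fabs(zp_form - bias_form));
    }
    std::printf("max difference between the two forms: %g\n", max_diff);  // rounding noise only
    return 0;
}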
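
The u4 zero-point packing kept in the non-bias branches of extract_q4_1_data and extract_q4_k_data stores two 4-bit values per byte: even block indices in the low nibble, odd indices in the high nibble of zp[i/2]. A minimal round-trip sketch of that packing scheme, written sequentially with hypothetical values:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<uint8_t> zp_vals = {3, 12, 0, 15, 7};          // per-block zero points (< 16)
    std::vector<uint8_t> packed((zp_vals.size() + 1) / 2, 0);  // two values per byte

    for (std::size_t i = 0; i < zp_vals.size(); ++i) {
        if (i % 2 == 0) {
            packed[i / 2] = zp_vals[i] & 0x0F;                 // lower nibble
        } else {
            packed[i / 2] |= (uint8_t) (zp_vals[i] << 4);      // upper nibble
        }
    }

    for (std::size_t i = 0; i < zp_vals.size(); ++i) {         // round-trip check
        uint8_t v = (i % 2 == 0) ? (packed[i / 2] & 0x0F) : (packed[i / 2] >> 4);
        assert(v == zp_vals[i]);
    }
    std::printf("nibble packing round-trip ok\n");
    return 0;
}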
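
The make_int4_weights / make_int8_weights subgraphs express group-wise dequantization by reshaping the weights to the packed group shape, applying a broadcasted Subtract/Multiply (zero-point path) or Multiply/Add (bias path) against per-group scales and zero points or biases of trailing size 1, and reshaping back to the original shape. The loop below is a plain-C++ rendering of the same arithmetic for the bias path, with illustrative sizes and values:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const std::size_t rows = 2, cols = 64, group_size = 32;
    const std::size_t groups = cols / group_size;

    std::vector<uint8_t> w(rows * cols, 5);              // quantized weights (u4 values)
    std::vector<float>   scale(rows * groups, 0.01f);    // one scale per group
    std::vector<float>   bias(rows * groups, -0.08f);    // one bias per group (Q4_1 "min")
    std::vector<float>   deq(rows * cols);

    for (std::size_t r = 0; r < rows; ++r) {
        for (std::size_t c = 0; c < cols; ++c) {
            std::size_t g = c / group_size;              // group this column falls into
            // bias path: w*s + b; the zero-point path would compute (w - zp)*s instead
            deq[r * cols + c] = w[r * cols + c] * scale[r * groups + g] + bias[r * groups + g];
        }
    }
    std::printf("deq[0] = %f\n", deq[0]);                // 5*0.01 - 0.08 = -0.03
    return 0;
}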
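
On the new GET_ROWS exclusion in ggml-openvino.cpp: the quoted ERR values (3.06e-7 and 1.97e-7 against a 1e-7 limit) look like the normalized mean-squared error that test-backend-ops reports when comparing a backend result with the reference. The sketch below computes that metric for made-up data, only to make the scale of the threshold concrete; it is not the test harness itself.

#include <cstddef>
#include <cstdio>
#include <vector>

// normalized mean-squared error between a backend result and the reference
static double nmse(const std::vector<double> & out, const std::vector<double> & ref) {
    double num = 0.0, den = 0.0;
    for (std::size_t i = 0; i < out.size(); ++i) {
        num += (out[i] - ref[i]) * (out[i] - ref[i]);
        den += ref[i] * ref[i];
    }
    return den > 0.0 ? num / den : 0.0;
}

int main() {
    std::vector<double> ref = {0.10, -0.20, 0.30, -0.40};            // made-up reference
    std::vector<double> out = {0.1000002, -0.2000001, 0.2999999, -0.4000003};
    std::printf("nmse = %.3g (limit quoted above: 1e-7)\n", nmse(out, ref));
    return 0;
}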