Use bias instead of zp in test-backend-ops

2026-02-13 17:33:07 +08:00 · 2026-02-13 17:33:07 +08:00 · 5525bac078
parent 2a6a95eb77
commit 5525bac078
9 changed files with 205 additions and 112 deletions
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@ -50,6 +50,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
    m_is_static(is_static),
    m_is_stateful(is_stateful),
    m_is_prefill(is_prefill),
    m_naive(false),
    m_prefill_chunk_size(prefill_chunk_size),
    m_cgraph(cgraph),
    m_model_weights(model_weights),
@ -93,9 +94,10 @@ void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) {
 GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights) {
    m_cgraph = cgraph;
    m_model_weights = model_weights;
    m_naive = true;
    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        auto * cur_node = cgraph->nodes[node_n];
-        set_input_output(cur_node, true);
+        set_input_output(cur_node);
    }
    for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
        m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node);
@ -134,7 +136,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::sh
    }
 }
-void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
+void GgmlOvDecoder::set_input_output(ggml_tensor * node) {
    NodeInfo current_node_info;
    auto node_name = std::string(node->name);
    auto node_output_name = node_name;
@ -169,7 +171,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
        current_node_info.node_inputs_names.push_back(src_name);
        // Add model inputs
-        if (!naive && !src->view_src) {
+        if (!m_naive && !src->view_src) {
            ggml_backend_buffer * buffer = src->buffer;
            if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
@ -206,7 +208,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) {
    }
    // Add model outputs
-    if (!naive) {
+    if (!m_naive) {
        // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches
        static std::set<std::string> debug_output_names = {};
        // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph
@ -509,12 +511,14 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
    return kv_param_res_names;
 }
-std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) {
+std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
    std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
    // static std::mutex weights_mutex;
    auto * nodes = cgraph->nodes;
    auto n_nodes = cgraph->n_nodes;
-    std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
+    // std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
    for (int node_i = 0; node_i < n_nodes; node_i++) {
        auto * node = nodes[node_i];
        for (int i = 0; i < GGML_MAX_SRC; i++) {
            auto * src = node->src[i];
            if (src == nullptr) {
@ -542,18 +546,19 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
                    //     }
                    // }
                    if (model_weights.find(src_name) == model_weights.end()) {
-                        auto weight_node = create_weight_node(src);
+                        auto weight_node = create_weight_node(src, naive);
                        weight_node->set_friendly_name(src_name);
                        model_weights[src_name] = weight_node;
                    }
                }
            }
        }
-    });
+    }
    // });
    return model_weights;
 }
-std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
+std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) {
    const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
    // Check if we have a pre-built constant from the OpenVINO backend buffer
@ -581,6 +586,11 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
        }
    }
    // There are three cases where we need to create a new weight node:
    // 1. weights are in openvino_host_buffer. Weight loading to host buffer will not trigger backend_buffer_set_tensor
    // 2. weights are in cpu/cpu_mapped buffer. Only token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used
    // 3. test-backend-ops. Buffers in test-backend-ops do not set USAGE_WEIGHT, so backend_buffer_set_tensor will not create weight nodes
    // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
    static const std::set<ggml_type> weight_types = {GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16,
                                                     GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
@ -592,6 +602,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
    OvWeight ov_weight;
    if (ggml_is_quantized(tensor->type)) {
        auto use_bias = naive;
        if (is_ov_buffer) {
            // For quantized weights, copy raw data to a temp buffer first because
            // process_weight_tensor reads from data and writes extracted results
@ -600,9 +611,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
            size_t raw_size = ggml_nbytes(tensor);
            std::vector<uint8_t> tmp(raw_size);
            memcpy(tmp.data(), tensor->data, raw_size);
-            ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
+            ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias);
        } else {
-            ov_weight = process_weight_tensor(tensor, tensor->data, nullptr);
+            ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias);
        }
    } else {
        // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@ -104,7 +104,7 @@ public:
    virtual ov::PartialShape get_output_shape(int node_idx) const override;
-    virtual ov::element::Type get_output_type(const int node_idx) const override;
+    virtual ov::element::Type get_output_type(int node_idx) const override;
    virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;
@ -184,9 +184,10 @@ public:
    static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
-    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor);
+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor, bool naive = false);
-    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph);
+    static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph,
                                                                                bool naive = false);
    const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;
@ -207,6 +208,7 @@ public:
    bool m_is_static = false;
    bool m_is_stateful = false;
    bool m_is_prefill = false;
    bool m_naive = false;
    int m_prefill_chunk_size = 0;
    static ov::Shape get_shape(const ggml_tensor * tensor);
@ -265,7 +267,7 @@ public:
    }
 private:
-    void set_input_output(ggml_tensor * node, bool naive = false);
+    void set_input_output(ggml_tensor * node);
    int compute_op_case(const ggml_tensor * node) const;
    void validate_cgraph() const;
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@ -6,6 +6,7 @@
 #include <cstring>
 #include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
 #include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
 #include <optional>
 ov::Core & ov_singleton_core() {
    static ov::Core core;
@ -164,7 +165,7 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() {
 }
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
-std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor) {
+std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant) {
    if (strncmp(tensor->name, "token_embd.weight", 17) == 0) {
        return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C);
    }
@ -174,6 +175,9 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
    if (ggml_openvino_is_npu()) {
        return ExtraQuantType::Q4_0_128;
    }
    if (no_requant) {
        return std::nullopt;
    }
    switch (tensor->type) {
    case GGML_TYPE_Q6_K:
    case GGML_TYPE_Q5_K:
@ -187,7 +191,7 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
 // Extracted Layout Calculation
 // =====================================================
-ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
+ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias) {
    ggml_openvino_extracted_layout layout = {};
    layout.is_symmetric = false;
@ -204,7 +208,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
    const size_t alignment = 64;  // Good for SIMD
    // Check if requantization is needed (NPU-specific)
-    auto requant_type = ggml_openvino_get_requant_type(tensor);
+    auto requant_type = ggml_openvino_get_requant_type(tensor, use_bias);
    if (requant_type.has_value()) {
        layout.is_requant = true;
        layout.requant_type = requant_type;
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.h
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h
@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name();
 bool ggml_openvino_is_npu();
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
-std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor);
+std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant = false);
 // =====================================================
 // OpenVINO Tensor Extra Types
@ -160,7 +160,7 @@ struct ggml_openvino_extracted_layout {
 };
 // Calculate the buffer layout for extracted quantized data
-ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);
+ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias = false);
 ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@ -922,6 +922,13 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
    default:
        break;
    }
    if (op->op == GGML_OP_GET_ROWS) {
        if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) {
            // ERR = 0.000000306 > 0.000000100   GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
            // ERR = 0.000000197 > 0.000000100   GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0)
            return true;
        }
    }
    return false;
 }
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@ -11,6 +11,7 @@
 #include <cstdint>
 #include <limits>
 #include <memory>
 #include <openvino/core/except.hpp>
 #include <openvino/core/node.hpp>
 #include <openvino/core/node_output.hpp>
 #include <openvino/core/parallel.hpp>
@ -18,6 +19,7 @@
 #include <openvino/core/type/element_type.hpp>
 #include <openvino/core/type/element_type_traits.hpp>
 #include <openvino/core/type/float16.hpp>
 #include <openvino/op/add.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/convert.hpp>
 #include <openvino/op/multiply.hpp>
@ -82,28 +84,41 @@ void extract_q4_0_data(const ggml_tensor * tensor,
 void extract_q4_1_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr) {
+                       ov::Tensor & zp_arr,
                       bool use_bias) {
    const uint64_t bytes_per_block = 20;  // 2 bytes scale, 2 bytes min, 32x0.5 byte weights
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
    auto * zp = static_cast<uint8_t *>(zp_arr.data());
-    ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+    if (use_bias) {
-        float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
+        // Store bias (min) directly as f16 instead of computing u4 zero points
-        float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
+        auto * bias = zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-        scales[i] = ov::float16(scale);
+        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
-        // zp = -min / scale (bias = min, so zp = -bias/scale)
+            float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
-        uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
+            float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
-        // Pack two 4-bit zero points per byte
+            scales[i] = ov::float16(scale);
-        if (i % 2 == 0) {
+            bias[i] = ov::float16(min);  // bias = min, dequant: w*s + bias
-            zp[i / 2] = zp_val & 0x0F;   // Lower nibble
+            unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
-        } else {
+        });
-            zp[i / 2] |= (zp_val << 4);  // Upper nibble
+    } else {
-        }
+        auto * zp = static_cast<uint8_t *>(zp_arr.data());
-        unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
+        ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
-    });
+            float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
            float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
            scales[i] = ov::float16(scale);
            // zp = -min / scale (bias = min, so zp = -bias/scale)
            uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
            // Pack two 4-bit zero points per byte
            if (i % 2 == 0) {
                zp[i / 2] = zp_val & 0x0F;   // Lower nibble
            } else {
                zp[i / 2] |= (zp_val << 4);  // Upper nibble
            }
            unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
        });
    }
 }
 // Extracts (weight, scales, zp) from Q8_0 tensors.
@ -164,14 +179,18 @@ void unpack_256_4(const uint8_t * data, uint8_t * dst) {
 void extract_q4_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr) {
+                       ov::Tensor & zp_arr,
                       bool use_bias) {
    const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
    const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    auto * zp = static_cast<uint8_t *>(zp_arr.data());
+
    // For bias path, zp_arr holds f16 bias values; for zp path, it holds packed u4 zero points
    auto * zp_u4 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
    auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
    ov::parallel_for(n_super_block, [&](size_t i) {
        uint8_t * block_data = data + i * bytes_per_block;
@ -205,17 +224,22 @@ void extract_q4_k_data(const ggml_tensor * tensor,
        min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4));
        min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4));
-        // Store scales and compute zero points
+        // Store scales and compute zero points or bias
        for (int j = 0; j < 8; j++) {
            scales[i * 8 + j] = ov::float16(scale_vals[j]);
-            // zp = min / scale (since bias = -min and zp = -bias/scale)
+            if (use_bias) {
-            uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
+                // Store bias = -min directly as f16, dequant: w*s + bias
-            // Pack two 4-bit zero points per byte
+                bias_f16[i * 8 + j] = ov::float16(-min_vals[j]);
            size_t idx = i * 8 + j;
            if (idx % 2 == 0) {
                zp[idx / 2] = zp_val & 0x0F;
            } else {
-                zp[idx / 2] |= (zp_val << 4);
+                // zp = min / scale (since bias = -min and zp = -bias/scale)
                uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
                // Pack two 4-bit zero points per byte
                size_t idx = i * 8 + j;
                if (idx % 2 == 0) {
                    zp_u4[idx / 2] = zp_val & 0x0F;
                } else {
                    zp_u4[idx / 2] |= (zp_val << 4);
                }
            }
        }
        unpack_256_4(block_data + 16, weights + i * 128);
@ -285,14 +309,18 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8
 void extract_q5_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr) {
+                       ov::Tensor & zp_arr,
                       bool use_bias) {
    const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
    const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
    auto * data = static_cast<uint8_t *>(tensor->data);
    auto * weights = static_cast<uint8_t *>(weights_arr.data());
    auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    auto * zp = static_cast<uint8_t *>(zp_arr.data());
+
    // For bias path, zp_arr holds f16 bias values; for zp path, it holds u8 zero points
    auto * zp_u8 = use_bias ? nullptr : static_cast<uint8_t *>(zp_arr.data());
    auto * bias_f16 = use_bias ? zp_arr.data<ov::element_type_traits<ov::element::f16>::value_type>() : nullptr;
    ov::parallel_for(n_super_block, [&](size_t i) {
        uint8_t * block_data = data + i * bytes_per_block;
@ -325,9 +353,15 @@ void extract_q5_k_data(const ggml_tensor * tensor,
            scales[i * 8 + is] = ov::float16(d1);
            scales[i * 8 + is + 1] = ov::float16(d2);
-            // zp = min / scale (since bias = -min and zp = -bias/scale)
+            if (use_bias) {
-            zp[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
+                // Store bias = -min directly as f16, dequant: w*s + bias
-            zp[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;
+                bias_f16[i * 8 + is] = ov::float16(-m1);
                bias_f16[i * 8 + is + 1] = ov::float16(-m2);
            } else {
                // zp = min / scale (since bias = -min and zp = -bias/scale)
                zp_u8[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
                zp_u8[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;
            }
            // Extract weights for first 32 elements (matching deq formula exactly)
            for (int l = 0; l < 32; ++l) {
@ -349,10 +383,14 @@ void extract_q5_k_data(const ggml_tensor * tensor,
 // TODO Reorder for make_intX_weights
-ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) {
+ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
                                       ov::Tensor & scales,
                                       ov::Tensor & zp,
                                       size_t group_size,
                                       bool use_bias) {
    ov::Shape orig_shape = weight.get_shape();
-    // Expand dimensions for scales and zp
+    // Expand dimensions for scales and zp/bias
    auto scale_shape = scales.get_shape();
    auto zp_shape = zp.get_shape();
    bool is_scalar_zp = zp_shape.empty();  // Symmetric quantization
@ -377,36 +415,45 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight, ov::Tensor & scales,
                                                               static_cast<uint8_t *>(weight.data()), nullptr);
    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
    auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
    // Zero point is already in U8 format from extraction
    auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
    float zp_value;
    if (ov::op::util::get_single_value(zero_point, zp_value)) {
        zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
    }
    // Quantization operations
    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
    auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
-    auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
+    ov::Output<ov::Node> result;
-    ov::Output<ov::Node> w_zp_s =
+    if (use_bias && !is_scalar_zp) {
-        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+        // Bias path: w * s + b (zp tensor holds f16 bias values)
        auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
        auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
        result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
    } else {
        // Zero point path: (w - zp) * s
        auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
        float zp_value;
        if (ov::op::util::get_single_value(zero_point, zp_value)) {
            zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});
        }
        auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
        auto w_zp =
            std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
        result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
    }
    if (packed_shape.size() != 2) {
        // If not requantized channel-wise case, reshape back to original shape
        auto final_shape =
            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
-        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+        result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
    }
-    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
+    return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
 }
-ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) {
+ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
                                       ov::Tensor & scales,
                                       ov::Tensor & zp,
                                       size_t group_size,
                                       bool use_bias) {
    ov::Shape orig_weight_shape = weight.get_shape();
-    // Expand dimensions for scales and zp
+    // Expand dimensions for scales and zp/bias
    ov::Shape scale_shape = scales.get_shape();
    auto zp_shape = zp.get_shape();
    bool is_scalar_zp = zp_shape.empty();  // Symmetric quantization
@ -431,32 +478,35 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight, ov::Tensor & scales,
                                                               static_cast<uint8_t *>(weight.data()), nullptr);
    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
    // Zero point is already in U4 format from extraction
    auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
    float zp_value;
    if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
        zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
    }
    auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
    auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
-    // Perform dequantization
+    ov::Output<ov::Node> result;
-    auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
+    if (use_bias && !is_scalar_zp) {
-
+        // Bias path: w * s + b (zp tensor holds f16 bias values)
-    ov::Output<ov::Node> w_zp_s =
+        auto bias_f16 = std::make_shared<ov::op::v0::Constant>(zp);
-        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+        auto w_s = std::make_shared<ov::op::v1::Multiply>(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY);
        result = std::make_shared<ov::op::v1::Add>(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY);
    } else {
        // Zero point path: (w - zp) * s
        auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
        float zp_value;
        if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
            zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
        }
        auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
        auto w_zp =
            std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
        result = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
    }
    if (packed_shape.size() != 2) {
        // If not requantized channel-wise case, reshape back to original shape
        auto final_shape = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_weight_shape.size()},
                                                                  orig_weight_shape);
-
+        result = std::make_shared<ov::op::v1::Reshape>(result, final_shape, false);
        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
    }
-    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
+    return std::make_shared<ov::op::v0::Convert>(result, ov::element::f32);
 }
 // Extract quantized weights from tensor and create weight subgraph
@ -464,7 +514,8 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
                                                    const void * data,
                                                    ov::Tensor & weights,
                                                    ov::Tensor & scales,
-                                                    ov::Tensor & zp) {
+                                                    ov::Tensor & zp,
                                                    bool use_bias) {
    // Create a temporary tensor for extraction functions that read from tensor->data
    ggml_tensor temp_tensor = *tensor;
    temp_tensor.data = const_cast<void *>(data);
@ -499,10 +550,10 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
        extract_q4_0_data(&temp_tensor, weights, scales, zp);
        break;
    case GGML_TYPE_Q4_1:
-        extract_q4_1_data(&temp_tensor, weights, scales, zp);
+        extract_q4_1_data(&temp_tensor, weights, scales, zp, use_bias);
        break;
    case GGML_TYPE_Q4_K:
-        extract_q4_k_data(&temp_tensor, weights, scales, zp);
+        extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias);
        break;
    case GGML_TYPE_Q8_0:
        extract_q8_0_data(&temp_tensor, weights, scales, zp);
@ -511,7 +562,7 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
        extract_q6_k_data(&temp_tensor, weights, scales, zp);
        break;
    case GGML_TYPE_Q5_K:
-        extract_q5_k_data(&temp_tensor, weights, scales, zp);
+        extract_q5_k_data(&temp_tensor, weights, scales, zp, use_bias);
        break;
    default:
        throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));
@ -520,9 +571,9 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
    // Create the OpenVINO weight subgraph
    ov::Output<ov::Node> weight_node;
    if (is_u4) {
-        weight_node = make_int4_weights(weights, scales, zp, weights_per_block);
+        weight_node = make_int4_weights(weights, scales, zp, weights_per_block, use_bias);
    } else {
-        weight_node = make_int8_weights(weights, scales, zp, weights_per_block);
+        weight_node = make_int8_weights(weights, scales, zp, weights_per_block, use_bias);
    }
    auto result = weight_node.get_node_shared_ptr();
@ -576,7 +627,7 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
    return result;
 }
-OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
+OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr, bool use_bias) {
    GGML_ASSERT(tensor != nullptr);
    GGML_ASSERT(data != nullptr);
@ -619,12 +670,19 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
        OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
    }
-    result.layout = ggml_openvino_get_extracted_layout(tensor);
+    result.layout = ggml_openvino_get_extracted_layout(tensor, use_bias);
    const auto & layout = result.layout;
    if (layout.total_size == 0) {
        OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
    }
    if (use_bias) {
        OPENVINO_ASSERT(!layout.is_requant,
                        "use_bias is only used for test-backend-ops, which should not have requantization");
        // bias node will be created on the fly and not use backend buffer
        output_base_ptr = nullptr;
    }
    // F16 requant path - no separate scales/zp needed in result
    if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
        if (output_base_ptr) {
@ -653,14 +711,20 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo
    } else {
        result.weights = ov::Tensor(weight_type, node_shape);
        result.scales = ov::Tensor(ov::element::f16, scale_shape);
-        result.zp = ov::Tensor(weight_type, zp_shape);
+        if (use_bias && !layout.is_symmetric) {
+            // bias only has effect for asymmetric quant
+            result.zp = ov::Tensor(ov::element::f16, zp_shape);
+        } else {
+            result.zp = ov::Tensor(weight_type, zp_shape);
+        }
    }
    if (layout.is_requant && layout.requant_type.has_value()) {
        result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
                                                   result.weights, result.scales, result.zp);
    } else {
-        result.weight_node = extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp);
+        result.weight_node =
+            extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp, use_bias);
    }
    return result;
--- a/ggml/src/ggml-openvino/ggml-quants.hpp
+++ b/ggml/src/ggml-openvino/ggml-quants.hpp
@ -16,7 +16,8 @@ void extract_q4_0_data(const ggml_tensor * tensor,
 void extract_q4_1_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr);
+                       ov::Tensor & zp_arr,
+                       bool use_bias = false);
 void extract_q8_0_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
@ -28,12 +29,14 @@ void unpack_256_4(const uint8_t* data, uint8_t* dst);
 void extract_q4_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr);
+                       ov::Tensor & zp_arr,
+                       bool use_bias = false);
 void extract_q5_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
                       ov::Tensor & scales_arr,
-                       ov::Tensor & zp_arr);
+                       ov::Tensor & zp_arr,
+                       bool use_bias = false);
 void extract_q6_k_data(const ggml_tensor * tensor,
                       ov::Tensor & weights_arr,
@ -45,12 +48,14 @@ static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32;
 ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
                                       ov::Tensor & scales,
                                       ov::Tensor & zp,
-                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
+                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
+                                       bool use_bias = false);
 ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
                                       ov::Tensor & scales,
                                       ov::Tensor & zp,
-                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
+                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE,
+                                       bool use_bias = false);
 // Extract quantized weights from tensor and create weight subgraph
 // If weights/scales/zp are provided (non-empty), uses them as output buffers
@ -61,7 +66,8 @@ std::shared_ptr<ov::Node> extract_quantized_weights(
    const void * data,  // Source data pointer (may differ from tensor->data)
    ov::Tensor & weights,
    ov::Tensor & scales,
-    ov::Tensor & zp);
+    ov::Tensor & zp,
+    bool use_bias = false);  // Use fp bias instead of quantized zero_point (for test-backend-ops)
 // Requantize weights from tensor to target format, writing to provided buffers
 // For F16 target, only weights buffer is used (scales/zp ignored)
@ -112,8 +118,9 @@ struct OvWeight {
 // Returns OvWeight with the weight node and optional quantized tensors
 OvWeight process_weight_tensor(
    const ggml_tensor * tensor,
-    const void * data,                  // Source data pointer (may differ from tensor->data)
+    const void * data,                 // Source data pointer (may differ from tensor->data)
-    void * output_base_ptr = nullptr);  // Base pointer for output buffers (or nullptr for internal allocation)
+    void * output_base_ptr = nullptr,  // Base pointer for output buffers (or nullptr for internal allocation)
+    bool use_bias = false);            // Use fp bias instead of quantized zero_point, only used in test-backend-ops
 void quantize_q4_0(const float * x,
                   ov::Tensor & weights_arr,
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@ -127,7 +127,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
                if (pos_data[0] == 0) {
                    infer_request->reset_state();
                    stateful_kv_size = pos_shape[3];
-                } else if (stateful_kv_size == pos_data[0]) {
+                } else if (stateful_kv_size == static_cast<size_t>(pos_data[0])) {
                    stateful_kv_size += pos_shape[3];
                } else {
                    auto states = infer_request->query_state();
@ -139,7 +139,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
                        state.set_state(new_state_tensor);
                    }
                    stateful_kv_size = pos_data[0] + 1;
-                 }
+                }
            }
            decoder_end_time = ggml_time_us();
@ -467,10 +467,10 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
        return GGML_STATUS_SUCCESS;
    }
-    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
+    bool naive = true;
+    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, naive);
    auto decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights);
    auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(decoder);
-    auto naive = true;
    auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive);
    if (getenv("GGML_OPENVINO_DUMP_IR")) {
        ov::serialize(model, "IR_naive.xml");
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -233,9 +233,7 @@ if (NOT LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC)
  llama_build_and_test(test-opt.cpp)
 endif()
 llama_build_and_test(test-gguf.cpp)
-if (NOT GGML_OPENVINO)
+llama_build_and_test(test-backend-ops.cpp)
-    llama_build_and_test(test-backend-ops.cpp)
-endif()
 llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
 llama_build_and_test(test-autorelease.cpp       LABEL "model")