diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 99776e1bec..107b510f3b 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -50,6 +50,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_is_static(is_static), m_is_stateful(is_stateful), m_is_prefill(is_prefill), + m_naive(false), m_prefill_chunk_size(prefill_chunk_size), m_cgraph(cgraph), m_model_weights(model_weights), @@ -93,9 +94,10 @@ void GgmlOvDecoder::update_io(ggml_cgraph * cgraph) { GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights) { m_cgraph = cgraph; m_model_weights = model_weights; + m_naive = true; for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { auto * cur_node = cgraph->nodes[node_n]; - set_input_output(cur_node, true); + set_input_output(cur_node); } for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { m_node_info_list[node_n].node_op_case = compute_op_case(m_node_info_list[node_n].node); @@ -134,7 +136,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::mapname); auto node_output_name = node_name; @@ -169,7 +171,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { current_node_info.node_inputs_names.push_back(src_name); // Add model inputs - if (!naive && !src->view_src) { + if (!m_naive && !src->view_src) { ggml_backend_buffer * buffer = src->buffer; if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { @@ -206,7 +208,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor * node, bool naive) { } // Add model outputs - if (!naive) { + if (!m_naive) { // Model outputs are tensors with GGML_TENSOR_FLAG_OUTPUT flag and kv_caches static std::set debug_output_names = {}; // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph @@ -509,12 +511,14 @@ std::map GgmlOvDecoder::get_kv_param_res_names() const return kv_param_res_names; } -std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) { +std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) { std::map> model_weights; // static std::mutex weights_mutex; auto * nodes = cgraph->nodes; auto n_nodes = cgraph->n_nodes; - std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) { + // std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) { + for (int node_i = 0; node_i < n_nodes; node_i++) { + auto * node = nodes[node_i]; for (int i = 0; i < GGML_MAX_SRC; i++) { auto * src = node->src[i]; if (src == nullptr) { @@ -542,18 +546,19 @@ std::map> GgmlOvDecoder::create_weight_no // } // } if (model_weights.find(src_name) == model_weights.end()) { - auto weight_node = create_weight_node(src); + auto weight_node = create_weight_node(src, naive); weight_node->set_friendly_name(src_name); model_weights[src_name] = weight_node; } } } } - }); + } + // }); return model_weights; } -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) { +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, bool naive) { const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer); // Check if we have a pre-built constant from the OpenVINO backend buffer @@ -581,6 +586,11 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor } } + // There are three cases where we need to create a new weight node: + // 1. weights are in openvino_host_buffer. 
Weight loading to host buffer will not trigger backend_buffer_set_tensor + // 2. weights are in cpu/cpu_mapped buffer. On token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used + // 3. test-backend-ops. buffers in test-backend-ops does not set USAGE_WEIGHT so backend_buffer_set_tensor will not create weight node + // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name); static const std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, @@ -592,6 +602,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor OvWeight ov_weight; if (ggml_is_quantized(tensor->type)) { + auto use_bias = naive; if (is_ov_buffer) { // For quantized weights, copy raw data to a temp buffer first because // process_weight_tensor reads from data and writes extracted results @@ -600,9 +611,9 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor size_t raw_size = ggml_nbytes(tensor); std::vector tmp(raw_size); memcpy(tmp.data(), tensor->data, raw_size); - ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data); + ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data, use_bias); } else { - ov_weight = process_weight_tensor(tensor, tensor->data, nullptr); + ov_weight = process_weight_tensor(tensor, tensor->data, nullptr, use_bias); } } else { // For non-quantized weights (F16/F32/BF16), data is already in tensor->data. diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 59311a6121..ec6062a166 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -104,7 +104,7 @@ public: virtual ov::PartialShape get_output_shape(int node_idx) const override; - virtual ov::element::Type get_output_type(const int node_idx) const override; + virtual ov::element::Type get_output_type(int node_idx) const override; virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override; @@ -184,9 +184,10 @@ public: static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); - static std::shared_ptr create_weight_node(ggml_tensor * tensor); + static std::shared_ptr create_weight_node(ggml_tensor * tensor, bool naive = false); - static std::map> create_weight_nodes(ggml_cgraph * cgraph); + static std::map> create_weight_nodes(ggml_cgraph * cgraph, + bool naive = false); const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const; @@ -207,6 +208,7 @@ public: bool m_is_static = false; bool m_is_stateful = false; bool m_is_prefill = false; + bool m_naive = false; int m_prefill_chunk_size = 0; static ov::Shape get_shape(const ggml_tensor * tensor); @@ -265,7 +267,7 @@ public: } private: - void set_input_output(ggml_tensor * node, bool naive = false); + void set_input_output(ggml_tensor * node); int compute_op_case(const ggml_tensor * node) const; void validate_cgraph() const; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 7a48ed1b65..0b8b2d3743 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -6,6 +6,7 @@ #include #include #include +#include ov::Core & ov_singleton_core() { static ov::Core core; @@ -164,7 +165,7 @@ clEnqueueMemcpyINTEL_fn ggml_openvino_get_clEnqueueMemcpyINTEL() { } // Get requantization type for a tensor type (returns nullopt if no requant needed) -std::optional 
ggml_openvino_get_requant_type(const ggml_tensor * tensor) { +std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant) { if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C); } @@ -174,6 +175,9 @@ std::optional ggml_openvino_get_requant_type(const ggml_tensor * if (ggml_openvino_is_npu()) { return ExtraQuantType::Q4_0_128; } + if (no_requant) { + return std::nullopt; + } switch (tensor->type) { case GGML_TYPE_Q6_K: case GGML_TYPE_Q5_K: @@ -187,7 +191,7 @@ std::optional ggml_openvino_get_requant_type(const ggml_tensor * // Extracted Layout Calculation // ===================================================== -ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { +ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias) { ggml_openvino_extracted_layout layout = {}; layout.is_symmetric = false; @@ -204,7 +208,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten const size_t alignment = 64; // Good for SIMD // Check if requantization is needed (NPU-specific) - auto requant_type = ggml_openvino_get_requant_type(tensor); + auto requant_type = ggml_openvino_get_requant_type(tensor, use_bias); if (requant_type.has_value()) { layout.is_requant = true; layout.requant_type = requant_type; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 9ce4667154..441a62e9d3 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -83,7 +83,7 @@ const std::string & ggml_openvino_get_device_name(); bool ggml_openvino_is_npu(); // Get requantization type for a tensor type (returns nullopt if no requant needed) -std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor); +std::optional ggml_openvino_get_requant_type(const ggml_tensor * tensor, bool no_requant = false); // ===================================================== // OpenVINO Tensor Extra Types @@ -160,7 +160,7 @@ struct ggml_openvino_extracted_layout { }; // Calculate the buffer layout for extracted quantized data -ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor); +ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor, bool use_bias = false); ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 780d17b750..948ff2cc78 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -922,6 +922,13 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { default: break; } + if (op->op == GGML_OP_GET_ROWS) { + if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) { + // ERR = 0.000000306 > 0.000000100 GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0) + // ERR = 0.000000197 > 0.000000100 GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0) + return true; + } + } return false; } diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 10909cbc1e..3628f7a959 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -11,6 +11,7 @@ #include #include 
#include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -82,28 +84,41 @@ void extract_q4_0_data(const ggml_tensor * tensor, void extract_q4_1_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr) { + ov::Tensor & zp_arr, + bool use_bias) { const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes min, 32x0.5 byte weights auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); - ov::parallel_for(scales_arr.get_size(), [&](size_t i) { - float scale = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)))); - float min = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)))); - scales[i] = ov::float16(scale); - // zp = -min / scale (bias = min, so zp = -bias/scale) - uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0; - // Pack two 4-bit zero points per byte - if (i % 2 == 0) { - zp[i / 2] = zp_val & 0x0F; // Lower nibble - } else { - zp[i / 2] |= (zp_val << 4); // Upper nibble - } - unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); - }); + if (use_bias) { + // Store bias (min) directly as f16 instead of computing u4 zero points + auto * bias = zp_arr.data::value_type>(); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + float scale = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)))); + float min = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)))); + scales[i] = ov::float16(scale); + bias[i] = ov::float16(min); // bias = min, dequant: w*s + bias + unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); + }); + } else { + auto * zp = static_cast(zp_arr.data()); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + float scale = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)))); + float min = static_cast(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)))); + scales[i] = ov::float16(scale); + // zp = -min / scale (bias = min, so zp = -bias/scale) + uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0; + // Pack two 4-bit zero points per byte + if (i % 2 == 0) { + zp[i / 2] = zp_val & 0x0F; // Lower nibble + } else { + zp[i / 2] |= (zp_val << 4); // Upper nibble + } + unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16); + }); + } } // Extracts (weight, scales, zp) from Q8_0 tensors. @@ -164,14 +179,18 @@ void unpack_256_4(const uint8_t * data, uint8_t * dst) { void extract_q4_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr) { + ov::Tensor & zp_arr, + bool use_bias) { const uint64_t bytes_per_block = 2 + 2 + 12 + 128; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); + + // For bias path, zp_arr holds f16 bias values; for zp path, it holds packed u4 zero points + auto * zp_u4 = use_bias ? nullptr : static_cast(zp_arr.data()); + auto * bias_f16 = use_bias ? 
zp_arr.data::value_type>() : nullptr; ov::parallel_for(n_super_block, [&](size_t i) { uint8_t * block_data = data + i * bytes_per_block; @@ -205,17 +224,22 @@ void extract_q4_k_data(const ggml_tensor * tensor, min_vals[6] = scale_mins * static_cast((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4)); min_vals[7] = scale_mins * static_cast((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4)); - // Store scales and compute zero points + // Store scales and compute zero points or bias for (int j = 0; j < 8; j++) { scales[i * 8 + j] = ov::float16(scale_vals[j]); - // zp = min / scale (since bias = -min and zp = -bias/scale) - uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0; - // Pack two 4-bit zero points per byte - size_t idx = i * 8 + j; - if (idx % 2 == 0) { - zp[idx / 2] = zp_val & 0x0F; + if (use_bias) { + // Store bias = -min directly as f16, dequant: w*s + bias + bias_f16[i * 8 + j] = ov::float16(-min_vals[j]); } else { - zp[idx / 2] |= (zp_val << 4); + // zp = min / scale (since bias = -min and zp = -bias/scale) + uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0; + // Pack two 4-bit zero points per byte + size_t idx = i * 8 + j; + if (idx % 2 == 0) { + zp_u4[idx / 2] = zp_val & 0x0F; + } else { + zp_u4[idx / 2] |= (zp_val << 4); + } } } unpack_256_4(block_data + 16, weights + i * 128); @@ -285,14 +309,18 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8 void extract_q5_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr) { + ov::Tensor & zp_arr, + bool use_bias) { const uint64_t bytes_per_block = 4 + 12 + 32 + 128; const uint64_t n_super_block = tensor->nb[3] / bytes_per_block; auto * data = static_cast(tensor->data); auto * weights = static_cast(weights_arr.data()); auto * scales = scales_arr.data::value_type>(); - auto * zp = static_cast(zp_arr.data()); + + // For bias path, zp_arr holds f16 bias values; for zp path, it holds u8 zero points + auto * zp_u8 = use_bias ? nullptr : static_cast(zp_arr.data()); + auto * bias_f16 = use_bias ? zp_arr.data::value_type>() : nullptr; ov::parallel_for(n_super_block, [&](size_t i) { uint8_t * block_data = data + i * bytes_per_block; @@ -325,9 +353,15 @@ void extract_q5_k_data(const ggml_tensor * tensor, scales[i * 8 + is] = ov::float16(d1); scales[i * 8 + is + 1] = ov::float16(d2); - // zp = min / scale (since bias = -min and zp = -bias/scale) - zp[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0; - zp[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0; + if (use_bias) { + // Store bias = -min directly as f16, dequant: w*s + bias + bias_f16[i * 8 + is] = ov::float16(-m1); + bias_f16[i * 8 + is + 1] = ov::float16(-m2); + } else { + // zp = min / scale (since bias = -min and zp = -bias/scale) + zp_u8[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0; + zp_u8[i * 8 + is + 1] = (d2 != 0.0f) ? 
(uint8_t) std::round(m2 / d2) : 0; + } // Extract weights for first 32 elements (matching deq formula exactly) for (int l = 0; l < 32; ++l) { @@ -349,10 +383,14 @@ void extract_q5_k_data(const ggml_tensor * tensor, // TODO Reorder for make_intX_weights -ov::Output make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) { +ov::Output make_int8_weights(ov::Tensor & weight, + ov::Tensor & scales, + ov::Tensor & zp, + size_t group_size, + bool use_bias) { ov::Shape orig_shape = weight.get_shape(); - // Expand dimensions for scales and zp + // Expand dimensions for scales and zp/bias auto scale_shape = scales.get_shape(); auto zp_shape = zp.get_shape(); bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization @@ -377,36 +415,45 @@ ov::Output make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, static_cast(weight.data()), nullptr); weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto scales_f16 = std::make_shared(scales); - - // Zero point is already in U8 format from extraction - auto zero_point = std::make_shared(zp); - float zp_value; - if (ov::op::util::get_single_value(zero_point, zp_value)) { - zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); - } - - // Quantization operations auto weights_f16 = std::make_shared(weights_node, ov::element::f16); - auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); - auto w_zp = std::make_shared(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); - ov::Output w_zp_s = - std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + ov::Output result; + if (use_bias && !is_scalar_zp) { + // Bias path: w * s + b (zp tensor holds f16 bias values) + auto bias_f16 = std::make_shared(zp); + auto w_s = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + } else { + // Zero point path: (w - zp) * s + auto zero_point = std::make_shared(zp); + float zp_value; + if (ov::op::util::get_single_value(zero_point, zp_value)) { + zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value}); + } + auto zero_point_f16 = std::make_shared(zero_point, ov::element::f16); + auto w_zp = + std::make_shared(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + } if (packed_shape.size() != 2) { // If not requantized channel-wise case, reshape back to original shape auto final_shape = std::make_shared(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape); - w_zp_s = std::make_shared(w_zp_s, final_shape, false); + result = std::make_shared(result, final_shape, false); } - return std::make_shared(w_zp_s, ov::element::f32); + return std::make_shared(result, ov::element::f32); } -ov::Output make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) { +ov::Output make_int4_weights(ov::Tensor & weight, + ov::Tensor & scales, + ov::Tensor & zp, + size_t group_size, + bool use_bias) { ov::Shape orig_weight_shape = weight.get_shape(); - // Expand dimensions for scales and zp + // Expand dimensions for scales and zp/bias ov::Shape scale_shape = scales.get_shape(); auto zp_shape = zp.get_shape(); bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization @@ -431,32 +478,35 @@ ov::Output make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, static_cast(weight.data()), nullptr); 
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight; auto weights_f16 = std::make_shared(weights_node, ov::element::f16); - - // Zero point is already in U4 format from extraction - auto zero_points_node = std::make_shared(zp); - float zp_value; - if (ov::op::util::get_single_value(zero_points_node, zp_value)) { - zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); - } - auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); - auto scales_f16 = std::make_shared(scales); - // Perform dequantization - auto w_zp = std::make_shared(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); - - ov::Output w_zp_s = - std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + ov::Output result; + if (use_bias && !is_scalar_zp) { + // Bias path: w * s + b (zp tensor holds f16 bias values) + auto bias_f16 = std::make_shared(zp); + auto w_s = std::make_shared(weights_f16, scales_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_s, bias_f16, ov::op::AutoBroadcastType::NUMPY); + } else { + // Zero point path: (w - zp) * s + auto zero_points_node = std::make_shared(zp); + float zp_value; + if (ov::op::util::get_single_value(zero_points_node, zp_value)) { + zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value}); + } + auto zero_points_f16 = std::make_shared(zero_points_node, ov::element::f16); + auto w_zp = + std::make_shared(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY); + result = std::make_shared(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY); + } if (packed_shape.size() != 2) { // If not requantized channel-wise case, reshape back to original shape auto final_shape = std::make_shared(ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape); - - w_zp_s = std::make_shared(w_zp_s, final_shape, false); + result = std::make_shared(result, final_shape, false); } - return std::make_shared(w_zp_s, ov::element::f32); + return std::make_shared(result, ov::element::f32); } // Extract quantized weights from tensor and create weight subgraph @@ -464,7 +514,8 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, const void * data, ov::Tensor & weights, ov::Tensor & scales, - ov::Tensor & zp) { + ov::Tensor & zp, + bool use_bias) { // Create a temporary tensor for extraction functions that read from tensor->data ggml_tensor temp_tensor = *tensor; temp_tensor.data = const_cast(data); @@ -499,10 +550,10 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, extract_q4_0_data(&temp_tensor, weights, scales, zp); break; case GGML_TYPE_Q4_1: - extract_q4_1_data(&temp_tensor, weights, scales, zp); + extract_q4_1_data(&temp_tensor, weights, scales, zp, use_bias); break; case GGML_TYPE_Q4_K: - extract_q4_k_data(&temp_tensor, weights, scales, zp); + extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias); break; case GGML_TYPE_Q8_0: extract_q8_0_data(&temp_tensor, weights, scales, zp); @@ -511,7 +562,7 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, extract_q6_k_data(&temp_tensor, weights, scales, zp); break; case GGML_TYPE_Q5_K: - extract_q5_k_data(&temp_tensor, weights, scales, zp); + extract_q5_k_data(&temp_tensor, weights, scales, zp, use_bias); break; default: throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type))); @@ -520,9 +571,9 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * 
tensor, // Create the OpenVINO weight subgraph ov::Output weight_node; if (is_u4) { - weight_node = make_int4_weights(weights, scales, zp, weights_per_block); + weight_node = make_int4_weights(weights, scales, zp, weights_per_block, use_bias); } else { - weight_node = make_int8_weights(weights, scales, zp, weights_per_block); + weight_node = make_int8_weights(weights, scales, zp, weights_per_block, use_bias); } auto result = weight_node.get_node_shared_ptr(); @@ -576,7 +627,7 @@ std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, return result; } -OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) { +OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr, bool use_bias) { GGML_ASSERT(tensor != nullptr); GGML_ASSERT(data != nullptr); @@ -619,12 +670,19 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type)); } - result.layout = ggml_openvino_get_extracted_layout(tensor); + result.layout = ggml_openvino_get_extracted_layout(tensor, use_bias); const auto & layout = result.layout; if (layout.total_size == 0) { OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type)); } + if (use_bias) { + OPENVINO_ASSERT(!layout.is_requant, + "use_bias is only used for test-backend-ops, which should not have requantization"); + // bias node will be created on the fly and not use backend buffer + output_base_ptr = nullptr; + } + // F16 requant path - no separate scales/zp needed in result if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) { if (output_base_ptr) { @@ -653,14 +711,20 @@ OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, vo } else { result.weights = ov::Tensor(weight_type, node_shape); result.scales = ov::Tensor(ov::element::f16, scale_shape); - result.zp = ov::Tensor(weight_type, zp_shape); + if (use_bias && !layout.is_symmetric) { + // bias only has effect for asymmetric quant + result.zp = ov::Tensor(ov::element::f16, zp_shape); + } else { + result.zp = ov::Tensor(weight_type, zp_shape); + } } if (layout.is_requant && layout.requant_type.has_value()) { result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, result.weights, result.scales, result.zp); } else { - result.weight_node = extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp); + result.weight_node = + extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp, use_bias); } return result; diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 600b9c9f29..e4a02297ca 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -16,7 +16,8 @@ void extract_q4_0_data(const ggml_tensor * tensor, void extract_q4_1_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr); + ov::Tensor & zp_arr, + bool use_bias = false); void extract_q8_0_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, @@ -28,12 +29,14 @@ void unpack_256_4(const uint8_t* data, uint8_t* dst); void extract_q4_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr); + ov::Tensor & zp_arr, + bool use_bias = false); void extract_q5_k_data(const ggml_tensor * tensor, 
ov::Tensor & weights_arr, ov::Tensor & scales_arr, - ov::Tensor & zp_arr); + ov::Tensor & zp_arr, + bool use_bias = false); void extract_q6_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, @@ -45,12 +48,14 @@ static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32; ov::Output make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, - size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); + size_t group_size = GGML_QUANTIZATION_GROUP_SIZE, + bool use_bias = false); ov::Output make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, - size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); + size_t group_size = GGML_QUANTIZATION_GROUP_SIZE, + bool use_bias = false); // Extract quantized weights from tensor and create weight subgraph // If weights/scales/zp are provided (non-empty), uses them as output buffers @@ -61,7 +66,8 @@ std::shared_ptr extract_quantized_weights( const void * data, // Source data pointer (may differ from tensor->data) ov::Tensor & weights, ov::Tensor & scales, - ov::Tensor & zp); + ov::Tensor & zp, + bool use_bias = false); // Use fp bias instead of quantized zero_point (for test-backend-ops) // Requantize weights from tensor to target format, writing to provided buffers // For F16 target, only weights buffer is used (scales/zp ignored) @@ -112,8 +118,9 @@ struct OvWeight { // Returns OvWeight with the weight node and optional quantized tensors OvWeight process_weight_tensor( const ggml_tensor * tensor, - const void * data, // Source data pointer (may differ from tensor->data) - void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation) + const void * data, // Source data pointer (may differ from tensor->data) + void * output_base_ptr = nullptr, // Base pointer for output buffers (or nullptr for internal allocation) + bool use_bias = false); // Use fp bias instead of quantized zero_point, only used in test-backend-ops void quantize_q4_0(const float * x, ov::Tensor & weights_arr, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 69fcb0eda4..41fbf27383 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -127,7 +127,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin if (pos_data[0] == 0) { infer_request->reset_state(); stateful_kv_size = pos_shape[3]; - } else if (stateful_kv_size == pos_data[0]) { + } else if (stateful_kv_size == static_cast(pos_data[0])) { stateful_kv_size += pos_shape[3]; } else { auto states = infer_request->query_state(); @@ -139,7 +139,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin state.set_state(new_state_tensor); } stateful_kv_size = pos_data[0] + 1; - } + } } decoder_end_time = ggml_time_us(); @@ -467,10 +467,10 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, return GGML_STATUS_SUCCESS; } - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); + bool naive = true; + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, naive); auto decoder = std::make_shared(cgraph, model_weights); auto input_model = std::make_shared(decoder); - auto naive = true; auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); if (getenv("GGML_OPENVINO_DUMP_IR")) { ov::serialize(model, "IR_naive.xml"); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6cdf6ae818..350bffc315 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -233,9 +233,7 @@ if (NOT 
LLAMA_SANITIZE_ADDRESS AND NOT GGML_SCHED_NO_REALLOC) llama_build_and_test(test-opt.cpp) endif() llama_build_and_test(test-gguf.cpp) -if (NOT GGML_OPENVINO) - llama_build_and_test(test-backend-ops.cpp) -endif() +llama_build_and_test(test-backend-ops.cpp) llama_build_and_test(test-model-load-cancel.cpp LABEL "model") llama_build_and_test(test-autorelease.cpp LABEL "model")
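
Notes on the bias path added in ggml-quants.cpp above: for the asymmetric formats (Q4_1, Q4_K, Q5_K) the existing zero-point path has to round -min/scale (or min/scale for the K-quants) to a 4- or 8-bit integer, whereas the new use_bias path keeps the f16 min and emits w*s + bias, matching ggml's reference dequantization of the block exactly. The standalone sketch below (plain C++, illustrative scale/min values, no OpenVINO dependency) shows the residual that zero-point rounding introduces for one Q4_1-style block; presumably this is the accuracy gap the bias path closes for test-backend-ops.

// Standalone comparison of the two dequantization forms for an asymmetric
// Q4_1-style block (illustrative scale/min, not real model data).
// Reference (ggml):   w*d + m
// Zero-point path:    (w - zp)*d, zp = round(-m/d) clamped to u4
// Bias path:          w*d + b,    b = m kept as f16
#include <cmath>
#include <cstdio>

int main() {
    const float scale = 0.013f;  // block scale d
    const float min   = -0.10f;  // block min m

    int zp = (int) std::lround(-min / scale);  // what the zero-point path stores
    if (zp < 0)  zp = 0;
    if (zp > 15) zp = 15;

    float max_err = 0.0f;
    for (int q = 0; q < 16; ++q) {             // every possible 4-bit weight
        float reference = q * scale + min;     // ggml dequantization
        float via_zp    = (q - zp) * scale;    // zero-point reconstruction
        float via_bias  = q * scale + min;     // bias reconstruction (exact by construction)
        max_err = std::fmax(max_err, std::fabs(via_zp - reference));
        (void) via_bias;
    }
    // The residual is |m + zp*d|, i.e. the rounding error of zp spread over the block.
    std::printf("zp = %d, max zero-point error = %g\n", zp, max_err);
    return 0;
}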
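
A related note on the `use_bias && !layout.is_symmetric` check in process_weight_tensor: for a symmetric format such as Q4_0 the ggml dequantization is (q - 8)*d, so its zero point is the constant 8 (and Q8_0 has none at all); a per-block f16 bias would only restate -8*d, which is why the scalar zero-point path is kept even in bias mode. A small sketch with illustrative values:

#include <cmath>
#include <cstdio>

int main() {
    const float d = 0.02f;                      // Q4_0 block scale (illustrative)
    float max_diff = 0.0f;
    for (int q = 0; q < 16; ++q) {
        float zp_form   = (q - 8) * d;          // symmetric zero-point form
        float bias_form = q * d + (-8.0f * d);  // the same value written as w*s + b, b = -8*d
        max_diff = std::fmax(max_diff, std::fabs(zp_form - bias_form));
    }
    std::printf("max difference between the two forms: %g\n", max_diff);  // rounding noise only
    return 0;
}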
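
The u4 zero-point packing kept in the non-bias branches of extract_q4_1_data and extract_q4_k_data stores two 4-bit values per byte: even block indices in the low nibble, odd indices in the high nibble of zp[i/2]. A minimal round-trip sketch of that packing scheme, written sequentially with hypothetical values:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    std::vector<uint8_t> zp_vals = {3, 12, 0, 15, 7};          // per-block zero points (< 16)
    std::vector<uint8_t> packed((zp_vals.size() + 1) / 2, 0);  // two values per byte

    for (std::size_t i = 0; i < zp_vals.size(); ++i) {
        if (i % 2 == 0) {
            packed[i / 2] = zp_vals[i] & 0x0F;                 // lower nibble
        } else {
            packed[i / 2] |= (uint8_t) (zp_vals[i] << 4);      // upper nibble
        }
    }

    for (std::size_t i = 0; i < zp_vals.size(); ++i) {         // round-trip check
        uint8_t v = (i % 2 == 0) ? (packed[i / 2] & 0x0F) : (packed[i / 2] >> 4);
        assert(v == zp_vals[i]);
    }
    std::printf("nibble packing round-trip ok\n");
    return 0;
}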
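
The make_int4_weights / make_int8_weights subgraphs express group-wise dequantization by reshaping the weights to the packed group shape, applying a broadcasted Subtract/Multiply (zero-point path) or Multiply/Add (bias path) against per-group scales and zero points or biases of trailing size 1, and reshaping back to the original shape. The loop below is a plain-C++ rendering of the same arithmetic for the bias path, with illustrative sizes and values:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const std::size_t rows = 2, cols = 64, group_size = 32;
    const std::size_t groups = cols / group_size;

    std::vector<uint8_t> w(rows * cols, 5);              // quantized weights (u4 values)
    std::vector<float>   scale(rows * groups, 0.01f);    // one scale per group
    std::vector<float>   bias(rows * groups, -0.08f);    // one bias per group (Q4_1 "min")
    std::vector<float>   deq(rows * cols);

    for (std::size_t r = 0; r < rows; ++r) {
        for (std::size_t c = 0; c < cols; ++c) {
            std::size_t g = c / group_size;              // group this column falls into
            // bias path: w*s + b; the zero-point path would compute (w - zp)*s instead
            deq[r * cols + c] = w[r * cols + c] * scale[r * groups + g] + bias[r * groups + g];
        }
    }
    std::printf("deq[0] = %f\n", deq[0]);                // 5*0.01 - 0.08 = -0.03
    return 0;
}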
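
On the new GET_ROWS exclusion in ggml-openvino.cpp: the quoted ERR values (3.06e-7 and 1.97e-7 against a 1e-7 limit) look like the normalized mean-squared error that test-backend-ops reports when comparing a backend result with the reference. The sketch below computes that metric for made-up data, only to make the scale of the threshold concrete; it is not the test harness itself.

#include <cstddef>
#include <cstdio>
#include <vector>

// normalized mean-squared error between a backend result and the reference
static double nmse(const std::vector<double> & out, const std::vector<double> & ref) {
    double num = 0.0, den = 0.0;
    for (std::size_t i = 0; i < out.size(); ++i) {
        num += (out[i] - ref[i]) * (out[i] - ref[i]);
        den += ref[i] * ref[i];
    }
    return den > 0.0 ? num / den : 0.0;
}

int main() {
    std::vector<double> ref = {0.10, -0.20, 0.30, -0.40};            // made-up reference
    std::vector<double> out = {0.1000002, -0.2000001, 0.2999999, -0.4000003};
    std::printf("nmse = %.3g (limit quoted above: 1e-7)\n", nmse(out, ref));
    return 0;
}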