From 67c9720e496290411975d05244552fc8c6f11631 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Mon, 29 Dec 2025 15:25:59 +0800
Subject: [PATCH] Optimize symmetric quant weight extraction: use single zp

---
 .../src/ggml-openvino/ggml-openvino-extra.cpp |  32 ++++-
 ggml/src/ggml-openvino/ggml-openvino-extra.h  |   1 +
 ggml/src/ggml-openvino/ggml-quants.cpp        | 133 ++++++++++++++----
 3 files changed, 140 insertions(+), 26 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
index 26cc386dff..2f24d7a1db 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -192,6 +192,7 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
 ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
     ggml_openvino_extracted_layout layout = {};
+    layout.is_symmetric = false;
 
     if (!ggml_is_quantized(tensor->type)) {
         return layout;
     }
@@ -225,10 +226,26 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
         case ExtraQuantType::Q4_0_128:
             layout.is_u4 = true;
             layout.weights_per_block = 128;
+            layout.is_symmetric = true;
+            break;
+        case ExtraQuantType::Q4_0_C:
+            layout.is_u4 = true;
+            layout.weights_per_block = tensor->ne[0];
+            layout.is_symmetric = true;
             break;
         case ExtraQuantType::Q8_0_32:
             layout.is_u4 = false;
             layout.weights_per_block = 32;
+            layout.is_symmetric = true;
+            break;
+        case ExtraQuantType::Q8_0_C:
+            layout.is_u4 = false;
+            layout.weights_per_block = tensor->ne[0];
+            layout.is_symmetric = true;
+            break;
+        case ExtraQuantType::Q8_1_C:
+            layout.is_u4 = false;
+            layout.weights_per_block = tensor->ne[0];
             break;
         default:
             layout.weights_per_block = -1;
@@ -241,7 +258,8 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
         layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
         int64_t n_blocks = n_elements / layout.weights_per_block;
         layout.scales_size = n_blocks * sizeof(uint16_t);
-        layout.biases_size = n_blocks * sizeof(uint16_t);
+        // For symmetric quantization, we only need one bias value (not one per block)
+        layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
 
         layout.weights_offset = 0;
         layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
@@ -256,7 +274,14 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     // Normal extraction (no requant) - determine format based on tensor type
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
+            layout.is_u4 = true;
+            layout.weights_per_block = 32;
+            layout.is_symmetric = true;
+            break;
         case GGML_TYPE_Q4_1:
+            layout.is_u4 = true;
+            layout.weights_per_block = 32;
+            break;
         case GGML_TYPE_Q4_K:
             layout.is_u4 = true;
             layout.weights_per_block = 32;
@@ -264,10 +289,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
         case GGML_TYPE_Q8_0:
             layout.is_u4 = false;
             layout.weights_per_block = 32;
+            layout.is_symmetric = true;
             break;
         case GGML_TYPE_Q6_K:
             layout.is_u4 = false;
             layout.weights_per_block = 16;
+            layout.is_symmetric = true;
             break;
         case GGML_TYPE_Q5_K:
             layout.is_u4 = false;
@@ -285,7 +312,8 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     // Scales and biases: F16 per block
     int64_t n_blocks = n_elements / layout.weights_per_block;
     layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
-    layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
+    // For symmetric quantization, we only need one bias value (not one per block)
+    layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
 
     // Layout in buffer: [weights | scales | biases] with alignment
     layout.weights_offset = 0;
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h
index fbfe459edf..e2c5a8ceea 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.h
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h
@@ -146,6 +146,7 @@ struct ggml_openvino_extracted_layout {
     size_t biases_size;        // Size of biases in bytes
     bool is_u4;                // true for U4 weights, false for U8
     int64_t weights_per_block; // weights per scale/bias block
+    bool is_symmetric;         // true for symmetric quantization
 
     // Requantization info
     bool is_requant;           // true if this tensor needs requantization
diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
index 1a5679cd8d..8946b73a56 100644
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@@ -55,9 +55,18 @@ void extract_q4_0_data(const ggml_tensor * tensor,
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
 
+    bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
+
     ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
         scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
-        biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-8.f * static_cast<float>(scales[0]));
+            }
+        } else {
+            biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
+        }
         unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
     });
 }
@@ -95,10 +104,19 @@ void extract_q8_0_data(const ggml_tensor * tensor,
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
 
+    bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
+
     ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
         uint8_t * block_data = data + i * bytes_per_block;
         scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
-        biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-128.f * static_cast<float>(scales[0]));
+            }
+        } else {
+            biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
+        }
         for (size_t j = 0; j < weights_per_block; ++j) {
             uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
             // Original data is in int8_t, so we add a bias of -128 and invert the first bit.
@@ -190,6 +208,8 @@ void extract_q6_k_data(const ggml_tensor * tensor,
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
 
+    bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
+
     ov::parallel_for(n_super_block, [&](size_t i) {
         uint8_t * block_data = data + i * bytes_per_block;
 
@@ -199,7 +219,14 @@ void extract_q6_k_data(const ggml_tensor * tensor,
 
         for (size_t j = 0; j < 16; j++) {
             scales[j + i * 16] = ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
-            biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
+            // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+            if (is_scalar_bias) {
+                if (i == 0 && j == 0) {
+                    biases[0] = ov::float16(-32.f * static_cast<float>(scales[0]));
+                }
+            } else {
+                biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
+            }
         }
 
         uint8_t * ql = block_data;
@@ -302,15 +329,22 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
 
     // Expand dimensions for scales and biases
    auto scale_shape = scales.get_shape();
+    auto bias_shape = biases.get_shape();
+    bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization
 
     ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
 
     if (packed_shape[1] == 1) {
+        // Requantized channel-wise case
         packed_shape.erase(packed_shape.begin() + 1);
     } else {
         scale_shape.push_back(1);
         scales.set_shape(scale_shape);
-        biases.set_shape(scale_shape);
+        // For symmetric quantization, biases remain scalar (don't resize)
+        if (!is_scalar_bias) {
+            bias_shape = scale_shape;
+            biases.set_shape(bias_shape);
+        }
     }
 
     // Create graph nodes
@@ -318,15 +352,23 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
                                                                 static_cast<uint8_t *>(weight.data()), nullptr);
     weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
-    ov::Tensor biases_u8(ov::element::u8, scale_shape);
+    ov::Tensor biases_u8(ov::element::u8, is_scalar_bias ? ov::Shape{} : scale_shape);
 
     // Calculate zero point
     const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
     const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
     uint8_t * bias_u8_data = biases_u8.data<uint8_t>();
-    for (size_t i = 0; i < biases_u8.get_size(); ++i) {
-        bias_u8_data[i] =
-            (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
+
+    if (is_scalar_bias) {
+        // Symmetric quantization: single bias value for all blocks
+        // For Q8_0, bias = -128 * scale, so zero_point = 128
+        bias_u8_data[0] = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[0]) / static_cast<float>(scale_data[0]));
+    } else {
+        // Asymmetric quantization: per-block biases
+        for (size_t i = 0; i < biases_u8.get_size(); ++i) {
+            bias_u8_data[i] =
+                (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
+        }
     }
 
     auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
@@ -361,17 +403,23 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
 
     // Expand dimensions for scales and biases
     ov::Shape scale_bias_shape = scales.get_shape();
+    auto bias_shape = biases.get_shape();
+    bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization
 
     // Create INT4 weight tensor
     ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
 
-    // Requantized channel-wise case
     if (packed_shape[1] == 1) {
+        // Requantized channel-wise case
         packed_shape.erase(packed_shape.begin() + 1);
     } else {
         scale_bias_shape.push_back(1);
         scales.set_shape(scale_bias_shape);
-        biases.set_shape(scale_bias_shape);
+        // For symmetric quantization, biases remain scalar (don't resize)
+        if (!is_scalar_bias) {
+            bias_shape = scale_bias_shape;
+            biases.set_shape(bias_shape);
+        }
     }
 
     auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
@@ -382,14 +430,23 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
     // Pack zero points: two subsequent values into one
     const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
     const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape);
+    ov::Tensor zero_point_tensor(ov::element::u4, is_scalar_bias ? ov::Shape{} : scale_bias_shape);
     uint8_t * zero_point_data = static_cast<uint8_t *>(zero_point_tensor.data());
-    for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
-        uint8_t bias1 =
-            (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
-        uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
-                                             static_cast<float>(scale_data[i * 2 + 1]));
-        zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
+
+    if (is_scalar_bias) {
+        // Symmetric quantization: single bias value for all blocks
+        // For Q4_0, bias = -8 * scale, so zero_point = 8
+        uint8_t zp = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[0]) / static_cast<float>(scale_data[0]));
+        zero_point_data[0] = (zp << 4) | (zp & 0x0F);
+    } else {
+        // Asymmetric quantization: per-block biases
+        for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
+            uint8_t bias1 =
+                (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
+            uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
+                                                 static_cast<float>(scale_data[i * 2 + 1]));
+            zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
+        }
     }
 
     auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zero_point_tensor);
@@ -602,17 +659,19 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
         // Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
         ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
         ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
+        // For symmetric quantization, biases are a single value instead of per-block
+        ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
 
         ov::Tensor weights, scales, biases;
         if (output_base_ptr) {
             uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
             weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
             scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-            biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
+            biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset);
         } else {
             weights = ov::Tensor(weight_type, node_shape);
             scales = ov::Tensor(ov::element::f16, scale_shape);
-            biases = ov::Tensor(ov::element::f16, scale_shape);
+            biases = ov::Tensor(ov::element::f16, bias_shape);
         }
 
         result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
@@ -622,17 +681,19 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
         // Normal extraction path (no requant)
         ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
         ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
+        // For symmetric quantization, biases are a single value instead of per-block
+        ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
 
         ov::Tensor weights, scales, biases;
         if (output_base_ptr) {
             uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
             weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
             scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-            biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
+            biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset);
         } else {
             weights = ov::Tensor(weight_type, node_shape);
             scales = ov::Tensor(ov::element::f16, scale_shape);
-            biases = ov::Tensor(ov::element::f16, scale_shape);
+            biases = ov::Tensor(ov::element::f16, bias_shape);
         }
 
         result = extract_quantized_weights(tensor, data, weights, scales, biases);
@@ -653,6 +714,8 @@ void quantize_q4_0(const float * x,
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
+
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
         float max = 0.0f;
@@ -669,7 +732,13 @@ void quantize_q4_0(const float * x,
 
         if (d == 0) {
             scales[i] = ov::float16(1.0f);
-            biases[i] = ov::float16(-8.0f);
+            if (is_scalar_bias) {
+                if (i == 0) {
+                    biases[0] = ov::float16(-8.0f);
+                }
+            } else {
+                biases[i] = ov::float16(-8.0f);
+            }
             uint8_t zp = 8;
             memset(weights + i * qk / 2, zp | (zp << 4), qk / 2);
             continue;
@@ -677,7 +746,14 @@ void quantize_q4_0(const float * x,
 
         const float id = 1.0f / d;
         scales[i] = ov::float16(d);
-        biases[i] = ov::float16(-8.f * d);
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-8.f * d);
+            }
+        } else {
+            biases[i] = ov::float16(-8.f * d);
+        }
 
         for (int j = 0; j < qk / 2; ++j) {
             const float x0 = x[i * qk + 2 * j] * id;
@@ -701,6 +777,8 @@ void quantize_q8_0(const float * x,
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
+
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
@@ -714,7 +792,14 @@ void quantize_q8_0(const float * x,
         const float d = amax / 127.0f;
         const float id = d ? 1.0f / d : 0.0f;
         scales[i] = ov::float16(d);
-        biases[i] = ov::float16(-128.0f * d);
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-128.0f * d);
+            }
+        } else {
+            biases[i] = ov::float16(-128.0f * d);
+        }
 
         for (int j = 0; j < qk; ++j) {
             const float x0 = x[i * qk + j] * id;
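
For reference (not part of the patch): a minimal, self-contained C++ sketch of the property the single zero point relies on. In the symmetric formats touched above, every block's bias is a fixed multiple of its scale (-8 * scale for 4-bit, -128 * scale for Q8_0, -32 * scale for Q6_K), so -bias / scale evaluates to the same integer zero point for every block and one scalar can replace the per-block array. The block count, scale values, and file name below are made up purely for illustration.

// zp_equivalence_sketch.cpp -- illustrative only, not part of the ggml-openvino sources.
#include <cassert>
#include <cmath>
#include <vector>

int main() {
    // Arbitrary per-block scales for a fictional Q4_0-style tensor (4 blocks of 32 weights).
    std::vector<float> scales = {0.02f, 0.5f, 1.25f, 3.0f};

    // Q4_0 stores unsigned 4-bit codes q in [0, 15]; dequantization is w = (q - zp) * scale.
    // The per-block bias is always -8 * scale, so zp = -bias / scale == 8 for every block.
    for (size_t b = 0; b < scales.size(); ++b) {
        float bias         = -8.f * scales[b];              // what the old layout stored per block
        float per_block_zp = std::round(-bias / scales[b]); // zero point recomputed per block
        float shared_zp    = 8.f;                           // the single value the patch stores once
        assert(per_block_zp == shared_zp);

        // Dequantizing with the shared zero point gives identical weights.
        for (int q = 0; q < 16; ++q) {
            float w_per_block = (static_cast<float>(q) - per_block_zp) * scales[b];
            float w_shared    = (static_cast<float>(q) - shared_zp) * scales[b];
            assert(w_per_block == w_shared);
        }
    }
    // The same argument gives zp = 128 for Q8_0 (bias = -128 * scale) and zp = 32 for Q6_K.
    return 0;
}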