Optimize symmetric quant weight extraction: use single zp

For symmetric quantization (Q4_0, Q8_0, Q6_K and the symmetric requant types), every block shares the same zero point, so the extracted biases buffer now holds a single F16 value instead of one per block, and the zero-point constant in the OpenVINO graph becomes a scalar that broadcasts over all blocks.
Authored by Yu, Zijun on 2025-12-29 15:25:59 +08:00, committed by Mustafa Cavus
parent c1142ddb7c
commit 67c9720e49
3 changed files with 140 additions and 26 deletions

View File

@@ -192,6 +192,7 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
ggml_openvino_extracted_layout layout = {};
layout.is_symmetric = false;
if (!ggml_is_quantized(tensor->type)) {
return layout;
@@ -225,10 +226,26 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
case ExtraQuantType::Q4_0_128:
layout.is_u4 = true;
layout.weights_per_block = 128;
layout.is_symmetric = true;
break;
case ExtraQuantType::Q4_0_C:
layout.is_u4 = true;
layout.weights_per_block = tensor->ne[0];
layout.is_symmetric = true;
break;
case ExtraQuantType::Q8_0_32:
layout.is_u4 = false;
layout.weights_per_block = 32;
layout.is_symmetric = true;
break;
case ExtraQuantType::Q8_0_C:
layout.is_u4 = false;
layout.weights_per_block = tensor->ne[0];
layout.is_symmetric = true;
break;
case ExtraQuantType::Q8_1_C:
layout.is_u4 = false;
layout.weights_per_block = tensor->ne[0];
break;
default:
layout.weights_per_block = -1;
@@ -241,7 +258,8 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
layout.biases_size = n_blocks * sizeof(uint16_t);
// For symmetric quantization, we only need one bias value (not one per block)
layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
@@ -256,7 +274,14 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
// Normal extraction (no requant) - determine format based on tensor type
switch (tensor->type) {
case GGML_TYPE_Q4_0:
layout.is_u4 = true;
layout.weights_per_block = 32;
layout.is_symmetric = true;
break;
case GGML_TYPE_Q4_1:
layout.is_u4 = true;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q4_K:
layout.is_u4 = true;
layout.weights_per_block = 32;
@@ -264,10 +289,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
case GGML_TYPE_Q8_0:
layout.is_u4 = false;
layout.weights_per_block = 32;
layout.is_symmetric = true;
break;
case GGML_TYPE_Q6_K:
layout.is_u4 = false;
layout.weights_per_block = 16;
layout.is_symmetric = true;
break;
case GGML_TYPE_Q5_K:
layout.is_u4 = false;
@@ -285,7 +312,8 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
// Scales and biases: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// For symmetric quantization, we only need one bias value (not one per block)
layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
// Layout in buffer: [weights | scales | biases] with alignment
layout.weights_offset = 0;
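To put the size change in perspective, here is a back-of-the-envelope calculation for a hypothetical 4096 x 4096 Q4_0 weight (the tensor shape and the variable names below are made up for illustration, not taken from this commit):

#include <cstddef>
#include <cstdint>
// Illustrative sizes for a hypothetical 4096 x 4096 Q4_0 tensor, 32 weights per block.
const int64_t n_elements        = 4096LL * 4096;                   // 16,777,216 weights
const int64_t weights_per_block = 32;
const int64_t n_blocks          = n_elements / weights_per_block;  // 524,288 blocks
const size_t  weights_size      = n_elements / 2;                  // u4 packing: 8 MiB
const size_t  scales_size       = n_blocks * sizeof(uint16_t);     // 1 MiB of F16 scales
const size_t  biases_before     = n_blocks * sizeof(uint16_t);     // 1 MiB of F16 biases previously
const size_t  biases_after      = sizeof(uint16_t);                // 2 bytes with the scalar bias
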

View File

@@ -146,6 +146,7 @@ struct ggml_openvino_extracted_layout {
size_t biases_size; // Size of biases in bytes
bool is_u4; // true for U4 weights, false for U8
int64_t weights_per_block;// weights per scale/bias block
bool is_symmetric; // true for symmetric quantization
// Requantization info
bool is_requant; // true if this tensor needs requantization

View File

@@ -55,9 +55,18 @@ void extract_q4_0_data(const ggml_tensor * tensor,
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
// For symmetric quantization, write only the first bias: every block's bias is the same fixed multiple of its scale, so a single entry determines the shared zero point
if (is_scalar_bias) {
if (i == 0) {
biases[0] = ov::float16(-8.f * static_cast<float>(scales[0]));
}
} else {
biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
}
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
});
}
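The reason a single bias entry suffices: for symmetric Q4_0 every block's bias is tied to its scale by bias = -8 * scale, so the zero point implied by any block is the same. A tiny self-check with made-up scales (all values below are illustrative):

#include <cmath>
// Two blocks with different (made-up) scales still imply the same zero point.
const float scale_a = 0.10f, scale_b = 0.02f;
const float bias_a  = -8.f * scale_a;                       // -0.8
const float bias_b  = -8.f * scale_b;                       // -0.16
const int   zp_a    = (int) std::round(-bias_a / scale_a);  // 8
const int   zp_b    = (int) std::round(-bias_b / scale_b);  // 8
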
@@ -95,10 +104,19 @@ void extract_q8_0_data(const ggml_tensor * tensor,
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
// For symmetric quantization, write only the first bias: every block's bias is the same fixed multiple of its scale, so a single entry determines the shared zero point
if (is_scalar_bias) {
if (i == 0) {
biases[0] = ov::float16(-128.f * static_cast<float>(scales[0]));
}
} else {
biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
}
for (size_t j = 0; j < weights_per_block; ++j) {
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
// Original data is in int8_t, so we add a bias of -128 and invert the first bit.
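The conversion described in the comment above (re-centering int8 codes onto an unsigned range with zero point 128) amounts to flipping the sign bit of the two's-complement byte. The actual conversion statement falls outside this hunk, so this is only an illustration:

#include <cstdint>
// int8 code -> u8 code with zero point 128: flipping the top bit equals adding 128.
const int8_t  q = -5;
const uint8_t u = static_cast<uint8_t>(q) ^ 0x80;   // 0x7B == 123 == -5 + 128
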
@@ -190,6 +208,8 @@ void extract_q6_k_data(const ggml_tensor * tensor,
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
@@ -199,7 +219,14 @@ void extract_q6_k_data(const ggml_tensor * tensor,
for (size_t j = 0; j < 16; j++) {
scales[j + i * 16] =
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
// For symmetric quantization, write only the first bias: every block's bias is the same fixed multiple of its scale, so a single entry determines the shared zero point
if (is_scalar_bias) {
if (i == 0 && j == 0) {
biases[0] = ov::float16(-32.f * static_cast<float>(scales[0]));
}
} else {
biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
}
}
uint8_t * ql = block_data;
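Q6_K follows the same pattern with 6-bit codes (0..63) centered on 32: whatever a sub-block's scale works out to, the implied zero point is always 32, which is why one entry can stand in for all 16 * n_super_block biases. With made-up numbers:

#include <cmath>
// Made-up super-block scale and sub-block factor, mirroring the loop above.
const float d    = 0.0125f;                          // super-block scale (illustrative)
const float s    = d * 7.f;                          // one sub-block scale
const float bias = -32.f * s;
const int   zp   = (int) std::round(-bias / s);      // 32, independent of d and s
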
@@ -302,15 +329,22 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
// Expand dimensions for scales and biases
auto scale_shape = scales.get_shape();
auto bias_shape = biases.get_shape();
bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization
ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
if (packed_shape[1] == 1) {
// Requantized channel-wise case
packed_shape.erase(packed_shape.begin() + 1);
} else {
scale_shape.push_back(1);
scales.set_shape(scale_shape);
biases.set_shape(scale_shape);
// For symmetric quantization, biases remain scalar (don't resize)
if (!is_scalar_bias) {
bias_shape = scale_shape;
biases.set_shape(bias_shape);
}
}
// Create graph nodes
@@ -318,15 +352,23 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
ov::Tensor biases_u8(ov::element::u8, scale_shape);
ov::Tensor biases_u8(ov::element::u8, is_scalar_bias ? ov::Shape{} : scale_shape);
// Calculate zero point
const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
uint8_t * bias_u8_data = biases_u8.data<uint8_t>();
for (size_t i = 0; i < biases_u8.get_size(); ++i) {
bias_u8_data[i] =
(uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
if (is_scalar_bias) {
// Symmetric quantization: single bias value for all blocks
// For Q8_0, bias = -128 * scale, so zero_point = 128
bias_u8_data[0] = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[0]) / static_cast<float>(scale_data[0]));
} else {
// Asymmetric quantization: per-block biases
for (size_t i = 0; i < biases_u8.get_size(); ++i) {
bias_u8_data[i] =
(uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
}
}
auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
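Downstream, these constants feed the usual OpenVINO weight-decompression pattern, where NumPy-style broadcasting lets a scalar zero point apply to every block without any other change. A minimal sketch of that pattern, reusing the names from this function; the exact subgraph make_int8_weights builds is not shown in this hunk, so treat this as an assumed illustration:

#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/subtract.hpp>
// weights_node (u8), zero_point (u8, scalar when symmetric) and scales_f16 (f16) are the constants above.
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
auto zp_f16      = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
auto centered    = std::make_shared<ov::op::v1::Subtract>(weights_f16, zp_f16);   // scalar zp broadcasts
auto dequant     = std::make_shared<ov::op::v1::Multiply>(centered, scales_f16);
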
@@ -361,17 +403,23 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
// Expand dimensions for scales and biases
ov::Shape scale_bias_shape = scales.get_shape();
auto bias_shape = biases.get_shape();
bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization
// Create INT4 weight tensor
ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
// Requantized channel-wise case
if (packed_shape[1] == 1) {
// Requantized channel-wise case
packed_shape.erase(packed_shape.begin() + 1);
} else {
scale_bias_shape.push_back(1);
scales.set_shape(scale_bias_shape);
biases.set_shape(scale_bias_shape);
// For symmetric quantization, biases remain scalar (don't resize)
if (!is_scalar_bias) {
bias_shape = scale_bias_shape;
biases.set_shape(bias_shape);
}
}
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
@@ -382,14 +430,23 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
// Pack zero points: two subsequent values into one
const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape);
ov::Tensor zero_point_tensor(ov::element::u4, is_scalar_bias ? ov::Shape{} : scale_bias_shape);
uint8_t * zero_point_data = static_cast<uint8_t *>(zero_point_tensor.data());
for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
uint8_t bias1 =
(uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
static_cast<float>(scale_data[i * 2 + 1]));
zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
if (is_scalar_bias) {
// Symmetric quantization: single bias value for all blocks
// For Q4_0, bias = -8 * scale, so zero_point = 8
uint8_t zp = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[0]) / static_cast<float>(scale_data[0]));
zero_point_data[0] = (zp << 4) | (zp & 0x0F);
} else {
// Asymmetric quantization: per-block biases
for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
uint8_t bias1 =
(uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
static_cast<float>(scale_data[i * 2 + 1]));
zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
}
}
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zero_point_tensor);
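For u4 weights the zero points are packed two per byte; with a symmetric zero point of 8 both nibbles carry the same value, so the single packed byte is 0x88. A quick check of the packing expression used above:

#include <cstdint>
const uint8_t zp     = 8;                                    // symmetric Q4_0 zero point
const uint8_t packed = (uint8_t) ((zp << 4) | (zp & 0x0F));  // 0x88
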
@@ -602,17 +659,19 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
// Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
// For symmetric quantization, biases are a single value instead of per-block
ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
ov::Tensor weights, scales, biases;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset);
} else {
weights = ov::Tensor(weight_type, node_shape);
scales = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, bias_shape);
}
result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
@@ -622,17 +681,19 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
// Normal extraction path (no requant)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
// For symmetric quantization, biases are a single value instead of per-block
ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
ov::Tensor weights, scales, biases;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset);
} else {
weights = ov::Tensor(weight_type, node_shape);
scales = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, bias_shape);
}
result = extract_quantized_weights(tensor, data, weights, scales, biases);
@@ -653,6 +714,8 @@ void quantize_q4_0(const float * x,
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;
@@ -669,7 +732,13 @@ void quantize_q4_0(const float * x,
if (d == 0) {
scales[i] = ov::float16(1.0f);
biases[i] = ov::float16(-8.0f);
if (is_scalar_bias) {
if (i == 0) {
biases[0] = ov::float16(-8.0f);
}
} else {
biases[i] = ov::float16(-8.0f);
}
uint8_t zp = 8;
memset(weights + i * qk / 2, zp | (zp << 4), qk / 2);
continue;
@@ -677,7 +746,14 @@ void quantize_q4_0(const float * x,
const float id = 1.0f / d;
scales[i] = ov::float16(d);
biases[i] = ov::float16(-8.f * d);
// For symmetric quantization, write only the first bias: every block's bias is the same fixed multiple of its scale, so a single entry determines the shared zero point
if (is_scalar_bias) {
if (i == 0) {
biases[0] = ov::float16(-8.f * d);
}
} else {
biases[i] = ov::float16(-8.f * d);
}
for (int j = 0; j < qk / 2; ++j) {
const float x0 = x[i * qk + 2 * j] * id;
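A quick sanity check of the d == 0 fallback above: the block is stored with scale 1, bias -8 and every 4-bit code set to 8, so dequantization reproduces the all-zero block that triggered the fallback.

// d == 0 fallback: scale = 1, zero point = 8, every code = 8.
const float scale = 1.0f;
const int   q = 8, zp = 8;
const float w = scale * (float) (q - zp);   // 0.0f, the original all-zero block
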
@@ -701,6 +777,8 @@ void quantize_q8_0(const float * x,
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
@@ -714,7 +792,14 @@ void quantize_q8_0(const float * x,
const float d = amax / 127.0f;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
biases[i] = ov::float16(-128.0f * d);
// For symmetric quantization, write only the first bias: every block's bias is the same fixed multiple of its scale, so a single entry determines the shared zero point
if (is_scalar_bias) {
if (i == 0) {
biases[0] = ov::float16(-128.0f * d);
}
} else {
biases[i] = ov::float16(-128.0f * d);
}
for (int j = 0; j < qk; ++j) {
const float x0 = x[i * qk + j] * id;
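The remainder of the loop is cut off here, but following the values set above, the u8 code stored for a value x would be round(x / d) + 128, and the single zero point of 128 undoes it on decode. A hypothetical round trip, ignoring clamping and the exact rounding used in the real loop:

#include <cmath>
const float x  = 0.73f;
const float d  = 0.01f;                           // made-up block scale
const int   q  = (int) std::round(x / d) + 128;   // assumed u8 encoding with zero point 128
const float x2 = d * (float) (q - 128);           // 0.73f, up to quantization error
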