From 67c9720e496290411975d05244552fc8c6f11631 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Mon, 29 Dec 2025 15:25:59 +0800
Subject: [PATCH] Optimize symmetric quant weight extraction: use single zp

---
 .../src/ggml-openvino/ggml-openvino-extra.cpp |  32 ++++-
 ggml/src/ggml-openvino/ggml-openvino-extra.h  |   1 +
 ggml/src/ggml-openvino/ggml-quants.cpp        | 133 ++++++++++++++----
 3 files changed, 140 insertions(+), 26 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
index 26cc386dff..2f24d7a1db 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -192,6 +192,7 @@ std::optional<ExtraQuantType> ggml_openvino_get_requant_type(const ggml_tensor *
 ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
     ggml_openvino_extracted_layout layout = {};
+    layout.is_symmetric = false;
 
     if (!ggml_is_quantized(tensor->type)) {
         return layout;
     }
@@ -225,10 +226,26 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
         case ExtraQuantType::Q4_0_128:
             layout.is_u4 = true;
             layout.weights_per_block = 128;
+            layout.is_symmetric = true;
+            break;
+        case ExtraQuantType::Q4_0_C:
+            layout.is_u4 = true;
+            layout.weights_per_block = tensor->ne[0];
+            layout.is_symmetric = true;
             break;
         case ExtraQuantType::Q8_0_32:
             layout.is_u4 = false;
             layout.weights_per_block = 32;
+            layout.is_symmetric = true;
+            break;
+        case ExtraQuantType::Q8_0_C:
+            layout.is_u4 = false;
+            layout.weights_per_block = tensor->ne[0];
+            layout.is_symmetric = true;
+            break;
+        case ExtraQuantType::Q8_1_C:
+            layout.is_u4 = false;
+            layout.weights_per_block = tensor->ne[0];
             break;
         default:
             layout.weights_per_block = -1;
@@ -241,7 +258,8 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
         layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
         int64_t n_blocks = n_elements / layout.weights_per_block;
         layout.scales_size = n_blocks * sizeof(uint16_t);
-        layout.biases_size = n_blocks * sizeof(uint16_t);
+        // For symmetric quantization, we only need one bias value (not one per block)
+        layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
 
         layout.weights_offset = 0;
         layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
@@ -256,7 +274,14 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     // Normal extraction (no requant) - determine format based on tensor type
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
+            layout.is_u4 = true;
+            layout.weights_per_block = 32;
+            layout.is_symmetric = true;
+            break;
         case GGML_TYPE_Q4_1:
+            layout.is_u4 = true;
+            layout.weights_per_block = 32;
+            break;
         case GGML_TYPE_Q4_K:
             layout.is_u4 = true;
             layout.weights_per_block = 32;
@@ -264,10 +289,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
         case GGML_TYPE_Q8_0:
             layout.is_u4 = false;
             layout.weights_per_block = 32;
+            layout.is_symmetric = true;
             break;
         case GGML_TYPE_Q6_K:
             layout.is_u4 = false;
             layout.weights_per_block = 16;
+            layout.is_symmetric = true;
             break;
         case GGML_TYPE_Q5_K:
             layout.is_u4 = false;
@@ -285,7 +312,8 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     // Scales and biases: F16 per block
     int64_t n_blocks = n_elements / layout.weights_per_block;
     layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
-    layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
+    // For symmetric quantization, we only need one bias value (not one per block)
+    layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
 
     // Layout in buffer: [weights | scales | biases] with alignment
     layout.weights_offset = 0;
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h
index fbfe459edf..e2c5a8ceea 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.h
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h
@@ -146,6 +146,7 @@ struct ggml_openvino_extracted_layout {
     size_t biases_size;        // Size of biases in bytes
     bool is_u4;                // true for U4 weights, false for U8
     int64_t weights_per_block; // weights per scale/bias block
+    bool is_symmetric;         // true for symmetric quantization
 
     // Requantization info
     bool is_requant;           // true if this tensor needs requantization
diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
index 1a5679cd8d..8946b73a56 100644
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@@ -55,9 +55,18 @@ void extract_q4_0_data(const ggml_tensor * tensor,
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
 
+    bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
+
     ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
         scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
-        biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-8.f * static_cast<float>(scales[0]));
+            }
+        } else {
+            biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
+        }
         unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
     });
 }
@@ -95,10 +104,19 @@ void extract_q8_0_data(const ggml_tensor * tensor,
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
 
+    bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
+
     ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
         uint8_t * block_data = data + i * bytes_per_block;
         scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
-        biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-128.f * static_cast<float>(scales[0]));
+            }
+        } else {
+            biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
+        }
         for (size_t j = 0; j < weights_per_block; ++j) {
             uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.
             // Original data is in int8_t, so we add a bias of -128 and invert the first bit.
@@ -190,6 +208,8 @@ void extract_q6_k_data(const ggml_tensor * tensor,
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
 
+    bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
+
     ov::parallel_for(n_super_block, [&](size_t i) {
         uint8_t * block_data = data + i * bytes_per_block;
 
@@ -199,7 +219,14 @@ void extract_q6_k_data(const ggml_tensor * tensor,
 
         for (size_t j = 0; j < 16; j++) {
             scales[j + i * 16] = ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
-            biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
+            // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+            if (is_scalar_bias) {
+                if (i == 0 && j == 0) {
+                    biases[0] = ov::float16(-32.f * static_cast<float>(scales[0]));
+                }
+            } else {
+                biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
+            }
         }
 
         uint8_t * ql = block_data;
@@ -302,15 +329,22 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
 
     // Expand dimensions for scales and biases
    auto scale_shape = scales.get_shape();
+    auto bias_shape = biases.get_shape();
+    bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization
 
     ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
 
     if (packed_shape[1] == 1) {
+        // Requantized channel-wise case
         packed_shape.erase(packed_shape.begin() + 1);
     } else {
         scale_shape.push_back(1);
         scales.set_shape(scale_shape);
-        biases.set_shape(scale_shape);
+        // For symmetric quantization, biases remain scalar (don't resize)
+        if (!is_scalar_bias) {
+            bias_shape = scale_shape;
+            biases.set_shape(bias_shape);
+        }
     }
 
     // Create graph nodes
@@ -318,15 +352,23 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
                                                                 static_cast<uint8_t *>(weight.data()), nullptr);
     weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
-    ov::Tensor biases_u8(ov::element::u8, scale_shape);
+    ov::Tensor biases_u8(ov::element::u8, is_scalar_bias ? ov::Shape{} : scale_shape);
 
     // Calculate zero point
     const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
     const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
     uint8_t * bias_u8_data = biases_u8.data<uint8_t>();
-    for (size_t i = 0; i < biases_u8.get_size(); ++i) {
-        bias_u8_data[i] =
-            (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
+
+    if (is_scalar_bias) {
+        // Symmetric quantization: single bias value for all blocks
+        // For Q8_0, bias = -128 * scale, so zero_point = 128
+        bias_u8_data[0] = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[0]) / static_cast<float>(scale_data[0]));
+    } else {
+        // Asymmetric quantization: per-block biases
+        for (size_t i = 0; i < biases_u8.get_size(); ++i) {
+            bias_u8_data[i] =
+                (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
+        }
     }
 
     auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
@@ -361,17 +403,23 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
 
     // Expand dimensions for scales and biases
     ov::Shape scale_bias_shape = scales.get_shape();
+    auto bias_shape = biases.get_shape();
+    bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization
 
     // Create INT4 weight tensor
     ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
 
-    // Requantized channel-wise case
     if (packed_shape[1] == 1) {
+        // Requantized channel-wise case
         packed_shape.erase(packed_shape.begin() + 1);
     } else {
         scale_bias_shape.push_back(1);
         scales.set_shape(scale_bias_shape);
-        biases.set_shape(scale_bias_shape);
+        // For symmetric quantization, biases remain scalar (don't resize)
+        if (!is_scalar_bias) {
+            bias_shape = scale_bias_shape;
+            biases.set_shape(bias_shape);
+        }
     }
 
     auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape,
@@ -382,14 +430,23 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
     // Pack zero points: two subsequent values into one
     const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
     const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
-    ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape);
+    ov::Tensor zero_point_tensor(ov::element::u4, is_scalar_bias ? ov::Shape{} : scale_bias_shape);
     uint8_t * zero_point_data = static_cast<uint8_t *>(zero_point_tensor.data());
-    for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
-        uint8_t bias1 =
-            (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
-        uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
-                                             static_cast<float>(scale_data[i * 2 + 1]));
-        zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
+
+    if (is_scalar_bias) {
+        // Symmetric quantization: single bias value for all blocks
+        // For Q4_0, bias = -8 * scale, so zero_point = 8
+        uint8_t zp = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[0]) / static_cast<float>(scale_data[0]));
+        zero_point_data[0] = (zp << 4) | (zp & 0x0F);
+    } else {
+        // Asymmetric quantization: per-block biases
+        for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
+            uint8_t bias1 =
+                (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
+            uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
+                                                 static_cast<float>(scale_data[i * 2 + 1]));
+            zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
+        }
     }
 
     auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zero_point_tensor);
@@ -602,17 +659,19 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
         // Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
         ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
         ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
+        // For symmetric quantization, biases are a single value instead of per-block
+        ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
 
         ov::Tensor weights, scales, biases;
         if (output_base_ptr) {
             uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
             weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
             scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-            biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
+            biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset);
         } else {
             weights = ov::Tensor(weight_type, node_shape);
             scales = ov::Tensor(ov::element::f16, scale_shape);
-            biases = ov::Tensor(ov::element::f16, scale_shape);
+            biases = ov::Tensor(ov::element::f16, bias_shape);
         }
 
         result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
@@ -622,17 +681,19 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
         // Normal extraction path (no requant)
         ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
         ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
+        // For symmetric quantization, biases are a single value instead of per-block
+        ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
 
         ov::Tensor weights, scales, biases;
         if (output_base_ptr) {
             uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
             weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
             scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-            biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
+            biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset);
         } else {
             weights = ov::Tensor(weight_type, node_shape);
             scales = ov::Tensor(ov::element::f16, scale_shape);
-            biases = ov::Tensor(ov::element::f16, scale_shape);
+            biases = ov::Tensor(ov::element::f16, bias_shape);
         }
 
         result = extract_quantized_weights(tensor, data, weights, scales, biases);
@@ -653,6 +714,8 @@ void quantize_q4_0(const float * x,
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
+
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
         float max = 0.0f;
@@ -669,7 +732,13 @@ void quantize_q4_0(const float * x,
 
         if (d == 0) {
             scales[i] = ov::float16(1.0f);
-            biases[i] = ov::float16(-8.0f);
+            if (is_scalar_bias) {
+                if (i == 0) {
+                    biases[0] = ov::float16(-8.0f);
+                }
+            } else {
+                biases[i] = ov::float16(-8.0f);
+            }
             uint8_t zp = 8;
             memset(weights + i * qk / 2, zp | (zp << 4), qk / 2);
             continue;
@@ -677,7 +746,14 @@ void quantize_q4_0(const float * x,
 
         const float id = 1.0f / d;
         scales[i] = ov::float16(d);
-        biases[i] = ov::float16(-8.f * d);
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-8.f * d);
+            }
+        } else {
+            biases[i] = ov::float16(-8.f * d);
+        }
 
         for (int j = 0; j < qk / 2; ++j) {
             const float x0 = x[i * qk + 2 * j] * id;
@@ -701,6 +777,8 @@ void quantize_q8_0(const float * x,
     auto * weights = static_cast<uint8_t *>(weights_arr.data());
     auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
     auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
+
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
@@ -714,7 +792,14 @@ void quantize_q8_0(const float * x,
         const float d = amax / 127.0f;
         const float id = d ? 1.0f / d : 0.0f;
         scales[i] = ov::float16(d);
-        biases[i] = ov::float16(-128.0f * d);
+        // For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
+        if (is_scalar_bias) {
+            if (i == 0) {
+                biases[0] = ov::float16(-128.0f * d);
+            }
+        } else {
+            biases[i] = ov::float16(-128.0f * d);
+        }
 
         for (int j = 0; j < qk; ++j) {
             const float x0 = x[i * qk + j] * id;
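
For reference (not part of the patch): a minimal, self-contained C++ sketch of the property the single zero point relies on. In the symmetric formats touched above, every block's bias is a fixed multiple of its scale (-8 * scale for 4-bit, -128 * scale for Q8_0, -32 * scale for Q6_K), so -bias / scale evaluates to the same integer zero point for every block and one scalar can replace the per-block array. The block count, scale values, and file name below are made up purely for illustration.

// zp_equivalence_sketch.cpp -- illustrative only, not part of the ggml-openvino sources.
#include <cassert>
#include <cmath>
#include <vector>

int main() {
    // Arbitrary per-block scales for a fictional Q4_0-style tensor (4 blocks of 32 weights).
    std::vector<float> scales = {0.02f, 0.5f, 1.25f, 3.0f};

    // Q4_0 stores unsigned 4-bit codes q in [0, 15]; dequantization is w = (q - zp) * scale.
    // The per-block bias is always -8 * scale, so zp = -bias / scale == 8 for every block.
    for (size_t b = 0; b < scales.size(); ++b) {
        float bias         = -8.f * scales[b];              // what the old layout stored per block
        float per_block_zp = std::round(-bias / scales[b]); // zero point recomputed per block
        float shared_zp    = 8.f;                           // the single value the patch stores once
        assert(per_block_zp == shared_zp);

        // Dequantizing with the shared zero point gives identical weights.
        for (int q = 0; q < 16; ++q) {
            float w_per_block = (static_cast<float>(q) - per_block_zp) * scales[b];
            float w_shared    = (static_cast<float>(q) - shared_zp) * scales[b];
            assert(w_per_block == w_shared);
        }
    }
    // The same argument gives zp = 128 for Q8_0 (bias = -128 * scale) and zp = 32 for Q6_K.
    return 0;
}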