From f3afa7b91466fc288488744278f81dd642525716 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Fri, 26 Sep 2025 15:50:32 +0800
Subject: [PATCH] Requantize Q6_K (gs16) to gs32 on GPU

---
 ggml/src/ggml-openvino/ggml-quants.cpp | 43 +++++++++++++++++++++++---
 ggml/src/ggml-openvino/ggml-quants.hpp |  4 ++-
 ggml/src/ggml-openvino/utils.cpp       |  4 +--
 3 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
index 9b8bfff072..1538a8207c 100644
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@@ -425,6 +425,8 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
     int64_t block_size = node_shape[1];
     if (requant_type == ExtraQuantType::Q4_0_128) {
         block_size = 128;
+    } else if (requant_type == ExtraQuantType::Q8_0_32) {
+        block_size = 32;
     }
 
     auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
@@ -432,7 +434,7 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
     ov::Tensor scales(ov::element::f16, scales_shape);
     ov::Tensor bias(ov::element::f16, scales_shape);
 
-    if (requant_type == ExtraQuantType::Q4_0_C) {
+    if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) {
         weights = ov::Tensor(ov::element::u4, node_shape);
         quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
         weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
@@ -440,10 +442,10 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
         weights = ov::Tensor(ov::element::u8, node_shape);
         quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
         weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
-    } else if (requant_type == ExtraQuantType::Q4_0_128) {
-        weights = ov::Tensor(ov::element::u4, node_shape);
-        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
-        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) {
+        weights = ov::Tensor(ov::element::u8, node_shape);
+        quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
     }
 
     weight_node->set_friendly_name(tensor->name);
@@ -485,6 +487,37 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
     }
 }
 
+void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+            }
+        }
+
+        const float d = amax / 127.0f;
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(-128.0f * d);
+
+        for (int j = 0; j < qk; ++j) {
+            const float x0 = x[i * qk + j] * id;
+            const int8_t xi0 = roundf(x0);
+            weights[i * qk + j] = (uint8_t) (xi0 + 128);
+        }
+    }
+}
+
 void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                    int64_t qk) {
     assert(k % qk == 0);
diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp
index 5496785eb1..71ae317a39 100644
--- a/ggml/src/ggml-openvino/ggml-quants.hpp
+++ b/ggml/src/ggml-openvino/ggml-quants.hpp
@@ -51,7 +51,7 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                        ov::Tensor& biases,
                                        size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
 
-enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 };
+enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };
 
 std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);
 
@@ -59,6 +59,8 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
                    int64_t qk);
 void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                    int64_t qk);
+void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk);
 
 namespace ov {
 namespace op {
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index e9084cf387..0ec815f07f 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -288,8 +288,8 @@ std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& devi
     }
     if (device == "GPU") {
         return {
-            // CVS-166739
-            {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},
+            // gs16 is WIP
+            {GGML_TYPE_Q6_K, ExtraQuantType::Q8_0_32},
        };
     }
    return {};
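
Note (not part of the patch): below is a minimal, self-contained sketch of the Q8_0 block math that quantize_q8_0 implements, assuming the decompression subgraph built by make_int8_weights reconstructs each element as q * scale + bias; the bias of -128 * d then cancels the +128 offset used to store signed values in a u8 tensor. The test vector and error check are illustrative only.

// sketch_q8_0_roundtrip.cpp -- illustration only, not part of the patch.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int qk = 32;  // group size, as selected for ExtraQuantType::Q8_0_32
    std::vector<float> x(qk);
    for (int j = 0; j < qk; ++j) {
        x[j] = std::sin(0.37f * (float) j) * 3.5f;  // arbitrary test data (assumption)
    }

    // Quantize one block the same way quantize_q8_0 does.
    float amax = 0.0f;
    for (float v : x) {
        amax = std::max(amax, std::fabs(v));
    }
    const float d = amax / 127.0f;    // per-block scale
    const float id = d ? 1.0f / d : 0.0f;
    const float bias = -128.0f * d;   // cancels the +128 storage offset

    std::vector<uint8_t> q(qk);
    for (int j = 0; j < qk; ++j) {
        const int8_t xi0 = (int8_t) std::roundf(x[j] * id);
        q[j] = (uint8_t) (xi0 + 128);
    }

    // Dequantize as q * scale + bias (assumed to be how the weight subgraph
    // consumes the scales/bias tensors passed to make_int8_weights).
    float max_err = 0.0f;
    for (int j = 0; j < qk; ++j) {
        const float y = (float) q[j] * d + bias;  // == ((int) q[j] - 128) * d
        max_err = std::max(max_err, std::fabs(y - x[j]));
    }
    assert(max_err <= 0.5f * d + 1e-6f);  // rounding keeps error within half a step
    std::printf("scale=%g bias=%g max_abs_err=%g\n", d, bias, max_err);
    return 0;
}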