From f3afa7b91466fc288488744278f81dd642525716 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Fri, 26 Sep 2025 15:50:32 +0800
Subject: [PATCH] Requantize Q6_K (gs16) to gs32 on GPU

---
 ggml/src/ggml-openvino/ggml-quants.cpp | 43 +++++++++++++++++++++++---
 ggml/src/ggml-openvino/ggml-quants.hpp |  4 ++-
 ggml/src/ggml-openvino/utils.cpp       |  4 +--
 3 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
index 9b8bfff072..1538a8207c 100644
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@@ -425,6 +425,8 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
     int64_t block_size = node_shape[1];
     if (requant_type == ExtraQuantType::Q4_0_128) {
         block_size = 128;
+    } else if (requant_type == ExtraQuantType::Q8_0_32) {
+        block_size = 32;
     }
 
     auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
@@ -432,7 +434,7 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
     ov::Tensor scales(ov::element::f16, scales_shape);
     ov::Tensor bias(ov::element::f16, scales_shape);
 
-    if (requant_type == ExtraQuantType::Q4_0_C) {
+    if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) {
         weights = ov::Tensor(ov::element::u4, node_shape);
         quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
         weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
@@ -440,10 +442,10 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType r
         weights = ov::Tensor(ov::element::u8, node_shape);
         quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
         weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
-    } else if (requant_type == ExtraQuantType::Q4_0_128) {
-        weights = ov::Tensor(ov::element::u4, node_shape);
-        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
-        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) {
+        weights = ov::Tensor(ov::element::u8, node_shape);
+        quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
     }
 
     weight_node->set_friendly_name(tensor->name);
@@ -485,6 +487,37 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
     }
 }
 
+void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+            }
+        }
+
+        const float d = amax / 127.0f;
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(-128.0f * d);
+
+        for (int j = 0; j < qk; ++j) {
+            const float x0 = x[i * qk + j] * id;
+            const int8_t xi0 = roundf(x0);
+            weights[i * qk + j] = (uint8_t) (xi0 + 128);
+        }
+    }
+}
+
 void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                    int64_t qk) {
     assert(k % qk == 0);
diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp
index 5496785eb1..71ae317a39 100644
--- a/ggml/src/ggml-openvino/ggml-quants.hpp
+++ b/ggml/src/ggml-openvino/ggml-quants.hpp
@@ -51,7 +51,7 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                        ov::Tensor& biases,
                                        size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
 
-enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 };
+enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };
 
 std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);
 
@@ -59,6 +59,8 @@ void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_a
                    int64_t qk);
 void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                    int64_t qk);
+void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk);
 
 namespace ov {
 namespace op {
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index e9084cf387..0ec815f07f 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -288,8 +288,8 @@ std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string& devi
     }
     if (device == "GPU") {
         return {
-            // CVS-166739
-            {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},
+            // gs16 is WIP
+            {GGML_TYPE_Q6_K, ExtraQuantType::Q8_0_32},
        };
     }
    return {};
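
Note (not part of the patch): below is a minimal, self-contained sketch of the Q8_0 block math that quantize_q8_0 implements, assuming the decompression subgraph built by make_int8_weights reconstructs each element as q * scale + bias; the bias of -128 * d then cancels the +128 offset used to store signed values in a u8 tensor. The test vector and error check are illustrative only.

// sketch_q8_0_roundtrip.cpp -- illustration only, not part of the patch.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int qk = 32;  // group size, as selected for ExtraQuantType::Q8_0_32
    std::vector<float> x(qk);
    for (int j = 0; j < qk; ++j) {
        x[j] = std::sin(0.37f * (float) j) * 3.5f;  // arbitrary test data (assumption)
    }

    // Quantize one block the same way quantize_q8_0 does.
    float amax = 0.0f;
    for (float v : x) {
        amax = std::max(amax, std::fabs(v));
    }
    const float d = amax / 127.0f;    // per-block scale
    const float id = d ? 1.0f / d : 0.0f;
    const float bias = -128.0f * d;   // cancels the +128 storage offset

    std::vector<uint8_t> q(qk);
    for (int j = 0; j < qk; ++j) {
        const int8_t xi0 = (int8_t) std::roundf(x[j] * id);
        q[j] = (uint8_t) (xi0 + 128);
    }

    // Dequantize as q * scale + bias (assumed to be how the weight subgraph
    // consumes the scales/bias tensors passed to make_int8_weights).
    float max_err = 0.0f;
    for (int j = 0; j < qk; ++j) {
        const float y = (float) q[j] * d + bias;  // == ((int) q[j] - 128) * d
        max_err = std::max(max_err, std::fabs(y - x[j]));
    }
    assert(max_err <= 0.5f * d + 1e-6f);  // rounding keeps error within half a step
    std::printf("scale=%g bias=%g max_abs_err=%g\n", d, bias, max_err);
    return 0;
}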