Add custom quant type: q8_1_c, q4_0_128

Authored by Yu, Zijun on 2025-09-02 13:52:45 +08:00; committed by Mustafa Cavus
parent b593428eb3
commit 6926655f5b
5 changed files with 202 additions and 67 deletions


@@ -25,6 +25,7 @@
#include <openvino/op/parameter.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <openvino/runtime/tensor.hpp>
#include <optional>
#include <ostream>
#include <set>
#include <stdexcept>
@@ -371,7 +372,7 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
}
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize) {
struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex;
auto* nodes = cgraph->nodes;
@@ -396,7 +397,10 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
}
}
if (should_create) {
auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0);
auto requant_type = types_to_requantize.count(src->type) ?
std::optional<ExtraQuantType>(types_to_requantize.at(src->type)) :
std::nullopt;
auto weight_node = create_weight_node(src, requant_type);
weight_node->set_friendly_name(src_name);
{
std::lock_guard<std::mutex> lock(weights_mutex);
@@ -410,7 +414,8 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
return model_weights;
}
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) {
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
std::optional<ExtraQuantType> requant_type) {
std::set<ggml_type> weight_types = {
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
if (weight_types.find(tensor->type) == weight_types.end()) {
@@ -443,21 +448,15 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
tensor->extra == nullptr,
"Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");
if (to_dequantize) {
std::vector<float> weights_f32(ne_total);
ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
ov::Tensor weights(ov::element::f16, node_shape);
ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
weight_node->set_friendly_name(tensor->name);
return weight_node;
if (requant_type.has_value()) {
return requantize(tensor, requant_type.value());
}
uint64_t weights_per_byte;
ov::element::Type weight_type;
if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
weights_per_byte = 2;
weight_type = ov::element::u4;
} else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K
weights_per_byte = 1;
weight_type = ov::element::u8;
}
uint64_t weights_per_block;
@@ -474,15 +473,12 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
" has incompatible last dim shape: ",
node_shape.back());
auto weights_shape = node_shape;
weights_shape.back() /= (weights_per_byte * 4); // means u32 type can store 8 q4 or 4 q8
ov::Tensor weights(ov::element::u32, weights_shape);
// For scales and bias
ov::Tensor weights(weight_type, node_shape);
// For scales and biases
node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block;
ov::Tensor scales(ov::element::f16, node_shape);
ov::Tensor biases(ov::element::f16, node_shape);
ov::Output<ov::Node> weight_node;
if (tensor->type == GGML_TYPE_Q4_0) {
extract_q4_0_data(tensor, weights, scales, biases);
@@ -494,7 +490,6 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
extract_q8_0_data(tensor, weights, scales, biases);
weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
} else if (tensor->type == GGML_TYPE_Q6_K) {
// due to WA #2135, this case will not be used, extract_q6_k_data temporarily disabled.
extract_q6_k_data(tensor, weights, scales, biases);
weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
} else if (tensor->type == GGML_TYPE_Q4_K) {
@@ -503,15 +498,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
}
OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");
// weight_node = std::make_shared<ov::op::v0::Unsqueeze>(
// weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}));
weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
// GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n",
// tensor->name,
// ggml_type_name(tensor->type),
// weight_node.get_element_type().get_type_name().c_str(),
// weight_node.get_partial_shape().to_string().c_str());
return weight_node.get_node_shared_ptr();
}
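For context, a minimal usage sketch of the updated entry points (hypothetical call site; cgraph and tensor are assumed to be in scope, and nothing below is part of the commit). Tensor types listed in the map are decoded to float and repacked by requantize(), while everything else keeps the existing extract_*_data path:

std::map<ggml_type, ExtraQuantType> requant = {
    {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},  // decode Q6_K, repack as per-channel int8
};
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, requant);
// A single tensor can also be converted directly:
auto node = GgmlOvDecoder::create_weight_node(tensor, ExtraQuantType::Q4_0_128);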


@@ -4,8 +4,10 @@
#include <map>
#include <memory>
#include <openvino/core/partial_shape.hpp>
#include <optional>
#include <vector>
#include "ggml-quants.hpp"
#include "ggml.h"
#include "openvino/decoder.hpp"
@@ -117,9 +119,10 @@ public:
static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor, bool to_dequantize);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor,
std::optional<ExtraQuantType> requant_type = std::nullopt);
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize = {});
struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
const ggml_tensor* get_tensor_from_name(const std::string& name) const;


@@ -1,15 +1,20 @@
#include "ggml-quants.hpp"
#include <cstdint>
#include <limits>
#include <memory>
#include <openvino/core/parallel.hpp>
#include <openvino/core/type/element_type_traits.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/runtime/tensor.hpp>
#include <string>
#include "ggml-impl.h"
#include "ggml.h"
void unpack_32_4(const uint8_t* data, uint8_t* dst) {
@@ -203,20 +208,24 @@ void extract_q6_k_data(const ggml_tensor* tensor,
// TODO Reorder for make_intX_weights
ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
// Reshape weight to (num_heads, -1, group_size)
ov::Shape orig_shape = weight.get_shape();
orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t);
size_t num_groups = orig_shape[1] / group_size;
// Expand dimensions for scales and biases
auto scale_shape = scales.get_shape();
scale_shape.push_back(1);
scales.set_shape(scale_shape);
biases.set_shape(scale_shape);
ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
if (packed_shape[1] == 1) {
packed_shape.erase(packed_shape.begin() + 1);
} else {
scale_shape.push_back(1);
scales.set_shape(scale_shape);
biases.set_shape(scale_shape);
}
// Create graph nodes
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast<uint8_t*>(weight.data()), nullptr);
auto weights_node = std::make_shared<ov::op::v0::Constant>(
ov::element::u8, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
ov::Tensor biases_u8(ov::element::u8, scale_shape);
@@ -242,32 +251,24 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
auto w_zp = std::make_shared<ov::op::v1::Subtract>(
weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
);
auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY
);
ov::Output<ov::Node> w_zp_s =
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
// Reshape back to original dimensions
auto final_shape = std::make_shared<ov::op::v0::Constant>(
ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape
);
auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
w_zp_s, final_shape, false
);
if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape
auto final_shape =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
}
return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
}
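// Shape bookkeeping sketch (illustrative numbers, not part of the commit): a
// 4096 x 4096 u8 weight with group_size = 32 enters make_int8_weights as
// weights u8 [4096, 128, 32] with scales/biases f16 [4096, 128, 1], and
// (w - zp) * s is reshaped back to [4096, 4096]. In the channel-wise case
// (group_size == 4096) the group axis would be 1, so it is dropped up front
// and the final Reshape is skipped.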
ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
// Convert weight to uint8 view and adjust shape
ov::Shape orig_weight_shape = weight.get_shape();
orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2; // Double number of columns for 4-bit representation
// Expand dimensions for scales and biases
ov::Shape scale_bias_shape = scales.get_shape();
scale_bias_shape.push_back(1); // Add new axis at the end
scales.set_shape(scale_bias_shape);
biases.set_shape(scale_bias_shape);
// Create INT4 weight tensor
ov::Shape packed_shape = {
@@ -276,8 +277,17 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
group_size
};
// Requantized channel-wise case
if (packed_shape[1] == 1) {
packed_shape.erase(packed_shape.begin() + 1);
} else {
scale_bias_shape.push_back(1);
scales.set_shape(scale_bias_shape);
biases.set_shape(scale_bias_shape);
}
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holde"] = weight;
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
// Pack zero points: two subsequent values into one
@@ -304,15 +314,129 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
auto w_zp = std::make_shared<ov::op::v1::Subtract>(
weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
ov::Output<ov::Node> w_zp_s =
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
// Reshape back to original shape
auto final_shape = std::make_shared<ov::op::v0::Constant>(
ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape
auto final_shape = std::make_shared<ov::op::v0::Constant>(
ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
w_zp_s, final_shape, false);
w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
}
return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
}
std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) {
std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
std::shared_ptr<ov::Node> weight_node;
ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
if (requant_type == ExtraQuantType::F16) {
ov::Tensor weights(ov::element::f16, node_shape);
ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
weight_node->set_friendly_name(tensor->name);
return weight_node;
}
int64_t block_size = node_shape[1];
if (requant_type == ExtraQuantType::Q4_0_128) {
block_size = 128;
}
auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
ov::Tensor weights;
ov::Tensor scales(ov::element::f16, scales_shape);
ov::Tensor bias(ov::element::f16, scales_shape);
if (requant_type == ExtraQuantType::Q4_0_C) {
weights = ov::Tensor(ov::element::u4, node_shape);
quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
} else if (requant_type == ExtraQuantType::Q8_1_C) {
weights = ov::Tensor(ov::element::u8, node_shape);
quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
} else if (requant_type == ExtraQuantType::Q4_0_128) {
weights = ov::Tensor(ov::element::u4, node_shape);
quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
}
weight_node->set_friendly_name(tensor->name);
return weight_node;
}
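// Reader note (not part of the commit): the "_C" suffix denotes channel-wise
// requantization, i.e. one scale/bias per output row (block_size equals ne[0]),
// while Q4_0_128 uses fixed 128-element groups along each row.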
void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk) {
assert(k % qk == 0);
const int nb = k / qk;
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = max / -8;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
biases[i] = ov::float16(-8.f * d);
for (int j = 0; j < qk / 2; ++j) {
const float x0 = x[i * qk + 2 * j] * id;
const float x1 = x[i * qk + 2 * j + 1] * id;
const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
}
}
}
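// Sanity-check sketch (hypothetical helper, not part of the commit): invert the
// packing above. Each stored nibble is roughly round(x / d) + 8, clamped to [0, 15],
// so the original value is recovered as q * d + bias, with bias = -8 * d.
static inline float dequant_q4_0_value(uint8_t packed, bool high_nibble, float d) {
    const uint8_t q = high_nibble ? (packed >> 4) : (packed & 0x0F);
    return q * d + (-8.0f * d);  // equivalent to (q - 8) * d
}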
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk) {
assert(k % qk == 0);
const int nb = k / qk;
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
for (int i = 0; i < nb; i++) {
float min = std::numeric_limits<float>::max();
float max = std::numeric_limits<float>::lowest();
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (v < min) {
min = v;
}
if (v > max) {
max = v;
}
}
const float d = (max - min) / ((1 << 8) - 1);
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
biases[i] = ov::float16(min);
for (int j = 0; j < qk; ++j) {
const float x0 = (x[i * qk + j] - min) * id;
const uint8_t xi0 = roundf(x0);
weights[i * qk + j] = xi0;
}
}
}
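For reference, a minimal inverse sketch (hypothetical helper, not part of the commit): quantize_q8_1 above stores an asymmetric 8-bit code with scale d = (max - min) / 255 and bias = min, so a stored byte decodes as:

static inline float dequant_q8_1_value(uint8_t q, float d, float min) {
    return q * d + min;  // inverse of q = roundf((x - min) / d)
}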


@@ -1,3 +1,4 @@
#pragma once
#include <cstdint>
#include <openvino/op/constant.hpp>
#include <openvino/runtime/tensor.hpp>
@@ -45,6 +46,15 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
ov::Tensor& biases,
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 };
std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);
void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk);
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk);
namespace ov {
namespace op {
namespace util {


@@ -130,11 +130,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
compile_end_time = conversion_end_time;
} else {
std::shared_ptr<ov::Model> model;
std::set<ggml_type> types_to_dequantize;
std::map<ggml_type, ExtraQuantType> types_to_requantize;
if (is_static) {
types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
types_to_requantize = {
{GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C },
};
} else if (device == "GPU") {
types_to_requantize = {
// CVS-166739
{GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},
};
}
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize);
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_requantize);
if (is_static) {
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);