Add custom quant type: q8_1_c, q4_0_128

Authored by Yu, Zijun on 2025-09-02 13:52:45 +08:00; committed by Mustafa Cavus
parent b593428eb3
commit 6926655f5b
5 changed files with 202 additions and 67 deletions


@@ -25,6 +25,7 @@
#include <openvino/op/parameter.hpp>
#include <openvino/op/unsqueeze.hpp>
#include <openvino/runtime/tensor.hpp>
#include <optional>
#include <ostream>
#include <set>
#include <stdexcept>
@@ -371,7 +372,7 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
}
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize) {
struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex;
auto* nodes = cgraph->nodes;
@@ -396,7 +397,10 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
}
}
if (should_create) {
auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0);
auto requant_type = types_to_requantize.count(src->type) ?
std::optional<ExtraQuantType>(types_to_requantize.at(src->type)) :
std::nullopt;
auto weight_node = create_weight_node(src, requant_type);
weight_node->set_friendly_name(src_name);
{
std::lock_guard<std::mutex> lock(weights_mutex);
@@ -410,7 +414,8 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
return model_weights;
}
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) {
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
std::optional<ExtraQuantType> requant_type) {
std::set<ggml_type> weight_types = {
GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
if (weight_types.find(tensor->type) == weight_types.end()) {
@@ -443,21 +448,15 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
tensor->extra == nullptr,
"Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");
if (to_dequantize) {
std::vector<float> weights_f32(ne_total);
ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
ov::Tensor weights(ov::element::f16, node_shape);
ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
weight_node->set_friendly_name(tensor->name);
return weight_node;
if (requant_type.has_value()) {
return requantize(tensor, requant_type.value());
}
uint64_t weights_per_byte;
ov::element::Type weight_type;
if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
weights_per_byte = 2;
weight_type = ov::element::u4;
} else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K
weights_per_byte = 1;
weight_type = ov::element::u8;
}
uint64_t weights_per_block;
@@ -474,15 +473,12 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
" has incompatible last dim shape: ",
node_shape.back());
auto weights_shape = node_shape;
weights_shape.back() /= (weights_per_byte * 4); // means u32 type can store 8 q4 or 4 q8
ov::Tensor weights(ov::element::u32, weights_shape);
// For scales and bias
ov::Tensor weights(weight_type, node_shape);
// For scales and biases
node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block;
ov::Tensor scales(ov::element::f16, node_shape);
ov::Tensor biases(ov::element::f16, node_shape);
ov::Output<ov::Node> weight_node;
if (tensor->type == GGML_TYPE_Q4_0) {
extract_q4_0_data(tensor, weights, scales, biases);
@@ -494,7 +490,6 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
extract_q8_0_data(tensor, weights, scales, biases);
weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
} else if (tensor->type == GGML_TYPE_Q6_K) {
// due to WA #2135, this case will not be used, extract_q6_k_data temporarily disabled.
extract_q6_k_data(tensor, weights, scales, biases);
weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
} else if (tensor->type == GGML_TYPE_Q4_K) {
@@ -503,15 +498,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
}
OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");
// weight_node = std::make_shared<ov::op::v0::Unsqueeze>(
// weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}));
weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
// GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n",
// tensor->name,
// ggml_type_name(tensor->type),
// weight_node.get_element_type().get_type_name().c_str(),
// weight_node.get_partial_shape().to_string().c_str());
return weight_node.get_node_shared_ptr();
}
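For context, a minimal usage sketch of the updated entry points (hypothetical call site; cgraph and tensor are assumed to be in scope, and nothing below is part of the commit). Tensor types listed in the map are decoded to float and repacked by requantize(), while everything else keeps the existing extract_*_data path:

std::map<ggml_type, ExtraQuantType> requant = {
    {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},  // decode Q6_K, repack as per-channel int8
};
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, requant);
// A single tensor can also be converted directly:
auto node = GgmlOvDecoder::create_weight_node(tensor, ExtraQuantType::Q4_0_128);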


@@ -4,8 +4,10 @@
#include <map>
#include <memory>
#include <openvino/core/partial_shape.hpp>
#include <optional>
#include <vector>
#include "ggml-quants.hpp"
#include "ggml.h"
#include "openvino/decoder.hpp"
@@ -117,9 +119,10 @@ public:
static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor, bool to_dequantize);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor,
std::optional<ExtraQuantType> requant_type = std::nullopt);
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize = {});
struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
const ggml_tensor* get_tensor_from_name(const std::string& name) const;


@@ -1,15 +1,20 @@
#include "ggml-quants.hpp"
#include <cstdint>
#include <limits>
#include <memory>
#include <openvino/core/parallel.hpp>
#include <openvino/core/type/element_type_traits.hpp>
#include <openvino/core/type/float16.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/multiply.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/subtract.hpp>
#include <openvino/runtime/tensor.hpp>
#include <string>
#include "ggml-impl.h"
#include "ggml.h"
void unpack_32_4(const uint8_t* data, uint8_t* dst) {
@@ -203,20 +208,24 @@ void extract_q6_k_data(const ggml_tensor* tensor,
// TODO Reorder for make_intX_weights
ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
// Reshape weight to (num_heads, -1, group_size)
ov::Shape orig_shape = weight.get_shape();
orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t);
size_t num_groups = orig_shape[1] / group_size;
// Expand dimensions for scales and biases
auto scale_shape = scales.get_shape();
scale_shape.push_back(1);
scales.set_shape(scale_shape);
biases.set_shape(scale_shape);
ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
if (packed_shape[1] == 1) {
packed_shape.erase(packed_shape.begin() + 1);
} else {
scale_shape.push_back(1);
scales.set_shape(scale_shape);
biases.set_shape(scale_shape);
}
// Create graph nodes
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast<uint8_t*>(weight.data()), nullptr);
auto weights_node = std::make_shared<ov::op::v0::Constant>(
ov::element::u8, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
ov::Tensor biases_u8(ov::element::u8, scale_shape);
@@ -242,32 +251,24 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
auto w_zp = std::make_shared<ov::op::v1::Subtract>(
weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
);
auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY
);
ov::Output<ov::Node> w_zp_s =
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
// Reshape back to original dimensions
auto final_shape = std::make_shared<ov::op::v0::Constant>(
ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape
);
auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
w_zp_s, final_shape, false
);
if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape
auto final_shape =
std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
}
return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
}
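// Shape bookkeeping sketch (illustrative numbers, not part of the commit): a
// 4096 x 4096 u8 weight with group_size = 32 enters make_int8_weights as
// weights u8 [4096, 128, 32] with scales/biases f16 [4096, 128, 1], and
// (w - zp) * s is reshaped back to [4096, 4096]. In the channel-wise case
// (group_size == 4096) the group axis would be 1, so it is dropped up front
// and the final Reshape is skipped.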
ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
// Convert weight to uint8 view and adjust shape
ov::Shape orig_weight_shape = weight.get_shape();
orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2; // Double number of columns for 4-bit representation
// Expand dimensions for scales and biases
ov::Shape scale_bias_shape = scales.get_shape();
scale_bias_shape.push_back(1); // Add new axis at the end
scales.set_shape(scale_bias_shape);
biases.set_shape(scale_bias_shape);
// Create INT4 weight tensor
ov::Shape packed_shape = {
@@ -276,8 +277,17 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
group_size
};
// Requantized channel-wise case
if (packed_shape[1] == 1) {
packed_shape.erase(packed_shape.begin() + 1);
} else {
scale_bias_shape.push_back(1);
scales.set_shape(scale_bias_shape);
biases.set_shape(scale_bias_shape);
}
auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holde"] = weight;
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
// Pack zero points: two subsequent values into one
@@ -304,15 +314,129 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
auto w_zp = std::make_shared<ov::op::v1::Subtract>(
weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
ov::Output<ov::Node> w_zp_s =
std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
// Reshape back to original shape
auto final_shape = std::make_shared<ov::op::v0::Constant>(
ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
if (packed_shape.size() != 2) {
// If not requantized channel-wise case, reshape back to original shape
auto final_shape = std::make_shared<ov::op::v0::Constant>(
ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
w_zp_s, final_shape, false);
w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
}
return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
}
std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) {
std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
std::shared_ptr<ov::Node> weight_node;
ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
if (requant_type == ExtraQuantType::F16) {
ov::Tensor weights(ov::element::f16, node_shape);
ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
weight_node->set_friendly_name(tensor->name);
return weight_node;
}
int64_t block_size = node_shape[1];
if (requant_type == ExtraQuantType::Q4_0_128) {
block_size = 128;
}
auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
ov::Tensor weights;
ov::Tensor scales(ov::element::f16, scales_shape);
ov::Tensor bias(ov::element::f16, scales_shape);
if (requant_type == ExtraQuantType::Q4_0_C) {
weights = ov::Tensor(ov::element::u4, node_shape);
quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
} else if (requant_type == ExtraQuantType::Q8_1_C) {
weights = ov::Tensor(ov::element::u8, node_shape);
quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
} else if (requant_type == ExtraQuantType::Q4_0_128) {
weights = ov::Tensor(ov::element::u4, node_shape);
quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
}
weight_node->set_friendly_name(tensor->name);
return weight_node;
}
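// Reader note (not part of the commit): the "_C" suffix denotes channel-wise
// requantization, i.e. one scale/bias per output row (block_size equals ne[0]),
// while Q4_0_128 uses fixed 128-element groups along each row.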
void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk) {
assert(k % qk == 0);
const int nb = k / qk;
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
float max = 0.0f;
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (amax < fabsf(v)) {
amax = fabsf(v);
max = v;
}
}
const float d = max / -8;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
biases[i] = ov::float16(-8.f * d);
for (int j = 0; j < qk / 2; ++j) {
const float x0 = x[i * qk + 2 * j] * id;
const float x1 = x[i * qk + 2 * j + 1] * id;
const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
}
}
}
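// Sanity-check sketch (hypothetical helper, not part of the commit): invert the
// packing above. Each stored nibble is roughly round(x / d) + 8, clamped to [0, 15],
// so the original value is recovered as q * d + bias, with bias = -8 * d.
static inline float dequant_q4_0_value(uint8_t packed, bool high_nibble, float d) {
    const uint8_t q = high_nibble ? (packed >> 4) : (packed & 0x0F);
    return q * d + (-8.0f * d);  // equivalent to (q - 8) * d
}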
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk) {
assert(k % qk == 0);
const int nb = k / qk;
auto* weights = static_cast<uint8_t*>(weights_arr.data());
auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
for (int i = 0; i < nb; i++) {
float min = std::numeric_limits<float>::max();
float max = std::numeric_limits<float>::lowest();
for (int j = 0; j < qk; j++) {
const float v = x[i * qk + j];
if (v < min) {
min = v;
}
if (v > max) {
max = v;
}
}
const float d = (max - min) / ((1 << 8) - 1);
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
biases[i] = ov::float16(min);
for (int j = 0; j < qk; ++j) {
const float x0 = (x[i * qk + j] - min) * id;
const uint8_t xi0 = roundf(x0);
weights[i * qk + j] = xi0;
}
}
}
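For reference, a minimal inverse sketch (hypothetical helper, not part of the commit): quantize_q8_1 above stores an asymmetric 8-bit code with scale d = (max - min) / 255 and bias = min, so a stored byte decodes as:

static inline float dequant_q8_1_value(uint8_t q, float d, float min) {
    return q * d + min;  // inverse of q = roundf((x - min) / d)
}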


@@ -1,3 +1,4 @@
#pragma once
#include <cstdint>
#include <openvino/op/constant.hpp>
#include <openvino/runtime/tensor.hpp>
@@ -45,6 +46,15 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
ov::Tensor& biases,
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 };
std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);
void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk);
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk);
namespace ov {
namespace op {
namespace util {


@@ -130,11 +130,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
compile_end_time = conversion_end_time;
} else {
std::shared_ptr<ov::Model> model;
std::set<ggml_type> types_to_dequantize;
std::map<ggml_type, ExtraQuantType> types_to_requantize;
if (is_static) {
types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
types_to_requantize = {
{GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C },
};
} else if (device == "GPU") {
types_to_requantize = {
// CVS-166739
{GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},
};
}
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize);
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_requantize);
if (is_static) {
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);