diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index fef8648ebd..d00b78e891 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -25,6 +25,7 @@
 #include
 #include
 #include
+#include <optional>
 #include
 #include
 #include
@@ -371,7 +372,7 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
 }
 
 std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
-    struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize) {
+    struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize) {
     std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     static std::mutex weights_mutex;
     auto* nodes = cgraph->nodes;
@@ -396,7 +397,10 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
             }
         }
         if (should_create) {
-            auto weight_node = create_weight_node(src, types_to_dequantize.count(src->type) > 0);
+            auto requant_type = types_to_requantize.count(src->type) ?
+                                    std::optional<ExtraQuantType>(types_to_requantize.at(src->type)) :
+                                    std::nullopt;
+            auto weight_node = create_weight_node(src, requant_type);
             weight_node->set_friendly_name(src_name);
             {
                 std::lock_guard<std::mutex> lock(weights_mutex);
@@ -410,7 +414,8 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
     return model_weights;
 }
 
-std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor, bool to_dequantize) {
+std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
+                                                            std::optional<ExtraQuantType> requant_type) {
     std::set<ggml_type> weight_types = {
         GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
     if (weight_types.find(tensor->type) == weight_types.end()) {
@@ -443,21 +448,15 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
                     tensor->extra == nullptr,
                     "Unsupported weight tensor: " + std::string(tensor->name) + " Possibly this is a repacked quantized weights");
 
-    if (to_dequantize) {
-        std::vector<float> weights_f32(ne_total);
-        ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
-        ov::Tensor weights(ov::element::f16, node_shape);
-        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
-        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
-        weight_node->set_friendly_name(tensor->name);
-        return weight_node;
+    if (requant_type.has_value()) {
+        return requantize(tensor, requant_type.value());
     }
 
-    uint64_t weights_per_byte;
+    ov::element::Type weight_type;
     if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
-        weights_per_byte = 2;
+        weight_type = ov::element::u4;
     } else {  // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K
-        weights_per_byte = 1;
+        weight_type = ov::element::u8;
     }
 
     uint64_t weights_per_block;
@@ -474,15 +473,12 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
                     " has incompatible last dim shape: ",
                     node_shape.back());
 
-    auto weights_shape = node_shape;
-    weights_shape.back() /= (weights_per_byte * 4);  // means u32 type can store 8 q4 or 4 q8
-
-    ov::Tensor weights(ov::element::u32, weights_shape);
-    // For scales and bias
+    ov::Tensor weights(weight_type, node_shape);
+    // For scales and biases
     node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block;
-
     ov::Tensor scales(ov::element::f16, node_shape);
     ov::Tensor biases(ov::element::f16, node_shape);
+    ov::Output<ov::Node> weight_node;
 
     if (tensor->type == GGML_TYPE_Q4_0) {
         extract_q4_0_data(tensor, weights, scales, biases);
@@ -494,7 +490,6 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
         extract_q8_0_data(tensor, weights, scales, biases);
         weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
     } else if (tensor->type == GGML_TYPE_Q6_K) {
-        // due to WA #2135, this case will not be used, extract_q6_k_data temporarily disabled.
         extract_q6_k_data(tensor, weights, scales, biases);
         weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
     } else if (tensor->type == GGML_TYPE_Q4_K) {
@@ -503,15 +498,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor,
     }
     OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");
 
-    // weight_node = std::make_shared(
-    //     weight_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}));
     weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
 
-    // GGML_LOG_DEBUG("Created weight node: %s %s %s%s\n",
-    //                tensor->name,
-    //                ggml_type_name(tensor->type),
-    //                weight_node.get_element_type().get_type_name().c_str(),
-    //                weight_node.get_partial_shape().to_string().c_str());
 
     return weight_node.get_node_shared_ptr();
 }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index b446841514..24e1d92dcf 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -4,8 +4,10 @@
 #include
 #include
 #include
+#include <optional>
 #include
 
+#include "ggml-quants.hpp"
 #include "ggml.h"
 #include "openvino/decoder.hpp"
 
@@ -117,9 +119,10 @@ public:
 
     static void dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename);
 
-    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor, bool to_dequantize);
+    static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor,
+                                                        std::optional<ExtraQuantType> requant_type = std::nullopt);
     static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
-        struct ggml_cgraph* cgraph, std::set<ggml_type> types_to_dequantize = {});
+        struct ggml_cgraph* cgraph, std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
 
     const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const;
     const ggml_tensor* get_tensor_from_name(const std::string& name) const;
diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
index 97aa494ed8..1603e65355 100644
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@@ -1,15 +1,20 @@
 #include "ggml-quants.hpp"
 
 #include
+#include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 
+#include "ggml-impl.h"
 #include "ggml.h"
 
 void unpack_32_4(const uint8_t* data, uint8_t* dst) {
@@ -203,20 +208,24 @@ void extract_q6_k_data(const ggml_tensor* tensor,
 
 // TODO Reorder for make_intX_weights
 ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
-
-    // Reshape weight to (num_heads, -1, group_size)
     ov::Shape orig_shape = weight.get_shape();
-    orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t);
-    size_t num_groups = orig_shape[1] / group_size;
 
     // Expand dimensions for scales and biases
     auto scale_shape = scales.get_shape();
-    scale_shape.push_back(1);
-    scales.set_shape(scale_shape);
-    biases.set_shape(scale_shape);
+
+    ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};
+
+    if (packed_shape[1] == 1) {
+        packed_shape.erase(packed_shape.begin() + 1);
+    } else {
+        scale_shape.push_back(1);
+        scales.set_shape(scale_shape);
+        biases.set_shape(scale_shape);
+    }
 
     // Create graph nodes
-    auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast<uint8_t*>(weight.data()), nullptr);
+    auto weights_node = std::make_shared<ov::op::v0::Constant>(
+        ov::element::u8, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
     weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
     ov::Tensor biases_u8(ov::element::u8, scale_shape);
@@ -242,32 +251,24 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, o
     auto w_zp = std::make_shared<ov::op::v1::Subtract>(
         weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY
     );
 
-    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
-        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY
-    );
+    ov::Output<ov::Node> w_zp_s =
+        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
 
-    // Reshape back to original dimensions
-    auto final_shape = std::make_shared<ov::op::v0::Constant>(
-        ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape
-    );
-    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
-        w_zp_s, final_shape, false
-    );
+    if (packed_shape.size() != 2) {
+        // If not requantized channel-wise case, reshape back to original shape
+        auto final_shape =
+            std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
+        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+    }
 
-    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
 }
 
 ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
-
-    // Convert weight to uint8 view and adjust shape
     ov::Shape orig_weight_shape = weight.get_shape();
-    orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2;  // Double number of columns for 4-bit representation
 
     // Expand dimensions for scales and biases
     ov::Shape scale_bias_shape = scales.get_shape();
-    scale_bias_shape.push_back(1);  // Add new axis at the end
-    scales.set_shape(scale_bias_shape);
-    biases.set_shape(scale_bias_shape);
 
     // Create INT4 weight tensor
     ov::Shape packed_shape = {
@@ -276,8 +277,17 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
         group_size
     };
 
+    // Requantized channel-wise case
+    if (packed_shape[1] == 1) {
+        packed_shape.erase(packed_shape.begin() + 1);
+    } else {
+        scale_bias_shape.push_back(1);
+        scales.set_shape(scale_bias_shape);
+        biases.set_shape(scale_bias_shape);
+    }
+
     auto weights_node = std::make_shared<ov::op::v0::Constant>(ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
-    weights_node->get_rt_info()["__gguf_tensor_holde"] = weight;
+    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
     auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
 
     // Pack zero points: two subsequent values into one
@@ -304,15 +314,129 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, o
     auto w_zp = std::make_shared<ov::op::v1::Subtract>(
         weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
 
-    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(
-        w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+    ov::Output<ov::Node> w_zp_s =
+        std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
 
-    // Reshape back to original shape
-    auto final_shape = std::make_shared<ov::op::v0::Constant>(
-        ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
+    if (packed_shape.size() != 2) {
+        // If not requantized channel-wise case, reshape back to original shape
+        auto final_shape = std::make_shared<ov::op::v0::Constant>(
+            ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
 
-    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(
-        w_zp_s, final_shape, false);
+        w_zp_s = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+    }
 
-    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
+}
+
+std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type) {
+    std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
+    ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
+
+    std::shared_ptr<ov::Node> weight_node;
+    ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};
+
+    if (requant_type == ExtraQuantType::F16) {
+        ov::Tensor weights(ov::element::f16, node_shape);
+        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
+        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
+        weight_node->set_friendly_name(tensor->name);
+        return weight_node;
+    }
+
+    int64_t block_size = node_shape[1];
+    if (requant_type == ExtraQuantType::Q4_0_128) {
+        block_size = 128;
+    }
+    auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};
+
+    ov::Tensor weights;
+    ov::Tensor scales(ov::element::f16, scales_shape);
+    ov::Tensor bias(ov::element::f16, scales_shape);
+
+    if (requant_type == ExtraQuantType::Q4_0_C) {
+        weights = ov::Tensor(ov::element::u4, node_shape);
+        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q8_1_C) {
+        weights = ov::Tensor(ov::element::u8, node_shape);
+        quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    } else if (requant_type == ExtraQuantType::Q4_0_128) {
+        weights = ov::Tensor(ov::element::u4, node_shape);
+        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
+        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
+    }
+
+    weight_node->set_friendly_name(tensor->name);
+    return weight_node;
+}
+
+void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f;  // absolute max
+        float max = 0.0f;
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
+        }
+
+        const float d = max / -8;
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(-8.f * d);
+
+        for (int j = 0; j < qk / 2; ++j) {
+            const float x0 = x[i * qk + 2 * j] * id;
+            const float x1 = x[i * qk + 2 * j + 1] * id;
+            const uint8_t xi0 = MIN(15, (int8_t) (x0 + 8.5f));
+            const uint8_t xi1 = MIN(15, (int8_t) (x1 + 8.5f));
+            weights[i * qk / 2 + j] = xi0 | (xi1 << 4);
+        }
+    }
+}
+
+void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk) {
+    assert(k % qk == 0);
+    const int nb = k / qk;
+
+    auto* weights = static_cast<uint8_t*>(weights_arr.data());
+    auto* scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto* biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int i = 0; i < nb; i++) {
+        float min = std::numeric_limits<float>::max();
+        float max = std::numeric_limits<float>::lowest();
+
+        for (int j = 0; j < qk; j++) {
+            const float v = x[i * qk + j];
+            if (v < min) {
+                min = v;
+            }
+            if (v > max) {
+                max = v;
+            }
+        }
+
+        const float d = (max - min) / ((1 << 8) - 1);
+        const float id = d ? 1.0f / d : 0.0f;
+        scales[i] = ov::float16(d);
+        biases[i] = ov::float16(min);
+
+        for (int j = 0; j < qk; ++j) {
+            const float x0 = (x[i * qk + j] - min) * id;
+            const uint8_t xi0 = roundf(x0);
+            weights[i * qk + j] = xi0;
+        }
+    }
 }
diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp
index ae37b1618e..fbae2aa1f4 100644
--- a/ggml/src/ggml-openvino/ggml-quants.hpp
+++ b/ggml/src/ggml-openvino/ggml-quants.hpp
@@ -1,3 +1,4 @@
+#pragma once
 #include
 #include
 #include
@@ -45,6 +46,15 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                        ov::Tensor& biases,
                                        size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
 
+enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128 };
+
+std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);
+
+void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk);
+void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
+                   int64_t qk);
+
 namespace ov {
 namespace op {
 namespace util {
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index e49d941da4..3f728c242d 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -130,11 +130,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         compile_end_time = conversion_end_time;
     } else {
         std::shared_ptr<ov::Model> model;
-        std::set<ggml_type> types_to_dequantize;
+        std::map<ggml_type, ExtraQuantType> types_to_requantize;
         if (is_static) {
-            types_to_dequantize = {GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
+            types_to_requantize = {
+                {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
+                {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
+                {GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
+                {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C },
+            };
+        } else if (device == "GPU") {
+            types_to_requantize = {
+                // CVS-166739
+                {GGML_TYPE_Q6_K, ExtraQuantType::Q8_1_C},
+            };
         }
-        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_dequantize);
+        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, types_to_requantize);
 
         if (is_static) {
             ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, model_weights, is_static, true);