diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 0fd64c685f..c2e164b808 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <set>
 #include
 #include
 #include
@@ -29,6 +30,7 @@
 
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
+#include "ggml-quant.hpp"
 
 GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
                              int context_size, int num_heads, int num_heads_kv, int head_size) :
@@ -402,12 +404,78 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
 }
 
 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
+    std::set<ggml_type> weight_types = {
+        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q6_K};
+    if (weight_types.find(tensor->type) == weight_types.end()) {
+        throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
+                                 ggml_type_name(tensor->type));
+    }
+
     auto node_type = get_ov_type(tensor);
     auto node_shape = get_shape(tensor);
     auto ne_total = ggml_nelements(tensor);
-    ov::Tensor weights(node_type, node_shape);
-    memcpy(weights.data(), tensor->data, ne_total * node_type.size());
-    return std::make_shared<ov::op::v0::Constant>(weights);
+
+    if (node_type != ov::element::dynamic) {
+        ov::Tensor weights(node_type, node_shape);
+        memcpy(weights.data(), tensor->data, ne_total * node_type.size());
+        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
+        if (node_type == ov::element::f16) {
+            weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32);
+        }
+        weight_node->set_friendly_name(tensor->name);
+        return weight_node;
+    }
+
+    uint64_t weights_per_byte;
+    if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
+        weights_per_byte = 2;
+    } else {  // tensor->type == GGML_TYPE_Q8_0 || tensor->type == GGML_TYPE_Q6_K
+        weights_per_byte = 1;
+    }
+
+    uint64_t weights_per_block;
+    // Only the sub-block size matters here: Q6_K uses 16, Q4_K and the 32-weight block formats use 32.
+    if (tensor->type == GGML_TYPE_Q6_K) {
+        weights_per_block = 16;
+    } else {
+        weights_per_block = 32;
+    }
+
+    OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0,
+                    "[load_gguf] tensor ",
+                    tensor->name,
+                    " has incompatible last dim shape: ",
+                    node_shape.back());
+
+    auto weights_shape = node_shape;
+    weights_shape.back() /= (weights_per_byte * 4);  // one u32 element stores 8 q4 or 4 q8 weights
+
+    ov::Tensor weights(ov::element::u32, weights_shape);
+    // For scales and biases
+    node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block;
+
+    ov::Tensor scales(ov::element::f16, node_shape);
+    ov::Tensor biases(ov::element::f16, node_shape);
+    ov::Output<ov::Node> weight_node;
+    if (tensor->type == GGML_TYPE_Q4_0) {
+        extract_q4_0_data(tensor, weights, scales, biases);
+        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
+    } else if (tensor->type == GGML_TYPE_Q4_1) {
+        extract_q4_1_data(tensor, weights, scales, biases);
+        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
+    } else if (tensor->type == GGML_TYPE_Q8_0) {
+        extract_q8_0_data(tensor, weights, scales, biases);
+        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
+    } else if (tensor->type == GGML_TYPE_Q6_K) {
+        // Due to WA #2135 this path is currently not taken; extract_q6_k_data is temporarily unused.
+        extract_q6_k_data(tensor, weights, scales, biases);
+        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
+    } else if (tensor->type == GGML_TYPE_Q4_K) {
+        extract_q4_k_data(tensor, weights, scales, biases);
+        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
+    }
+    weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
+    return weight_node.get_node_shared_ptr();
 }
 
 void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, std::string& filename) {
@@ -537,7 +605,7 @@ ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) {
     case GGML_TYPE_I64:
         return ov::element::i64;
     default:
-        throw std::runtime_error("Unsupported tensor type");
+        return ov::element::dynamic;
     }
 }
 
diff --git a/ggml/src/ggml-openvino/ggml-quant.cpp b/ggml/src/ggml-openvino/ggml-quant.cpp
new file mode 100644
index 0000000000..4311ab138e
--- /dev/null
+++ b/ggml/src/ggml-openvino/ggml-quant.cpp
@@ -0,0 +1,313 @@
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <openvino/core/parallel.hpp>
+#include <openvino/core/type/float16.hpp>
+#include <openvino/op/constant.hpp>
+#include <openvino/op/convert.hpp>
+#include <openvino/op/multiply.hpp>
+#include <openvino/op/reshape.hpp>
+#include <openvino/op/subtract.hpp>
+#include <openvino/runtime/tensor.hpp>
+
+#include "ggml.h"
+
+void unpack_32_4(const uint8_t* data, uint8_t* dst) {
+    std::fill_n(dst, 16, 0);
+    for (int j = 0; j < 16; ++j) {
+        uint8_t x = (data[j + 2] & 0x0F);  // j+2 to skip the scale bytes.
+        uint8_t y = (data[j + 2] >> 4);
+        if (j % 2 != 0) {
+            x <<= 4;
+            y <<= 4;
+        }
+        dst[j / 2] |= x;
+        dst[8 + j / 2] |= y;  // The last 16 weights go into the higher bits.
+    }
+}
+
+// Extracts (weight, scales, biases) from Q4_0 tensors.
+// Data layout is: |16 bit scale|32 x 4bit weights|.
+void extract_q4_0_data(const ggml_tensor* tensor,
+                       ov::Tensor& weights_arr,
+                       ov::Tensor& scales_arr,
+                       ov::Tensor& biases_arr) {
+    const uint64_t bytes_per_block = 18;  // 2 bytes scale, 32 x 0.5 byte weights
+    auto data = static_cast<uint8_t*>(tensor->data);
+    auto weights = static_cast<uint8_t*>(weights_arr.data());
+    auto scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+
+    ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+        scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block)));
+        biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
+        unpack_32_4(data + i * bytes_per_block, weights + i * 16);
+    });
+}
+
+// Extracts (weight, scales, biases) from Q4_1 tensors.
+// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|.
+void extract_q4_1_data(const ggml_tensor* tensor,
+                       ov::Tensor& weights_arr,
+                       ov::Tensor& scales_arr,
+                       ov::Tensor& biases_arr) {
+    const uint64_t bytes_per_block = 20;  // 2 bytes scale, 2 bytes bias, 32 x 0.5 byte weights
+    auto data = static_cast<uint8_t*>(tensor->data);
+    auto weights = static_cast<uint8_t*>(weights_arr.data());
+    auto scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
+        scales[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block)));
+        biases[i] = ov::float16::from_bits(*((uint16_t*)(data + i * bytes_per_block) + 1));
+        // Shift the block start by the 2 extra bias bytes so unpack_32_4's fixed 2-byte skip
+        // lands on the first weight byte.
+        unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
+    });
+}
+
+// Extracts (weight, scales, biases) from Q8_0 tensors.
+// Data layout is: |16 bit scale|32 x 8bit weights|.
+void extract_q8_0_data(const ggml_tensor* tensor,
+                       ov::Tensor& weights_arr,
+                       ov::Tensor& scales_arr,
+                       ov::Tensor& biases_arr) {
+    const uint64_t weights_per_block = 32;
+    const uint64_t bytes_per_block = 34;  // 2 bytes scale, 32 x 1 byte weights
+    auto data = static_cast<uint8_t*>(tensor->data);
+    auto weights = static_cast<uint8_t*>(weights_arr.data());
+    auto scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    for (int64_t i = 0; i < scales_arr.get_size(); i++) {
+        uint8_t* block_data = data + i * bytes_per_block;
+        scales[i] = ov::float16::from_bits(*(uint16_t*)block_data);
+        biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
+        for (int64_t j = 0; j < weights_per_block; ++j) {
+            uint8_t x = block_data[j + 2];  // j+2 to skip the scale bytes.
+            // Original data is int8_t, so flip the sign bit to map it to uint8_t;
+            // the resulting -128 offset is folded into the bias.
+            x ^= 1 << 7;
+            weights[i * weights_per_block + j] = x;
+        }
+    }
+}
+
+void unpack_256_4(const uint8_t* data, uint8_t* dst) {
+    // Initialize the output array with zeros
+    std::fill_n(dst, 128, 0);
+
+    for (size_t i = 0; i < 4; ++i) {
+        for (int j = 0; j < 32; ++j) {
+            uint8_t x = (data[i * 32 + j] & 0x0F);
+            uint8_t y = (data[i * 32 + j] >> 4);
+            if (j % 2 != 0) {
+                x <<= 4;
+                y <<= 4;
+            }
+            dst[i * 32 + j / 2] |= x;
+            dst[i * 32 + 16 + j / 2] |= y;  // The last 16 weights go into the higher bits.
+        }
+    }
+}
+
+void extract_q4_k_data(const ggml_tensor* tensor,
+                       ov::Tensor& weights_arr,
+                       ov::Tensor& scales_arr,
+                       ov::Tensor& biases_arr) {
+    const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
+    // TODO tensor->nb[3]
+    const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
+    auto data = static_cast<uint8_t*>(tensor->data);
+    auto weights = static_cast<uint8_t*>(weights_arr.data());
+    auto scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+
+    ov::parallel_for(n_super_block, [&](size_t i) {
+        uint8_t* block_data = data + i * bytes_per_block;
+
+        // Extract scale factors and offsets
+        float scale_scales = static_cast<float>(ov::float16::from_bits(*((uint16_t*)block_data)));
+        float scale_biases = static_cast<float>(ov::float16::from_bits(*((uint16_t*)block_data + 1)));
+
+        // Extract qs1 and qs2
+        uint8_t* qs1 = block_data + 4;
+        uint8_t* qs2 = block_data + 16;
+
+        scales[i * 8] = ov::float16(scale_scales * static_cast<float>((*(qs1) & 0b111111)));
+        scales[i * 8 + 1] = ov::float16(scale_scales * static_cast<float>((*(qs1 + 1) & 0b111111)));
+        scales[i * 8 + 2] = ov::float16(scale_scales * static_cast<float>((*(qs1 + 2) & 0b111111)));
+        scales[i * 8 + 3] = ov::float16(scale_scales * static_cast<float>((*(qs1 + 3) & 0b111111)));
+        scales[i * 8 + 4] =
+            ov::float16(scale_scales * static_cast<float>((*(qs1 + 8) & 0b00001111) | ((*(qs1) >> 6) << 4)));
+        scales[i * 8 + 5] =
+            ov::float16(scale_scales * static_cast<float>((*(qs1 + 9) & 0b00001111) | ((*(qs1 + 1) >> 6) << 4)));
+        scales[i * 8 + 6] =
+            ov::float16(scale_scales * static_cast<float>((*(qs1 + 10) & 0b00001111) | ((*(qs1 + 2) >> 6) << 4)));
+        scales[i * 8 + 7] =
+            ov::float16(scale_scales * static_cast<float>((*(qs1 + 11) & 0b00001111) | ((*(qs1 + 3) >> 6) << 4)));
+
+        biases[i * 8] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 4) & 0b111111)));
+        biases[i * 8 + 1] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 5) & 0b111111)));
+        biases[i * 8 + 2] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 6) & 0b111111)));
+        biases[i * 8 + 3] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 7) & 0b111111)));
+        biases[i * 8 + 4] =
+            ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 8) >> 4) | ((*(qs1 + 4) >> 6) << 4)));
+        biases[i * 8 + 5] =
+            ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 9) >> 4) | ((*(qs1 + 5) >> 6) << 4)));
+        biases[i * 8 + 6] =
+            ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4)));
+        biases[i * 8 + 7] =
+            ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4)));
+        unpack_256_4(block_data + 16, weights + i * 128);
+    });
+}
+
+void extract_q6_k_data(const ggml_tensor* tensor,
+                       ov::Tensor& weights_arr,
+                       ov::Tensor& scales_arr,
+                       ov::Tensor& biases_arr) {
+    const uint64_t bytes_per_block = 128 + 64 + 16 + 2;
+    const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;
+    auto data = static_cast<uint8_t*>(tensor->data);
+    auto weights = static_cast<uint8_t*>(weights_arr.data());
+    auto scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    auto biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    // std::string name(tensor.name, tensor.namelen);
+    for (int64_t i = 0; i < n_super_block; i++) {
+        uint8_t* block_data = data + i * bytes_per_block;
+
+        float scale_factor =
+            static_cast<float>(ov::float16::from_bits(*((uint16_t*)block_data + 104)));  // (128+64+16)/2
+
+        for (size_t j = 0; j < 16; j++) {
+            scales[j + i * 16] =
+                ov::float16(scale_factor * static_cast<float>(*((int8_t*)(block_data + 128 + 64 + j))));
+            biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
+        }
+
+        // Extract ql and qh
+        uint8_t* ql = block_data;
+        uint8_t* qh = block_data + 128;
+
+        // Extract weights
+        for (int64_t j = 0; j < 32; ++j) {
+            weights[i * 256 + j] = (ql[j] & 0xF) | (((qh[j] >> 0) & 3) << 4);
+            weights[i * 256 + j + 32] = (ql[32 + j] & 0xF) | (((qh[j] >> 2) & 3) << 4);
+            weights[i * 256 + j + 64] = (ql[j] >> 4) | (((qh[j] >> 4) & 3) << 4);
+            weights[i * 256 + j + 96] = (ql[32 + j] >> 4) | (((qh[j] >> 6) & 3) << 4);
+            weights[i * 256 + j + 128] = (ql[64 + j] & 0xF) | (((qh[32 + j] >> 0) & 3) << 4);
+            weights[i * 256 + j + 160] = (ql[96 + j] & 0xF) | (((qh[32 + j] >> 2) & 3) << 4);
+            weights[i * 256 + j + 192] = (ql[64 + j] >> 4) | (((qh[32 + j] >> 4) & 3) << 4);
+            weights[i * 256 + j + 224] = (ql[96 + j] >> 4) | (((qh[32 + j] >> 6) & 3) << 4);
+        }
+    }
+}
+
+// TODO Reorder for make_intX_weights
+
+ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
+
+    // Reshape weight to (num_heads, -1, group_size)
+    ov::Shape orig_shape = weight.get_shape();
+    orig_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t);
+    size_t num_groups = orig_shape[1] / group_size;
+
+    // Expand dimensions for scales and biases
+    auto scale_shape = scales.get_shape();
+    scale_shape.push_back(1);
+    scales.set_shape(scale_shape);
+    biases.set_shape(scale_shape);
+
+    // Create graph nodes
+    auto weights_node = std::make_shared<ov::op::v0::Constant>(
+        ov::element::u8, ov::Shape{orig_shape[0], num_groups, group_size}, static_cast<uint8_t*>(weight.data()), nullptr);
+    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+    auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
+    ov::Tensor biases_u8(ov::element::u8, scale_shape);
+
+    // Calculate zero point
+    const ov::float16* bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    const ov::float16* scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    uint8_t* bias_u8_data = biases_u8.data<uint8_t>();
+    for (size_t i = 0; i < biases_u8.get_size(); ++i) {
+        bias_u8_data[i] =
+            (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
+    }
+
+    auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
+
+    // Quantization operations
+    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+    auto zero_point_f16 = std::make_shared<ov::op::v0::Convert>(zero_point, ov::element::f16);
+
+    auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_point_f16, ov::op::AutoBroadcastType::NUMPY);
+    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+
+    // Reshape back to original dimensions
+    auto final_shape =
+        std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{orig_shape.size()}, orig_shape);
+    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+}
+
+ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight, ov::Tensor& scales, ov::Tensor& biases, size_t group_size) {
+
+    // Convert weight to a uint8 view and adjust the shape
+    ov::Shape orig_weight_shape = weight.get_shape();
+    orig_weight_shape[1] *= sizeof(uint32_t) / sizeof(uint8_t) * 2;  // twice as many columns in the 4-bit representation
+
+    // Expand dimensions for scales and biases
+    ov::Shape scale_bias_shape = scales.get_shape();
+    scale_bias_shape.push_back(1);  // Add a new axis at the end
+    scales.set_shape(scale_bias_shape);
+    biases.set_shape(scale_bias_shape);
+
+    // Create the INT4 weight tensor
+    ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};
+
+    auto weights_node = std::make_shared<ov::op::v0::Constant>(
+        ov::element::u4, packed_shape, static_cast<uint8_t*>(weight.data()), nullptr);
+    weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
+    auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);
+
+    // Pack zero points: two subsequent values into one byte
+    const ov::float16* bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    const ov::float16* scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
+    ov::Tensor zero_point_tensor(ov::element::u4, scale_bias_shape);
+    uint8_t* zero_point_data = static_cast<uint8_t*>(zero_point_tensor.data());
+    for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
+        uint8_t bias1 =
+            (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
+        uint8_t bias2 = (uint8_t)std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
+                                            static_cast<float>(scale_data[i * 2 + 1]));
+        zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
+    }
+
+    // CVS-166438: a GGUF Q4_0 zero-point array (U4) whose values are all identical (8) is converted to a single U4
+    // scalar by the ConvertU4WeightsZeroPointToScalar transformation. The CPU plugin handles this corner case
+    // properly, but it triggers a compilation error on the GPU plugin. Temporary WA: nudge one element so the
+    // zero-point array keeps its shape for the GPU plugin; confirmed to have no accuracy impact on final LLM
+    // generation results.
+    zero_point_data[0] += 1;
+
+    auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zero_point_tensor);
+    auto zero_points_f16 = std::make_shared<ov::op::v0::Convert>(zero_points_node, ov::element::f16);
+
+    auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
+
+    // Perform dequantization
+    auto w_zp = std::make_shared<ov::op::v1::Subtract>(weights_f16, zero_points_f16, ov::op::AutoBroadcastType::NUMPY);
+
+    auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(w_zp, scales_f16, ov::op::AutoBroadcastType::NUMPY);
+
+    // Reshape back to the original shape
+    auto final_shape = std::make_shared<ov::op::v0::Constant>(
+        ov::element::i64, ov::Shape{orig_weight_shape.size()}, orig_weight_shape);
+
+    auto w_zp_s_r = std::make_shared<ov::op::v1::Reshape>(w_zp_s, final_shape, false);
+
+    return std::make_shared<ov::op::v0::Convert>(w_zp_s_r, ov::element::f32);
+}
diff --git a/ggml/src/ggml-openvino/ggml-quant.hpp b/ggml/src/ggml-openvino/ggml-quant.hpp
new file mode 100644
index 0000000000..9c0dd89a95
--- /dev/null
+++ b/ggml/src/ggml-openvino/ggml-quant.hpp
@@ -0,0 +1,44 @@
+#include <openvino/core/node_output.hpp>
+#include <openvino/runtime/tensor.hpp>
+
+#include "ggml.h"
+
+void unpack_32_4(const uint8_t* data, uint8_t* dst);
+
+void extract_q4_0_data(const ggml_tensor* tensor,
+                       ov::Tensor& weights_arr,
+                       ov::Tensor& scales_arr,
+                       ov::Tensor& biases_arr);
+
+void extract_q4_1_data(const ggml_tensor* tensor,
+                       ov::Tensor& weights_arr,
+                       ov::Tensor& scales_arr,
+                       ov::Tensor& biases_arr);
+
+void extract_q8_0_data(const ggml_tensor* tensor,
+                       ov::Tensor& weights_arr,
+                       ov::Tensor& scales_arr,
+                       ov::Tensor& biases_arr);
+
+void unpack_256_4(const uint8_t* data, uint8_t* dst);
+
+void extract_q4_k_data(const ggml_tensor* tensor,
+                       ov::Tensor& weights_arr,
+                       ov::Tensor& scales_arr,
+                       ov::Tensor& biases_arr);
+
+void extract_q6_k_data(const ggml_tensor* tensor,
+                       ov::Tensor& weights_arr,
+                       ov::Tensor& scales_arr,
+                       ov::Tensor& biases_arr);
+
+static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32;
+
+ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight,
+                                       ov::Tensor& scales,
+                                       ov::Tensor& biases,
+                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
+
+ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
+                                       ov::Tensor& scales,
+                                       ov::Tensor& biases,
+                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);
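
Each extract_*_data helper above re-expresses a ggml block format as unsigned weights plus a per-block scale and bias, and make_int8_weights / make_int4_weights rebuild the values in the graph as scale * (weight - zero_point) with zero_point = round(-bias / scale). The following is a minimal standalone sketch of the two identities this decomposition relies on for Q8_0 (zero point 128) and Q4_0 (zero point 8); it is illustrative only, not part of the patch, and the sample scale value is arbitrary.

// Standalone sketch: check that the (unsigned weight, scale, zero point) decomposition
// reproduces ggml's reference decoding for Q8_0 and Q4_0. Any C++11 compiler will do.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const float d = 0.0375f;  // stand-in for the per-block fp16 scale stored in the GGUF block

    // Q8_0: ggml decodes x = d * q with signed q. extract_q8_0_data flips the sign bit
    // (q ^ 0x80 == q + 128 as unsigned) and sets bias = -128 * d, so the graph computes
    // d * (u - 128), which is the same value.
    for (int q = -128; q <= 127; ++q) {
        const float reference = d * static_cast<float>(q);
        const uint8_t u = static_cast<uint8_t>(q) ^ 0x80;
        const float rebuilt = d * (static_cast<float>(u) - 128.0f);
        assert(std::fabs(reference - rebuilt) < 1e-6f);
    }

    // Q4_0: ggml decodes x = d * (nibble - 8). extract_q4_0_data keeps the raw nibble and sets
    // bias = -8 * d, so the zero point becomes round(-bias / d) = 8 and the graph again
    // computes d * (nibble - 8).
    for (int nibble = 0; nibble < 16; ++nibble) {
        const float reference = d * static_cast<float>(nibble - 8);
        const float rebuilt = d * (static_cast<float>(nibble) - 8.0f);
        assert(std::fabs(reference - rebuilt) < 1e-6f);
    }

    std::puts("Q8_0 and Q4_0 zero-point identities hold.");
    return 0;
}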