Extract zp directly instead of bias
commit 0ee7e05485
parent b6c0697d10
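For reference, the change replaces the per-block F16 "bias" that the extraction code used to emit with an integer zero point (zp) stored in the same U4/U8 element type as the weights. A minimal sketch of the relationship the hunks below rely on (the helper name is illustrative and not part of the commit):

    // Dequantization, old vs. new representation:
    //   old: x = scale * q + bias      (bias stored as F16 per block)
    //   new: x = scale * (q - zp)      (zp stored as U4/U8, same type as the weights)
    //   =>   bias = -scale * zp,  zp = round(-bias / scale)
    // For the symmetric ggml formats the zero point is a fixed constant:
    //   Q4_0 -> 8, Q8_0 -> 128, Q6_K -> 32
    #include <cmath>
    #include <cstdint>

    static inline uint8_t zp_from_bias(float bias, float scale) {
        return scale != 0.0f ? static_cast<uint8_t>(std::lround(-bias / scale)) : 0;
    }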
@@ -508,10 +508,10 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex;
// static std::mutex weights_mutex;
auto * nodes = cgraph->nodes;
auto n_nodes = cgraph->n_nodes;
std::for_each(std::execution::par, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
std::for_each(std::execution::seq, nodes, nodes + n_nodes, [&](ggml_tensor * node) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
auto * src = node->src[i];
if (src == nullptr) {

@@ -522,21 +522,26 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
if (!src->view_src) {
ggml_backend_buffer * buffer = src->buffer;
if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS || ggml_is_quantized(src->type)) {
bool should_create = false;
{
std::lock_guard<std::mutex> lock(weights_mutex);
if (model_weights.find(src_name) == model_weights.end()) {
model_weights[src_name] = nullptr;
should_create = true;
}
}
if (should_create) {
// bool should_create = false;
// {
// std::lock_guard<std::mutex> lock(weights_mutex);
// if (model_weights.find(src_name) == model_weights.end()) {
// model_weights[src_name] = nullptr;
// should_create = true;
// }
// }
// if (should_create) {
// auto weight_node = create_weight_node(src);
// weight_node->set_friendly_name(src_name);
// {
// std::lock_guard<std::mutex> lock(weights_mutex);
// model_weights[src_name] = weight_node;
// }
// }
if (model_weights.find(src_name) == model_weights.end()) {
auto weight_node = create_weight_node(src);
weight_node->set_friendly_name(src_name);
{
std::lock_guard<std::mutex> lock(weights_mutex);
model_weights[src_name] = weight_node;
}
model_weights[src_name] = weight_node;
}
}
}
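The hunk above also switches the traversal from std::execution::par to std::execution::seq and retires the weights_mutex handshake. With sequential execution the lambda runs on the calling thread only, so plain std::map insertion is safe. A self-contained illustration of that point, using stand-in types rather than the decoder's own:

    #include <algorithm>
    #include <execution>
    #include <map>
    #include <memory>
    #include <string>
    #include <vector>

    struct FakeNode { std::string name; };

    void build_weights(std::vector<FakeNode> & nodes,
                       std::map<std::string, std::shared_ptr<int>> & weights) {
        // seq runs the body one element at a time on this thread, so no mutex
        // is needed around the map lookups and insertions.
        std::for_each(std::execution::seq, nodes.begin(), nodes.end(), [&](FakeNode & n) {
            if (weights.find(n.name) == weights.end()) {
                weights[n.name] = std::make_shared<int>(0);
            }
        });
    }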

@@ -209,12 +209,12 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.is_requant = true;
layout.requant_type = requant_type;

// Special case: requant to F16 - just store F16 weights, no scales/biases
// Special case: requant to F16 - just store F16 weights, no scales/zp
if (requant_type.value() == ExtraQuantType::F16) {
layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes
layout.total_size = layout.weights_size;
layout.weights_offset = 0;
// No scales/biases for F16
// No scales/zp for F16
return layout;
}

@@ -255,14 +255,15 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
// For symmetric quantization, we only need one bias value (not one per block)
layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
// For symmetric quantization, we only need one zp value (not one per block)
// Zero points are stored in U4 or U8 format matching the weight type
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;

layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset =
layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.zp_offset + layout.zp_size;
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
return layout;
}

@@ -305,17 +306,19 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;

// Scales and biases: F16 per block
// Scales: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// For symmetric quantization, we only need one bias value (not one per block)
layout.biases_size = layout.is_symmetric ? sizeof(uint16_t) : n_blocks * sizeof(uint16_t);
// Zero points: U4 or U8 matching weight type
// For symmetric quantization, we only need one zp value (not one per block)
size_t n_zp_elements = layout.is_symmetric ? 1 : n_blocks;
layout.zp_size = layout.is_u4 ? ((n_zp_elements + 1) / 2) : n_zp_elements;

// Layout in buffer: [weights | scales | biases] with alignment
// Layout in buffer: [weights | scales | zp] with alignment
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.zp_offset + layout.zp_size;

return layout;
}
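A worked example of the layout arithmetic above, assuming an illustrative 4096x4096 Q4_0 tensor (symmetric, U4 weights, 32 weights per block) and a hypothetical 64-byte alignment; the helper mirrors the rounding expressions in the hunk but is not part of the commit:

    #include <algorithm>
    #include <cstddef>

    struct ExampleLayout { size_t weights_off, scales_off, zp_off, total; };

    ExampleLayout example_layout(size_t weights_size, size_t scales_size, size_t zp_size,
                                 size_t alignment, size_t original_nbytes) {
        ExampleLayout l{};
        l.weights_off = 0;
        l.scales_off  = ((weights_size + alignment - 1) / alignment) * alignment;
        l.zp_off      = l.scales_off + ((scales_size + alignment - 1) / alignment) * alignment;
        l.total       = std::max(l.zp_off + zp_size, original_nbytes);
        return l;
    }

    // 4096 * 4096 = 16,777,216 elements, 524,288 blocks of 32:
    //   weights_size = 16,777,216 / 2 = 8,388,608 bytes (U4)
    //   scales_size  = 524,288 * 2    = 1,048,576 bytes (F16 per block)
    //   zp_size      = (1 + 1) / 2    = 1 byte (single U4 zp, symmetric)
    //   scales_off = 8,388,608, zp_off = 9,437,184, total = 9,437,185
    //   (ggml_nbytes for Q4_0 is 524,288 * 18 = 9,437,184, so the max() keeps 9,437,185)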

@@ -110,16 +110,19 @@ struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
: ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}
};

// Extra data for quantized weight tensors - stores extracted weights/scales/biases and ov::Constant
// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant
struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
ov::Tensor weights; // U4 or U8 extracted weights
ov::Tensor scales; // F16 scales
ov::Tensor biases; // F16 biases (zero points)
ov::Tensor zp; // U4 or U8 zero points (same type as weights)
std::shared_ptr<ov::Node> constant; // Pre-built OpenVINO weight subgraph

ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor b, std::shared_ptr<ov::Node> c)
: ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
weights(std::move(w)), scales(std::move(s)), biases(std::move(b)), constant(std::move(c)) {}
ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> c) :
ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
weights(std::move(w)),
scales(std::move(s)),
zp(std::move(z)),
constant(std::move(c)) {}
};

// Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request

@@ -133,7 +136,7 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
// =====================================================
// Extracted Size Calculation for Quantized Tensors
// =====================================================
// For quantized tensors, we need extra space to store extracted weights, scales, and biases.
// For quantized tensors, we need extra space to store extracted weights, scales, and zero points.
// Returns the total size needed in the buffer for extracted data.

struct ggml_openvino_extracted_layout {

@@ -142,10 +145,10 @@ struct ggml_openvino_extracted_layout {
size_t weights_size; // Size of weights in bytes
size_t scales_offset; // Offset to scales in buffer
size_t scales_size; // Size of scales in bytes
size_t biases_offset; // Offset to biases in buffer
size_t biases_size; // Size of biases in bytes
size_t zp_offset; // Offset to zero points in buffer
size_t zp_size; // Size of zero points in bytes (U4 or U8)
bool is_u4; // true for U4 weights, false for U8
int64_t weights_per_block;// weights per scale/bias block
int64_t weights_per_block; // weights per scale/zp block
bool is_symmetric; // true for symmetric quantization

// Requantization info

@@ -259,13 +259,15 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
// zp shape: scalar for symmetric, per-block for asymmetric
ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;

ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset);

auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
std::move(biases), constant);
std::move(zp), constant);
ctx->tensor_extras[tensor] = extra;
tensor->extra = extra;

@@ -487,10 +489,9 @@ static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buff
if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) {
ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor);
if (layout.total_size > 0) {
GGML_LOG_DEBUG(
"%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu biases=%zu)\n",
__func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, layout.scales_size,
layout.biases_size);
GGML_LOG_DEBUG("%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu zp=%zu)\n",
__func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size,
layout.scales_size, layout.zp_size);
return layout.total_size;
}
}

@@ -42,80 +42,97 @@ void unpack_32_4(const uint8_t * data, uint8_t * dst) {
}
}

// Extracts (weight, scales, biases) from Q4_0 tensors.
// Extracts (weight, scales, zp) from Q4_0 tensors.
// Data layout is: |16 bit scale|32 x 4bit weights|.
void extract_q4_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
ov::Tensor & zp_arr) {
const uint64_t bytes_per_block = 18; // 2 bytes scale, 32x0.5 byte weights

auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());

bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization

// For Q4_0, zero point is always 8
if (is_scalar_zp) {
zp[0] = 8 | (8 << 4); // Pack two 4-bit values
}

ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
// For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
if (is_scalar_bias) {
if (i == 0) {
biases[0] = ov::float16(-8.f * static_cast<float>(scales[0]));
// For asymmetric quantization, compute per-block zero points
if (!is_scalar_zp) {
// Pack two 4-bit zero points per byte
if (i % 2 == 0) {
zp[i / 2] = 8; // Lower nibble
} else {
zp[i / 2] |= (8 << 4); // Upper nibble
}
} else {
biases[i] = ov::float16(-8.f * static_cast<float>(scales[i]));
}
unpack_32_4(data + i * bytes_per_block + 2, weights + i * 16);
});
}

// Extracts (weight, scales, biases) from Q4_1 tensors.
// Data layout is: |16 bit scale|16 bit bias|32 x 4bit weights|.
// Extracts (weight, scales, zp) from Q4_1 tensors.
// Data layout is: |16 bit scale|16 bit min|32 x 4bit weights|.
void extract_q4_1_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes bias, 32x0.5 byte weights
ov::Tensor & zp_arr) {
const uint64_t bytes_per_block = 20; // 2 bytes scale, 2 bytes min, 32x0.5 byte weights

auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());

ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
scales[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block)));
biases[i] = ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2)));
float scale = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block))));
float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) (data + i * bytes_per_block + 2))));
scales[i] = ov::float16(scale);
// zp = -min / scale (bias = min, so zp = -bias/scale)
uint8_t zp_val = (scale != 0.0f) ? (uint8_t) std::round(-min / scale) : 0;
// Pack two 4-bit zero points per byte
if (i % 2 == 0) {
zp[i / 2] = zp_val & 0x0F; // Lower nibble
} else {
zp[i / 2] |= (zp_val << 4); // Upper nibble
}
unpack_32_4(data + i * bytes_per_block + 4, weights + i * 16);
});
}

// Extracts (weight, scales, biases) from Q8_0 tensors.
// Extracts (weight, scales, zp) from Q8_0 tensors.
// Data layout is: |16 bit scale|32 x 8bit weights|.
void extract_q8_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
ov::Tensor & zp_arr) {
const uint64_t weights_per_block = 32;
const uint64_t bytes_per_block = 34; // 2 bytes scale, 32x1 byte weights

auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());

bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization

// For Q8_0, zero point is always 128
if (is_scalar_zp) {
zp[0] = 128;
}

ov::parallel_for(scales_arr.get_size(), [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;
scales[i] = ov::float16::from_bits(*(uint16_t *) block_data);
// For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
if (is_scalar_bias) {
if (i == 0) {
biases[0] = ov::float16(-128.f * static_cast<float>(scales[0]));
}
} else {
biases[i] = ov::float16(-128.f * static_cast<float>(scales[i]));
// For asymmetric quantization, store per-block zero points
if (!is_scalar_zp) {
zp[i] = 128;
}
for (size_t j = 0; j < weights_per_block; ++j) {
uint8_t x = block_data[j + 2]; // j+2 to skip the scale bytes.

@@ -147,51 +164,60 @@ void unpack_256_4(const uint8_t * data, uint8_t * dst) {
void extract_q4_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
ov::Tensor & zp_arr) {
const uint64_t bytes_per_block = 2 + 2 + 12 + 128;
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;

auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());

ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;

// Extract scale factors and offsets
float scale_scales = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
float scale_biases = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));
float scale_mins = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));

// Extract qs1 and qs2
uint8_t * qs1 = block_data + 4;
// uint8_t* qs2 = block_data + 16;

scales[i * 8] = ov::float16(scale_scales * static_cast<float>((*(qs1) & 0b111111)));
scales[i * 8 + 1] = ov::float16(scale_scales * static_cast<float>((*(qs1 + 1) & 0b111111)));
scales[i * 8 + 2] = ov::float16(scale_scales * static_cast<float>((*(qs1 + 2) & 0b111111)));
scales[i * 8 + 3] = ov::float16(scale_scales * static_cast<float>((*(qs1 + 3) & 0b111111)));
scales[i * 8 + 4] =
ov::float16(scale_scales * static_cast<float>((*(qs1 + 8) & 0b00001111) | ((*(qs1) >> 6) << 4)));
scales[i * 8 + 5] =
ov::float16(scale_scales * static_cast<float>((*(qs1 + 9) & 0b00001111) | ((*(qs1 + 1) >> 6) << 4)));
scales[i * 8 + 6] =
ov::float16(scale_scales * static_cast<float>((*(qs1 + 10) & 0b00001111) | ((*(qs1 + 2) >> 6) << 4)));
scales[i * 8 + 7] =
ov::float16(scale_scales * static_cast<float>((*(qs1 + 11) & 0b00001111) | ((*(qs1 + 3) >> 6) << 4)));
// Calculate scales
float scale_vals[8];
scale_vals[0] = scale_scales * static_cast<float>((*(qs1) & 0b111111));
scale_vals[1] = scale_scales * static_cast<float>((*(qs1 + 1) & 0b111111));
scale_vals[2] = scale_scales * static_cast<float>((*(qs1 + 2) & 0b111111));
scale_vals[3] = scale_scales * static_cast<float>((*(qs1 + 3) & 0b111111));
scale_vals[4] = scale_scales * static_cast<float>((*(qs1 + 8) & 0b00001111) | ((*(qs1) >> 6) << 4));
scale_vals[5] = scale_scales * static_cast<float>((*(qs1 + 9) & 0b00001111) | ((*(qs1 + 1) >> 6) << 4));
scale_vals[6] = scale_scales * static_cast<float>((*(qs1 + 10) & 0b00001111) | ((*(qs1 + 2) >> 6) << 4));
scale_vals[7] = scale_scales * static_cast<float>((*(qs1 + 11) & 0b00001111) | ((*(qs1 + 3) >> 6) << 4));

biases[i * 8] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 4) & 0b111111)));
biases[i * 8 + 1] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 5) & 0b111111)));
biases[i * 8 + 2] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 6) & 0b111111)));
biases[i * 8 + 3] = ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 7) & 0b111111)));
biases[i * 8 + 4] =
ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 8) >> 4) | ((*(qs1 + 4) >> 6) << 4)));
biases[i * 8 + 5] =
ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 9) >> 4) | ((*(qs1 + 5) >> 6) << 4)));
biases[i * 8 + 6] =
ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4)));
biases[i * 8 + 7] =
ov::float16(-1.f * scale_biases * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4)));
// Calculate min values (bias = -min)
float min_vals[8];
min_vals[0] = scale_mins * static_cast<float>((*(qs1 + 4) & 0b111111));
min_vals[1] = scale_mins * static_cast<float>((*(qs1 + 5) & 0b111111));
min_vals[2] = scale_mins * static_cast<float>((*(qs1 + 6) & 0b111111));
min_vals[3] = scale_mins * static_cast<float>((*(qs1 + 7) & 0b111111));
min_vals[4] = scale_mins * static_cast<float>((*(qs1 + 8) >> 4) | ((*(qs1 + 4) >> 6) << 4));
min_vals[5] = scale_mins * static_cast<float>((*(qs1 + 9) >> 4) | ((*(qs1 + 5) >> 6) << 4));
min_vals[6] = scale_mins * static_cast<float>((*(qs1 + 10) >> 4) | ((*(qs1 + 6) >> 6) << 4));
min_vals[7] = scale_mins * static_cast<float>((*(qs1 + 11) >> 4) | ((*(qs1 + 7) >> 6) << 4));

// Store scales and compute zero points
for (int j = 0; j < 8; j++) {
scales[i * 8 + j] = ov::float16(scale_vals[j]);
// zp = min / scale (since bias = -min and zp = -bias/scale)
uint8_t zp_val = (scale_vals[j] != 0.0f) ? (uint8_t) std::round(min_vals[j] / scale_vals[j]) : 0;
// Pack two 4-bit zero points per byte
size_t idx = i * 8 + j;
if (idx % 2 == 0) {
zp[idx / 2] = zp_val & 0x0F;
} else {
zp[idx / 2] |= (zp_val << 4);
}
}
unpack_256_4(block_data + 16, weights + i * 128);
});
}
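For the asymmetric formats (Q4_1 above, Q4_K here, Q5_K below) the zero point is derived from the per-block scale and min rather than being a fixed constant, and U4 zero points are packed two per byte. A small self-contained sketch of both steps, with illustrative names and values (not part of the commit):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Worked example for an asymmetric block (Q4_1 style): scale = 0.05, min = -0.40
    //   zp = round(-min / scale) = round(0.40 / 0.05) = 8
    //   q = 11:  old form  0.05 * 11 + (-0.40) = 0.15
    //            new form  0.05 * (11 - 8)     = 0.15
    // U4 zero points are packed two per byte, even block in the low nibble,
    // odd block in the high nibble, matching the hunks above. Values are assumed to fit 4 bits.
    static void pack_u4(const std::vector<uint8_t> & vals, std::vector<uint8_t> & out) {
        out.assign((vals.size() + 1) / 2, 0);
        for (std::size_t i = 0; i < vals.size(); ++i) {
            if (i % 2 == 0) {
                out[i / 2] = vals[i] & 0x0F;
            } else {
                out[i / 2] |= static_cast<uint8_t>(vals[i] << 4);
            }
        }
    }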

@@ -199,16 +225,21 @@ void extract_q4_k_data(const ggml_tensor * tensor,
void extract_q6_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
ov::Tensor & zp_arr) {
const uint64_t bytes_per_block = 128 + 64 + 16 + 2;
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;

auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());

bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization

// For Q6_K, zero point is always 32
if (is_scalar_zp) {
zp[0] = 32;
}

ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;

@@ -219,13 +250,9 @@ void extract_q6_k_data(const ggml_tensor * tensor,
for (size_t j = 0; j < 16; j++) {
scales[j + i * 16] =
ov::float16(scale_factor * static_cast<float>(*((int8_t *) (block_data + 128 + 64 + j))));
// For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
if (is_scalar_bias) {
if (i == 0 && j == 0) {
biases[0] = ov::float16(-32.f * static_cast<float>(scales[0]));
}
} else {
biases[j + i * 16] = ov::float16(-32.f * static_cast<float>(scales[j + i * 16]));
// For asymmetric quantization, store per-block zero points
if (!is_scalar_zp) {
zp[j + i * 16] = 32;
}
}
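Q6_K stores signed 6-bit weights; after extraction to unsigned U8 the implied offset is 32, which is why the removed bias was -32 * scale and the new zero point is the constant 32. A one-line dequantization sketch under that reading (illustrative, not part of the commit):

    #include <cstdint>

    // q_u8 in [0, 63] is the extracted unsigned weight; the original signed value is q_u8 - 32,
    // so:  x = scale * (q_u8 - 32) == scale * q_u8 + (-32 * scale)
    static inline float dequant_q6_k(float scale, uint8_t q_u8) {
        return scale * static_cast<float>(static_cast<int>(q_u8) - 32);
    }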

@@ -258,20 +285,20 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t * d, uint8
void extract_q5_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr) {
ov::Tensor & zp_arr) {
const uint64_t bytes_per_block = 4 + 12 + 32 + 128;
const uint64_t n_super_block = tensor->nb[3] / bytes_per_block;

auto * data = static_cast<uint8_t *>(tensor->data);
auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());

ov::parallel_for(n_super_block, [&](size_t i) {
uint8_t * block_data = data + i * bytes_per_block;

const float d = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data)));
const float min = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));
const float min_factor = static_cast<float>(ov::float16::from_bits(*((uint16_t *) block_data + 1)));

const uint8_t * scales_data = block_data + 4; // 12 bytes of scales
const uint8_t * qh = block_data + 4 + 12; // 32 bytes of high bits

@@ -289,17 +316,18 @@ void extract_q5_k_data(const ggml_tensor * tensor,
// Get scale and min for first 32 elements
get_scale_min_k4(is + 0, scales_data, &sc, &m);
const float d1 = d * sc;
const float m1 = min * m;
const float m1 = min_factor * m;

// Get scale and min for second 32 elements
get_scale_min_k4(is + 1, scales_data, &sc, &m);
const float d2 = d * sc;
const float m2 = min * m;
const float m2 = min_factor * m;

scales[i * 8 + is] = ov::float16(d1);
biases[i * 8 + is] = ov::float16(-m1);
scales[i * 8 + is + 1] = ov::float16(d2);
biases[i * 8 + is + 1] = ov::float16(-m2);
// zp = min / scale (since bias = -min and zp = -bias/scale)
zp[i * 8 + is] = (d1 != 0.0f) ? (uint8_t) std::round(m1 / d1) : 0;
zp[i * 8 + is + 1] = (d2 != 0.0f) ? (uint8_t) std::round(m2 / d2) : 0;

// Extract weights for first 32 elements (matching deq formula exactly)
for (int l = 0; l < 32; ++l) {

@@ -321,16 +349,13 @@ void extract_q5_k_data(const ggml_tensor * tensor,

// TODO Reorder for make_intX_weights

ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
ov::Tensor & scales,
ov::Tensor & biases,
size_t group_size) {
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) {
ov::Shape orig_shape = weight.get_shape();

// Expand dimensions for scales and biases
// Expand dimensions for scales and zp
auto scale_shape = scales.get_shape();
auto bias_shape = biases.get_shape();
bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization
auto zp_shape = zp.get_shape();
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization

ov::Shape packed_shape = {orig_shape[0], orig_shape[1] / group_size, group_size};

@@ -340,10 +365,10 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
} else {
scale_shape.push_back(1);
scales.set_shape(scale_shape);
// For symmetric quantization, biases remain scalar (don't resize)
if (!is_scalar_bias) {
bias_shape = scale_shape;
biases.set_shape(bias_shape);
// For symmetric quantization, zp remains scalar (don't resize)
if (!is_scalar_zp) {
zp_shape.push_back(1);
zp.set_shape(zp_shape);
}
}

@@ -352,26 +377,9 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
static_cast<uint8_t *>(weight.data()), nullptr);
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto scales_f16 = std::make_shared<ov::op::v0::Constant>(scales);
ov::Tensor biases_u8(ov::element::u8, is_scalar_bias ? ov::Shape{} : scale_shape);

// Calculate zero point
const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
uint8_t * bias_u8_data = biases_u8.data<uint8_t>();

if (is_scalar_bias) {
// Symmetric quantization: single bias value for all blocks
// For Q8_0, bias = -128 * scale, so zero_point = 128
bias_u8_data[0] = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[0]) / static_cast<float>(scale_data[0]));
} else {
// Asymmetric quantization: per-block biases
for (size_t i = 0; i < biases_u8.get_size(); ++i) {
bias_u8_data[i] =
(uint8_t) std::round(-1.f * static_cast<float>(bias_data[i]) / static_cast<float>(scale_data[i]));
}
}

auto zero_point = std::make_shared<ov::op::v0::Constant>(biases_u8);
// Zero point is already in U8 format from extraction
auto zero_point = std::make_shared<ov::op::v0::Constant>(zp);
float zp_value;
if (ov::op::util::get_single_value(zero_point, zp_value)) {
zero_point = ov::op::v0::Constant::create(zero_point->get_element_type(), {}, {zp_value});

@@ -395,16 +403,13 @@ ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
}

ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
ov::Tensor & scales,
ov::Tensor & biases,
size_t group_size) {
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight, ov::Tensor & scales, ov::Tensor & zp, size_t group_size) {
ov::Shape orig_weight_shape = weight.get_shape();

// Expand dimensions for scales and biases
ov::Shape scale_bias_shape = scales.get_shape();
auto bias_shape = biases.get_shape();
bool is_scalar_bias = bias_shape.empty(); // Symmetric quantization
// Expand dimensions for scales and zp
ov::Shape scale_shape = scales.get_shape();
auto zp_shape = zp.get_shape();
bool is_scalar_zp = zp_shape.empty(); // Symmetric quantization

// Create INT4 weight tensor
ov::Shape packed_shape = {orig_weight_shape[0], orig_weight_shape[1] / group_size, group_size};

@@ -413,12 +418,12 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
// Requantized channel-wise case
packed_shape.erase(packed_shape.begin() + 1);
} else {
scale_bias_shape.push_back(1);
scales.set_shape(scale_bias_shape);
// For symmetric quantization, biases remain scalar (don't resize)
if (!is_scalar_bias) {
bias_shape = scale_bias_shape;
biases.set_shape(bias_shape);
scale_shape.push_back(1);
scales.set_shape(scale_shape);
// For symmetric quantization, zp remains scalar (don't resize)
if (!is_scalar_zp) {
zp_shape.push_back(1);
zp.set_shape(zp_shape);
}
}

@@ -427,29 +432,8 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
weights_node->get_rt_info()["__gguf_tensor_holder"] = weight;
auto weights_f16 = std::make_shared<ov::op::v0::Convert>(weights_node, ov::element::f16);

// Pack zero points: two subsequent values into one
const ov::float16 * bias_data = biases.data<ov::element_type_traits<ov::element::f16>::value_type>();
const ov::float16 * scale_data = scales.data<ov::element_type_traits<ov::element::f16>::value_type>();
ov::Tensor zero_point_tensor(ov::element::u4, is_scalar_bias ? ov::Shape{} : scale_bias_shape);
uint8_t * zero_point_data = static_cast<uint8_t *>(zero_point_tensor.data());

if (is_scalar_bias) {
// Symmetric quantization: single bias value for all blocks
// For Q4_0, bias = -8 * scale, so zero_point = 8
uint8_t zp = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[0]) / static_cast<float>(scale_data[0]));
zero_point_data[0] = (zp << 4) | (zp & 0x0F);
} else {
// Asymmetric quantization: per-block biases
for (size_t i = 0; i < zero_point_tensor.get_byte_size(); ++i) {
uint8_t bias1 =
(uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2]) / static_cast<float>(scale_data[i * 2]));
uint8_t bias2 = (uint8_t) std::round(-1.f * static_cast<float>(bias_data[i * 2 + 1]) /
static_cast<float>(scale_data[i * 2 + 1]));
zero_point_data[i] = (bias2 << 4) | (bias1 & 0x0F);
}
}

auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zero_point_tensor);
// Zero point is already in U4 format from extraction
auto zero_points_node = std::make_shared<ov::op::v0::Constant>(zp);
float zp_value;
if (ov::op::util::get_single_value(zero_points_node, zp_value)) {
zero_points_node = ov::op::v0::Constant::create(zero_points_node->get_element_type(), {}, {zp_value});
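With the zero point already materialized as a U4/U8 tensor, make_int4_weights and make_int8_weights can wrap it in a Constant directly instead of converting F16 biases on the fly. The rest of the decompression subgraph is not shown in these hunks; the sketch below outlines the usual (weights - zp) * scales pattern the surrounding code presumably builds, with illustrative names and shapes:

    #include <openvino/openvino.hpp>
    #include <openvino/op/constant.hpp>
    #include <openvino/op/convert.hpp>
    #include <openvino/op/multiply.hpp>
    #include <openvino/op/reshape.hpp>
    #include <openvino/op/subtract.hpp>

    // Sketch only: decompress (weights - zp) * scales and restore the original 2-D shape.
    ov::Output<ov::Node> sketch_decompress(const ov::Tensor & weights,  // u4/u8, {rows, cols/g, g}
                                           const ov::Tensor & scales,   // f16,  {rows, cols/g, 1}
                                           const ov::Tensor & zp,       // u4/u8, scalar or {rows, cols/g, 1}
                                           const ov::Shape & orig_shape) {
        auto w  = std::make_shared<ov::op::v0::Constant>(weights);
        auto z  = std::make_shared<ov::op::v0::Constant>(zp);
        auto s  = std::make_shared<ov::op::v0::Constant>(scales);
        auto wf = std::make_shared<ov::op::v0::Convert>(w, ov::element::f16);
        auto zf = std::make_shared<ov::op::v0::Convert>(z, ov::element::f16);
        auto w_zp   = std::make_shared<ov::op::v1::Subtract>(wf, zf);
        auto w_zp_s = std::make_shared<ov::op::v1::Multiply>(w_zp, s);
        auto shape  = ov::op::v0::Constant::create(ov::element::i64, {orig_shape.size()},
                                                   std::vector<int64_t>(orig_shape.begin(), orig_shape.end()));
        auto back   = std::make_shared<ov::op::v1::Reshape>(w_zp_s, shape, false);
        return std::make_shared<ov::op::v0::Convert>(back, ov::element::f32);
    }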

@@ -480,7 +464,7 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
const void * data,
ov::Tensor & weights,
ov::Tensor & scales,
ov::Tensor & biases) {
ov::Tensor & zp) {
// Create a temporary tensor for extraction functions that read from tensor->data
ggml_tensor temp_tensor = *tensor;
temp_tensor.data = const_cast<void *>(data);

@@ -512,22 +496,22 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
// Extract quantized data
switch (tensor->type) {
case GGML_TYPE_Q4_0:
extract_q4_0_data(&temp_tensor, weights, scales, biases);
extract_q4_0_data(&temp_tensor, weights, scales, zp);
break;
case GGML_TYPE_Q4_1:
extract_q4_1_data(&temp_tensor, weights, scales, biases);
extract_q4_1_data(&temp_tensor, weights, scales, zp);
break;
case GGML_TYPE_Q4_K:
extract_q4_k_data(&temp_tensor, weights, scales, biases);
extract_q4_k_data(&temp_tensor, weights, scales, zp);
break;
case GGML_TYPE_Q8_0:
extract_q8_0_data(&temp_tensor, weights, scales, biases);
extract_q8_0_data(&temp_tensor, weights, scales, zp);
break;
case GGML_TYPE_Q6_K:
extract_q6_k_data(&temp_tensor, weights, scales, biases);
extract_q6_k_data(&temp_tensor, weights, scales, zp);
break;
case GGML_TYPE_Q5_K:
extract_q5_k_data(&temp_tensor, weights, scales, biases);
extract_q5_k_data(&temp_tensor, weights, scales, zp);
break;
default:
throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));

@@ -536,9 +520,9 @@ std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
// Create the OpenVINO weight subgraph
ov::Output<ov::Node> weight_node;
if (is_u4) {
weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
weight_node = make_int4_weights(weights, scales, zp, weights_per_block);
} else {
weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
weight_node = make_int8_weights(weights, scales, zp, weights_per_block);
}

auto result = weight_node.get_node_shared_ptr();

@@ -553,7 +537,7 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
int64_t block_size,
ov::Tensor & weights,
ov::Tensor & scales,
ov::Tensor & biases) {
ov::Tensor & zp) {
int64_t n_elements = ggml_nelements(tensor);

// First dequantize to F32

@@ -572,19 +556,19 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128);

if (is_u4) {
quantize_q4_0(weights_f32.data(), weights, scales, biases, n_elements, block_size);
quantize_q4_0(weights_f32.data(), weights, scales, zp, n_elements, block_size);
} else if (requant_type == ExtraQuantType::Q8_1_C) {
quantize_q8_1(weights_f32.data(), weights, scales, biases, n_elements, block_size);
quantize_q8_1(weights_f32.data(), weights, scales, zp, n_elements, block_size);
} else {
quantize_q8_0(weights_f32.data(), weights, scales, biases, n_elements, block_size);
quantize_q8_0(weights_f32.data(), weights, scales, zp, n_elements, block_size);
}

// Create the OpenVINO weight subgraph
ov::Output<ov::Node> weight_node;
if (is_u4) {
weight_node = make_int4_weights(weights, scales, biases, block_size);
weight_node = make_int4_weights(weights, scales, zp, block_size);
} else {
weight_node = make_int8_weights(weights, scales, biases, block_size);
weight_node = make_int8_weights(weights, scales, zp, block_size);
}

auto result = weight_node.get_node_shared_ptr();

@@ -653,50 +637,52 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
} else {
weights = ov::Tensor(ov::element::f16, node_shape);
}
ov::Tensor dummy_scales, dummy_biases; // Not used for F16
result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_biases);
ov::Tensor dummy_scales, dummy_zp; // Not used for F16
result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_zp);
} else {
// Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
// For symmetric quantization, biases are a single value instead of per-block
ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
// For symmetric quantization, zp is a scalar value instead of per-block
// zp uses the same element type as weights (U4 or U8)
ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;

ov::Tensor weights, scales, biases;
ov::Tensor weights, scales, zp;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset);
zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
} else {
weights = ov::Tensor(weight_type, node_shape);
scales = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, bias_shape);
zp = ov::Tensor(weight_type, zp_shape);
}

result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
scales, biases);
scales, zp);
}
} else {
// Normal extraction path (no requant)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
// For symmetric quantization, biases are a single value instead of per-block
ov::Shape bias_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
// For symmetric quantization, zp is a scalar value instead of per-block
// zp uses the same element type as weights (U4 or U8)
ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;

ov::Tensor weights, scales, biases;
ov::Tensor weights, scales, zp;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
biases = ov::Tensor(ov::element::f16, bias_shape, buf_base + layout.biases_offset);
zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
} else {
weights = ov::Tensor(weight_type, node_shape);
scales = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, bias_shape);
zp = ov::Tensor(weight_type, zp_shape);
}

result = extract_quantized_weights(tensor, data, weights, scales, biases);
result = extract_quantized_weights(tensor, data, weights, scales, zp);
}

return result;

@@ -705,7 +691,7 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
void quantize_q4_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk) {
assert(k % qk == 0);

@@ -713,8 +699,13 @@ void quantize_q4_0(const float * x,

auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
auto * zp = static_cast<uint8_t *>(zp_arr.data());
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization

// For Q4_0, zero point is always 8
if (is_scalar_zp) {
zp[0] = 8 | (8 << 4); // Pack two 4-bit values
}

for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max

@@ -732,27 +723,27 @@ void quantize_q4_0(const float * x,

if (d == 0) {
scales[i] = ov::float16(1.0f);
if (is_scalar_bias) {
if (i == 0) {
biases[0] = ov::float16(-8.0f);
// zp is already set to 8 for symmetric, or set per-block for asymmetric
if (!is_scalar_zp) {
if (i % 2 == 0) {
zp[i / 2] = 8;
} else {
zp[i / 2] |= (8 << 4);
}
} else {
biases[i] = ov::float16(-8.0f);
}
uint8_t zp = 8;
memset(weights + i * qk / 2, zp | (zp << 4), qk / 2);
memset(weights + i * qk / 2, 8 | (8 << 4), qk / 2);
continue;
}

const float id = 1.0f / d;
scales[i] = ov::float16(d);
// For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
if (is_scalar_bias) {
if (i == 0) {
biases[0] = ov::float16(-8.f * d);
// For asymmetric quantization, store per-block zero points
if (!is_scalar_zp) {
if (i % 2 == 0) {
zp[i / 2] = 8;
} else {
zp[i / 2] |= (8 << 4);
}
} else {
biases[i] = ov::float16(-8.f * d);
}

for (int j = 0; j < qk / 2; ++j) {

@@ -768,7 +759,7 @@ void quantize_q8_0(const float * x,
void quantize_q8_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk) {
assert(k % qk == 0);

@@ -776,8 +767,13 @@ void quantize_q8_0(const float * x,

auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
bool is_scalar_bias = (biases_arr.get_size() == 1); // Symmetric quantization
auto * zp = static_cast<uint8_t *>(zp_arr.data());
bool is_scalar_zp = (zp_arr.get_size() == 1); // Symmetric quantization

// For Q8_0, zero point is always 128
if (is_scalar_zp) {
zp[0] = 128;
}

for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max

@@ -792,13 +788,9 @@ void quantize_q8_0(const float * x,
const float d = amax / 127.0f;
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
// For symmetric quantization, only write the first bias (all blocks share the same bias relationship)
if (is_scalar_bias) {
if (i == 0) {
biases[0] = ov::float16(-128.0f * d);
}
} else {
biases[i] = ov::float16(-128.0f * d);
// For asymmetric quantization, store per-block zero points
if (!is_scalar_zp) {
zp[i] = 128;
}

for (int j = 0; j < qk; ++j) {

@@ -812,7 +804,7 @@ void quantize_q8_1(const float * x,
void quantize_q8_1(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & biases_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk) {
assert(k % qk == 0);

@@ -820,7 +812,7 @@ void quantize_q8_1(const float * x,

auto * weights = static_cast<uint8_t *>(weights_arr.data());
auto * scales = scales_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * biases = biases_arr.data<ov::element_type_traits<ov::element::f16>::value_type>();
auto * zp = static_cast<uint8_t *>(zp_arr.data());
for (int i = 0; i < nb; i++) {
float min = std::numeric_limits<float>::max();
float max = std::numeric_limits<float>::lowest();

@@ -838,7 +830,8 @@ void quantize_q8_1(const float * x,
const float d = (max - min) / ((1 << 8) - 1);
const float id = d ? 1.0f / d : 0.0f;
scales[i] = ov::float16(d);
biases[i] = ov::float16(min);
// zp = -min / scale (Q8_1 is asymmetric)
zp[i] = (d != 0.0f) ? (uint8_t) std::round(-min / d) : 0;

for (int j = 0; j < qk; ++j) {
const float x0 = (x[i * qk + j] - min) * id;
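quantize_q8_1 is the one asymmetric requant path, so its zero point is computed per block. A worked example of the arithmetic in the hunk above; the final rounding and store of the quantized byte fall outside the hunk, so the helper assumes plain round-to-nearest:

    // Block with min = -1.0, max = 1.55:
    //   d  = (max - min) / 255 = 2.55 / 255 = 0.01
    //   zp = round(-min / d)   = round(100) = 100
    //   x  = 0.23  ->  q = round((x - min) / d) = 123
    //   dequant: d * (q - zp) = 0.01 * (123 - 100) = 0.23
    #include <cmath>
    #include <cstdint>

    static inline uint8_t q8_1_quantize_one(float x, float min, float d) {
        return d != 0.0f ? static_cast<uint8_t>(std::lround((x - min) / d)) : 0;
    }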

@@ -8,52 +8,52 @@

void unpack_32_4(const uint8_t* data, uint8_t* dst);

void extract_q4_0_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr);
void extract_q4_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr);

void extract_q4_1_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr);
void extract_q4_1_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr);

void extract_q8_0_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr);
void extract_q8_0_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr);

void unpack_256_4(const uint8_t* data, uint8_t* dst);

void extract_q4_k_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr);
void extract_q4_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr);

void extract_q5_k_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr);
void extract_q5_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr);

void extract_q6_k_data(const ggml_tensor* tensor,
ov::Tensor& weights_arr,
ov::Tensor& scales_arr,
ov::Tensor& biases_arr);
void extract_q6_k_data(const ggml_tensor * tensor,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr);

static constexpr size_t GGML_QUANTIZATION_GROUP_SIZE = 32;

ov::Output<ov::Node> make_int8_weights(ov::Tensor& weight,
ov::Tensor& scales,
ov::Tensor& biases,
ov::Output<ov::Node> make_int8_weights(ov::Tensor & weight,
ov::Tensor & scales,
ov::Tensor & zp,
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);

ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
ov::Tensor& scales,
ov::Tensor& biases,
ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
ov::Tensor & scales,
ov::Tensor & zp,
size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);

// Extract quantized weights from tensor and create weight subgraph
// If weights/scales/biases are provided (non-empty), uses them as output buffers
// If weights/scales/zp are provided (non-empty), uses them as output buffers
// Otherwise allocates new ov::Tensors internally
// Returns the weight node (make_int4_weights or make_int8_weights result)
std::shared_ptr<ov::Node> extract_quantized_weights(

@@ -61,10 +61,10 @@ std::shared_ptr<ov::Node> extract_quantized_weights(
const void * data, // Source data pointer (may differ from tensor->data)
ov::Tensor & weights,
ov::Tensor & scales,
ov::Tensor & biases);
ov::Tensor & zp);

// Requantize weights from tensor to target format, writing to provided buffers
// For F16 target, only weights buffer is used (scales/biases ignored)
// For F16 target, only weights buffer is used (scales/zp ignored)
// Returns the weight node
std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
const void * data, // Source data pointer

@@ -72,7 +72,7 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
int64_t block_size,
ov::Tensor & weights,
ov::Tensor & scales,
ov::Tensor & biases);
ov::Tensor & zp);

// Process weight tensor and create an OpenVINO constant node
// Handles F16/F32/BF16 and quantized weights, with optional requantization

@@ -84,11 +84,23 @@ std::shared_ptr<ov::Node> process_weight_tensor(
const void * data, // Source data pointer (may differ from tensor->data)
void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation)

void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
void quantize_q4_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk);
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
void quantize_q8_1(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk);
void quantize_q8_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
void quantize_q8_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
ov::Tensor & zp_arr,
int64_t k,
int64_t qk);
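A small usage sketch of the new signatures, assuming the declarations above are in scope: requantize a tiny F32 matrix to Q8_0 with a scalar (symmetric) zero point and build the weight subgraph. Shapes follow process_weight_tensor earlier in the diff; the sizes and values are illustrative:

    #include <cstdint>
    #include <vector>
    #include <openvino/openvino.hpp>

    void example_q8_0_usage(const std::vector<float> & x /* 2 x 64, row-major */) {
        const size_t rows = 2, cols = 64, group = 32;
        ov::Tensor weights(ov::element::u8, {rows, cols});          // extracted U8 weights
        ov::Tensor scales (ov::element::f16, {rows, cols / group}); // one F16 scale per block
        ov::Tensor zp     (ov::element::u8, ov::Shape{});           // scalar zp (symmetric, 128)

        quantize_q8_0(x.data(), weights, scales, zp,
                      static_cast<int64_t>(rows * cols), static_cast<int64_t>(group));
        ov::Output<ov::Node> w = make_int8_weights(weights, scales, zp, group);
        (void) w; // feed into the rest of the graph
    }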

namespace ov {