Refactor weight tensor processing
commit 900dd76c24
parent 0ee7e05485
@@ -550,11 +550,6 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
     return model_weights;
 }
 
-// Static cache for quantized weight nodes (keyed by tensor data pointer)
-// This is a fallback for when tensors don't have pre-built constants in extra
-static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
-static std::mutex s_quantized_weight_cache_mutex;
-
 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
     // Check if we have a pre-built constant from the OpenVINO backend buffer
     // This is set during ggml_backend_openvino_buffer_set_tensor
@@ -569,51 +564,62 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
         if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
             // F16/F32/BF16 weight with shared-memory constant
             auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
-            if (weight_extra->constant) {
-                GGML_LOG_DEBUG("%s: using pre-built constant for %s\n", __func__, tensor->name);
-                return weight_extra->constant;
+            if (weight_extra->weight_node) {
+                GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
+                return weight_extra->weight_node;
             }
         } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
             // Quantized weight with pre-extracted data
             auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
-            if (quant_extra->constant) {
-                GGML_LOG_DEBUG("%s: using pre-extracted quantized constant for %s\n", __func__, tensor->name);
-                return quant_extra->constant;
+            if (quant_extra->weight_node) {
+                GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
+                return quant_extra->weight_node;
             }
         }
     }
 
-    // Fallback: Check static cache for quantized weights (keyed by data pointer)
-    // This handles cases where tensors weren't loaded through OpenVINO buffer
-    if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        auto it = s_quantized_weight_cache.find(tensor->data);
-        if (it != s_quantized_weight_cache.end()) {
-            GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
-            return it->second;
-        }
-    }
+    // Fallback: tensor doesn't have a pre-built extra. The buffer type can only be
+    // openvino_host_buffer_type, which has enough space (get_alloc_size returns
+    // layout.total_size for quantized 2D tensors) to store extracted data in-place.
+    // Build the weight node and store it in tensor->extra for future reuse.
+    GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
 
-    GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra);
-
-    std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0,
-                                        GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
+    static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
+                                                     GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+                                                     GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
     if (weight_types.find(tensor->type) == weight_types.end()) {
         throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
                                  ggml_type_name(tensor->type));
     }
 
-    std::shared_ptr<ov::Node> result = process_weight_tensor(tensor, tensor->data, nullptr);
-    result->set_friendly_name(tensor->name);
-
-    // Cache the quantized weight node for future reuse
+    OvWeight ov_weight;
     if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        s_quantized_weight_cache[tensor->data] = result;
-        GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
+        // For quantized weights, copy raw data to a temp buffer first because
+        // process_weight_tensor reads from data and writes extracted results
+        // (weights/scales/zp) to output_base_ptr — they would overlap if both
+        // point to tensor->data.
+        size_t raw_size = ggml_nbytes(tensor);
+        std::vector<uint8_t> tmp(raw_size);
+        memcpy(tmp.data(), tensor->data, raw_size);
+        ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
+    } else {
+        // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
+        // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
+        ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
     }
 
-    return result;
+    ov_weight.weight_node->set_friendly_name(tensor->name);
+
+    ggml_openvino_extra_base * extra;
+    if (ov_weight.is_quantized()) {
+        extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
+                                                         std::move(ov_weight.zp), ov_weight.weight_node);
+    } else {
+        extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
+    }
+    ggml_openvino_buffer_register_extra(tensor, extra);
+
+    return ov_weight.weight_node;
 }
 
 void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
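The temporary-buffer copy in create_weight_node above exists because, for quantized tensors, a single allocation is both the source of the raw GGUF blocks and the destination for the extracted weights/scales/zp. A minimal, self-contained sketch of that copy-then-extract pattern (the extract callback is a stand-in, not the backend's real routine):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Reading source blocks while writing unpacked output into the same buffer can
    // clobber blocks that have not been read yet, so a snapshot is taken first.
    void extract_in_place_safely(uint8_t * buf, size_t raw_size,
                                 void (*extract)(const uint8_t * src, uint8_t * dst)) {
        std::vector<uint8_t> tmp(raw_size);      // temporary copy of the raw quantized data
        std::memcpy(tmp.data(), buf, raw_size);  // snapshot before the buffer is overwritten
        extract(tmp.data(), buf);                // src and dst no longer alias
    }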
@@ -319,6 +319,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
     layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
     layout.total_size = layout.zp_offset + layout.zp_size;
+    layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
 
     return layout;
 }
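The offset computation above is plain round-up alignment, and total_size is then clamped to at least ggml_nbytes(tensor) so the raw tensor bytes always fit in the same allocation before extraction overwrites them in place. A self-contained sketch of the same arithmetic (align_up and compute_layout are illustrative names, not from the patch):

    #include <algorithm>
    #include <cstddef>

    // Round x up to the next multiple of a (a > 0).
    static size_t align_up(size_t x, size_t a) { return ((x + a - 1) / a) * a; }

    struct Layout { size_t weights_size, scales_size, zp_size, scales_offset, zp_offset, total_size; };

    // Buffer layout: weights | pad | scales | pad | zp, never smaller than the raw tensor.
    Layout compute_layout(size_t weights_size, size_t scales_size, size_t zp_size,
                          size_t raw_tensor_size, size_t alignment) {
        Layout l{weights_size, scales_size, zp_size, 0, 0, 0};
        l.scales_offset = align_up(l.weights_size, alignment);
        l.zp_offset     = l.scales_offset + align_up(l.scales_size, alignment);
        l.total_size    = std::max(l.zp_offset + l.zp_size, raw_tensor_size);
        return l;
    }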
@@ -102,27 +102,30 @@ protected:
     explicit ggml_openvino_extra_base(Type t) : type(t) {}
 };
 
-// Extra data for F16/F32/BF16 weight tensors - stores the pre-built ov::Constant node
+// Extra data for F16/F32/BF16 weight tensors - stores the pre-built weight node
 struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
-    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO Constant node
+    ov::Tensor weights;                     // The underlying weight data tensor
+    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight node
 
-    explicit ggml_openvino_weight_extra(std::shared_ptr<ov::Node> c)
-        : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}
+    ggml_openvino_weight_extra(ov::Tensor w, std::shared_ptr<ov::Node> n) :
+        ggml_openvino_extra_base(Type::WEIGHT),
+        weights(std::move(w)),
+        weight_node(std::move(n)) {}
 };
 
-// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant
+// Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node
 struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
     ov::Tensor weights;  // U4 or U8 extracted weights
     ov::Tensor scales;   // F16 scales
     ov::Tensor zp;       // U4 or U8 zero points (same type as weights)
-    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO weight subgraph
+    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight subgraph
 
-    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> c) :
+    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> n) :
         ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
         weights(std::move(w)),
         scales(std::move(s)),
         zp(std::move(z)),
-        constant(std::move(c)) {}
+        weight_node(std::move(n)) {}
 };
 
 // Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
@@ -140,19 +143,19 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
 // Returns the total size needed in the buffer for extracted data.
 
 struct ggml_openvino_extracted_layout {
-    size_t total_size;      // Total bytes needed
-    size_t weights_offset;  // Offset to weights in buffer
-    size_t weights_size;    // Size of weights in bytes
-    size_t scales_offset;   // Offset to scales in buffer
-    size_t scales_size;     // Size of scales in bytes
-    size_t zp_offset;       // Offset to zero points in buffer
-    size_t zp_size;         // Size of zero points in bytes (U4 or U8)
-    bool is_u4;             // true for U4 weights, false for U8
+    size_t total_size = 0;      // Total bytes needed
+    size_t weights_offset = 0;  // Offset to weights in buffer
+    size_t weights_size = 0;    // Size of weights in bytes
+    size_t scales_offset = 0;   // Offset to scales in buffer
+    size_t scales_size = 0;     // Size of scales in bytes
+    size_t zp_offset = 0;       // Offset to zero points in buffer
+    size_t zp_size = 0;         // Size of zero points in bytes (U4 or U8)
+    bool is_u4;                 // true for U4 weights, false for U8
     int64_t weights_per_block;  // weights per scale/zp block
     bool is_symmetric;          // true for symmetric quantization
 
     // Requantization info
-    bool is_requant;  // true if this tensor needs requantization
+    bool is_requant = false;                     // true if this tensor needs requantization
     std::optional<ExtraQuantType> requant_type;  // target requant type if is_requant
 };
 
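With the new in-class initializers, a default-constructed layout is an all-zero "nothing to extract" value, which is what callers in this commit rely on: process_weight_tensor throws when total_size == 0, and OvWeight::is_quantized() tests scales_size > 0. A small standalone illustration of why the initializers matter:

    #include <cstddef>
    #include <cstdio>

    struct LayoutNoInit   { size_t total_size; };      // old style: indeterminate until every path assigns it
    struct LayoutWithInit { size_t total_size = 0; };  // new style: zero unless a code path fills it in

    int main() {
        LayoutWithInit layout;
        if (layout.total_size == 0) {
            std::puts("treated as 'nothing to extract'");  // safe sentinel check
        }
    }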
@@ -160,3 +163,7 @@ struct ggml_openvino_extracted_layout {
 ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);
 
 ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
+
+// Register an extra with the tensor's OpenVINO buffer context for proper lifetime management.
+// This sets tensor->extra and tracks the extra in the buffer context for cleanup.
+void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra);
@@ -230,80 +230,45 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
     // 2D tensor (typical weight shape)
     bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1);
 
-    // Check if this is a quantized weight tensor that needs extraction/requantization
-    ggml_openvino_extracted_layout layout = {};
-    if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized(tensor->type)) {
-        layout = ggml_openvino_get_extracted_layout(tensor);
-    }
-
-    if (layout.total_size > 0) {
-        // Quantized weight tensor with extraction/requantization
-        uint8_t * buf_base = (uint8_t *) tensor->data;
-
+    if (is_weight_buffer && is_full_tensor_set && is_2d) {
         try {
-            std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, buf_base);
-            constant->set_friendly_name(tensor->name);
+            auto result = process_weight_tensor(tensor, data, tensor->data);
+            result.weight_node->set_friendly_name(tensor->name);
 
             // Store in tensor->extra
-            if (layout.is_requant && layout.requant_type.has_value() &&
-                layout.requant_type.value() == ExtraQuantType::F16) {
-                // F16 requant case - use weight_extra
-                auto * extra = new ggml_openvino_weight_extra(constant);
-                ctx->tensor_extras[tensor] = extra;
-                tensor->extra = extra;
-                GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
-            } else {
-                // Quantized case - use quantized_weight_extra
-                // Create tensors with external memory (already filled by process_weight_tensor)
-                ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-                ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
-                ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
-                                         static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
-                // zp shape: scalar for symmetric, per-block for asymmetric
-                ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
+            const auto & layout = result.layout;
+            ggml_openvino_extra_base * extra;
 
-                ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
-                ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-                ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset);
-
-                auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
-                                                                        std::move(zp), constant);
-                ctx->tensor_extras[tensor] = extra;
-                tensor->extra = extra;
+            // Quantized path with extracted weight/scale/zp tensors
+            if (result.is_quantized()) {
+                extra = new ggml_openvino_quantized_weight_extra(std::move(result.weights), std::move(result.scales),
+                                                                 std::move(result.zp), result.weight_node);
 
                 if (layout.is_requant) {
                     GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
-                                   layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
-                                   layout.is_u4 ? 4 : 8, layout.weights_per_block);
+                                   extra_quant_type_name(layout.requant_type.value()), layout.is_u4 ? 4 : 8,
+                                   layout.weights_per_block);
                 } else {
                     int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
-                    GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
-                                   tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
+                    GGML_LOG_DEBUG("%s: extracted quantized weight node for %s (u%d, %zu weights, %ld blocks)\n",
+                                   __func__, tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
                 }
+            } else {
+                // F16/F32/BF16 weight or F16-requant
+                extra = new ggml_openvino_weight_extra(std::move(result.weights), result.weight_node);
+
+                if (layout.total_size > 0) {
+                    GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
+                } else {
+                    GGML_LOG_DEBUG("%s: created shared-memory weight node for %s\n", __func__, tensor->name);
+                }
             }
-
-        } catch (const std::exception & e) {
-            GGML_LOG_ERROR("%s: failed to process quantized data for %s: %s\n", __func__, tensor->name, e.what());
-            // Fall back to storing raw data
-            memcpy((char *) tensor->data + offset, data, size);
-        }
-    } else if (is_weight_buffer && is_full_tensor_set && is_2d &&
-               (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) {
-        // F16/F32/BF16 weight tensor
-        try {
-            std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, tensor->data);
-            constant->set_friendly_name(tensor->name);
-
-            // Store in tensor->extra
-            ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant);
-            ctx->tensor_extras[tensor] = extra;
-            tensor->extra = extra;
-
-            GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name);
-
         } catch (const std::exception & e) {
-            GGML_LOG_DEBUG("%s: failed to create shared-memory constant for %s: %s\n", __func__, tensor->name,
-                           e.what());
+            GGML_LOG_ERROR("%s: failed to process weight tensor for %s: %s\n", __func__, tensor->name, e.what());
             memcpy((char *) tensor->data + offset, data, size);
         }
     } else {
         // Non-weight tensor (KV cache, activations, etc.) - copy data
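The collapsed set_tensor path keeps the old failure contract: if building the weight node throws, the raw bytes are still copied so the tensor stays usable by the generic path. A compact, self-contained sketch of that pattern (the helper and its signature are illustrative):

    #include <cstring>
    #include <stdexcept>
    #include <vector>

    void set_bytes_with_fallback(std::vector<char> & dst, const char * src, size_t offset, size_t size,
                                 void (*try_build_weight_node)(const char * src, size_t size)) {
        try {
            try_build_weight_node(src, size);             // may throw on unsupported layouts
        } catch (const std::exception &) {
            std::memcpy(dst.data() + offset, src, size);  // fall back to storing raw data
        }
    }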
@@ -604,6 +569,22 @@ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
     return ctx->id;
 }
 
+void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra) {
+    GGML_ASSERT(tensor != nullptr);
+    GGML_ASSERT(tensor->buffer != nullptr);
+    GGML_ASSERT(ggml_backend_buffer_is_openvino(tensor->buffer));
+
+    auto * ctx = static_cast<ggml_backend_openvino_buffer_context *>(tensor->buffer->context);
+
+    auto it = ctx->tensor_extras.find(tensor);
+    if (it != ctx->tensor_extras.end()) {
+        delete it->second;
+    }
+
+    ctx->tensor_extras[tensor] = extra;
+    tensor->extra = extra;
+}
+
 bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
 }
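A sketch of the intended call pattern for the new helper, assuming the extra types from this commit; the raw new is deliberate, because ggml_openvino_buffer_register_extra takes ownership, deletes any extra previously registered for the same tensor, and frees the tracked extras when the buffer context is cleaned up.

    // Sketch only; attach_weight_extra is a hypothetical wrapper, not part of the patch.
    static void attach_weight_extra(ggml_tensor * tensor, OvWeight & w) {
        ggml_openvino_extra_base * extra =
            new ggml_openvino_weight_extra(std::move(w.weights), w.weight_node);
        ggml_openvino_buffer_register_extra(tensor, extra);  // also sets tensor->extra
    }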
@@ -576,10 +576,12 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
     return result;
 }
 
-std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
+OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
     GGML_ASSERT(tensor != nullptr);
     GGML_ASSERT(data != nullptr);
 
+    OvWeight result;
+
     // Get 2D shape for weights [rows, cols]
     ov::Shape node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
 
@@ -600,18 +602,16 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
             OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path");
         }
 
-        if (output_base_ptr) {
+        if (output_base_ptr && output_base_ptr != data) {
             // Using external buffer - copy data and create shared-memory constant
             size_t tensor_bytes = ggml_nbytes(tensor);
             memcpy(output_base_ptr, data, tensor_bytes);
-            ov::Tensor ov_tensor(element_type, node_shape, output_base_ptr);
-            return std::make_shared<ov::op::v0::Constant>(ov_tensor);
+            result.weights = ov::Tensor(element_type, node_shape, output_base_ptr);
         } else {
-            // Allocate internal buffer
-            ov::Tensor weights(element_type, node_shape);
-            memcpy(weights.data(), data, ggml_nelements(tensor) * element_type.size());
-            return std::make_shared<ov::op::v0::Constant>(weights);
+            result.weights = ov::Tensor(element_type, node_shape, data);
         }
+        result.weight_node = std::make_shared<ov::op::v0::Constant>(result.weights);
+        return result;
     }
 
     // Handle quantized weights
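The added output_base_ptr != data guard makes the F16/F32/BF16 path zero-copy when the caller passes tensor->data for both arguments: the ov::Tensor wraps the existing memory and the Constant built from it shares that memory instead of owning a copy. A standalone sketch of the wrapping pattern (shapes and values are illustrative):

    #include <openvino/op/constant.hpp>
    #include <openvino/openvino.hpp>
    #include <vector>

    int main() {
        std::vector<float> host(4 * 8, 0.5f);                      // caller-owned weight data
        ov::Tensor view(ov::element::f32, {4, 8}, host.data());    // wraps, does not copy
        auto node = std::make_shared<ov::op::v0::Constant>(view);  // shares the same memory
        // host must outlive node; presumably why the extras keep the ov::Tensor next to the node
    }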
@@ -619,70 +619,48 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
         OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
     }
 
-    auto layout = ggml_openvino_get_extracted_layout(tensor);
+    result.layout = ggml_openvino_get_extracted_layout(tensor);
+    const auto & layout = result.layout;
     if (layout.total_size == 0) {
         OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
     }
 
-    std::shared_ptr<ov::Node> result;
+    // F16 requant path - no separate scales/zp needed in result
+    if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
+        if (output_base_ptr) {
+            result.weights = ov::Tensor(ov::element::f16, node_shape,
+                                        static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
+        } else {
+            result.weights = ov::Tensor(ov::element::f16, node_shape);
+        }
+        ov::Tensor dummy_scales, dummy_zp;  // Not used for F16
+        result.weight_node =
+            requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, result.weights, dummy_scales, dummy_zp);
+        return result;
+    }
+
+    // Quantized path (normal extraction or quantized requant)
+    // Create weight/scale/zp tensors - shared between both paths
+    ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+    ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
+    ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
+
+    if (output_base_ptr) {
+        uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
+        result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
+        result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
+        result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
+    } else {
+        result.weights = ov::Tensor(weight_type, node_shape);
+        result.scales = ov::Tensor(ov::element::f16, scale_shape);
+        result.zp = ov::Tensor(weight_type, zp_shape);
+    }
+
     if (layout.is_requant && layout.requant_type.has_value()) {
         // Requantization path
-        if (layout.requant_type.value() == ExtraQuantType::F16) {
-            // Requant to F16
-            ov::Tensor weights;
-            if (output_base_ptr) {
-                weights = ov::Tensor(ov::element::f16, node_shape,
-                                     static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
-            } else {
-                weights = ov::Tensor(ov::element::f16, node_shape);
-            }
-            ov::Tensor dummy_scales, dummy_zp;  // Not used for F16
-            result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_zp);
-        } else {
-            // Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
-            ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-            ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
-            // For symmetric quantization, zp is a scalar value instead of per-block
-            // zp uses the same element type as weights (U4 or U8)
-            ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
-
-            ov::Tensor weights, scales, zp;
-            if (output_base_ptr) {
-                uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
-                weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
-                scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-                zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
-            } else {
-                weights = ov::Tensor(weight_type, node_shape);
-                scales = ov::Tensor(ov::element::f16, scale_shape);
-                zp = ov::Tensor(weight_type, zp_shape);
-            }
-
-            result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
-                                           scales, zp);
-        }
+        result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
+                                                   result.weights, result.scales, result.zp);
     } else {
         // Normal extraction path (no requant)
-        ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-        ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
-        // For symmetric quantization, zp is a scalar value instead of per-block
-        // zp uses the same element type as weights (U4 or U8)
-        ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
-
-        ov::Tensor weights, scales, zp;
-        if (output_base_ptr) {
-            uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
-            weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
-            scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-            zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
-        } else {
-            weights = ov::Tensor(weight_type, node_shape);
-            scales = ov::Tensor(ov::element::f16, scale_shape);
-            zp = ov::Tensor(weight_type, zp_shape);
-        }
-
-        result = extract_quantized_weights(tensor, data, weights, scales, zp);
+        result.weight_node = extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp);
     }
 
     return result;
@@ -74,12 +74,43 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
                                                 ov::Tensor & scales,
                                                 ov::Tensor & zp);
 
-// Process weight tensor and create an OpenVINO constant node
+inline const char * extra_quant_type_name(ExtraQuantType t) {
+    switch (t) {
+        case ExtraQuantType::F16:
+            return "F16";
+        case ExtraQuantType::Q4_0_C:
+            return "Q4_0_C";
+        case ExtraQuantType::Q4_0_128:
+            return "Q4_0_128";
+        case ExtraQuantType::Q8_0_C:
+            return "Q8_0_C";
+        case ExtraQuantType::Q8_0_32:
+            return "Q8_0_32";
+        case ExtraQuantType::Q8_1_C:
+            return "Q8_1_C";
+        default:
+            return "unknown";
+    }
+}
+
+// Result from process_weight_tensor containing the weight node and tensors.
+// For quantized weights, also contains the extracted layout and scale/zp tensors.
+struct OvWeight {
+    std::shared_ptr<ov::Node> weight_node;
+    ggml_openvino_extracted_layout layout;  // Only meaningful for quantized (layout.total_size > 0)
+    ov::Tensor weights;
+    ov::Tensor scales;
+    ov::Tensor zp;
+
+    bool is_quantized() const { return layout.scales_size > 0; }
+};
+
+// Process weight tensor and create an OpenVINO weight node
 // Handles F16/F32/BF16 and quantized weights, with optional requantization
 // If output_base_ptr is nullptr, allocates internal buffers (for decoder use)
 // If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use)
-// Returns the weight constant node
-std::shared_ptr<ov::Node> process_weight_tensor(
+// Returns OvWeight with the weight node and optional quantized tensors
+OvWeight process_weight_tensor(
     const ggml_tensor * tensor,
     const void * data,  // Source data pointer (may differ from tensor->data)
     void * output_base_ptr = nullptr);  // Base pointer for output buffers (or nullptr for internal allocation)
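Putting the pieces together, a decoder-side caller consumes the new OvWeight result roughly as follows (a sketch assuming the declarations above; build_node is a hypothetical wrapper, not a function from the patch):

    static std::shared_ptr<ov::Node> build_node(ggml_tensor * tensor) {
        OvWeight w = process_weight_tensor(tensor, tensor->data);  // output_base_ptr defaults to nullptr
        w.weight_node->set_friendly_name(tensor->name);
        if (w.is_quantized()) {
            // w.weights / w.scales / w.zp hold the extracted U4/U8 and F16 data described by w.layout
        }
        return w.weight_node;
    }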