diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index d8d71cf25e..da381e4fad 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -550,11 +550,6 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
     return model_weights;
 }
 
-// Static cache for quantized weight nodes (keyed by tensor data pointer)
-// This is a fallback for when tensors don't have pre-built constants in extra
-static std::unordered_map<void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
-static std::mutex s_quantized_weight_cache_mutex;
-
 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
     // Check if we have a pre-built constant from the OpenVINO backend buffer
     // This is set during ggml_backend_openvino_buffer_set_tensor
@@ -569,51 +564,62 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
         if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
             // F16/F32/BF16 weight with shared-memory constant
             auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
-            if (weight_extra->constant) {
-                GGML_LOG_DEBUG("%s: using pre-built constant for %s\n", __func__, tensor->name);
-                return weight_extra->constant;
+            if (weight_extra->weight_node) {
+                GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
+                return weight_extra->weight_node;
             }
         } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
             // Quantized weight with pre-extracted data
            auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
-            if (quant_extra->constant) {
-                GGML_LOG_DEBUG("%s: using pre-extracted quantized constant for %s\n", __func__, tensor->name);
-                return quant_extra->constant;
+            if (quant_extra->weight_node) {
+                GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
+                return quant_extra->weight_node;
             }
         }
     }
 
-    // Fallback: Check static cache for quantized weights (keyed by data pointer)
-    // This handles cases where tensors weren't loaded through OpenVINO buffer
-    if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        auto it = s_quantized_weight_cache.find(tensor->data);
-        if (it != s_quantized_weight_cache.end()) {
-            GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
-            return it->second;
-        }
-    }
+    // Fallback: tensor doesn't have a pre-built extra. The buffer type can only be
+    // openvino_host_buffer_type, which has enough space (get_alloc_size returns
+    // layout.total_size for quantized 2D tensors) to store extracted data in-place.
+    // Build the weight node and store it in tensor->extra for future reuse.
+ GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name); - GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra); - - std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, - GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; + static const std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, + GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, + GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + ggml_type_name(tensor->type)); } - std::shared_ptr result = process_weight_tensor(tensor, tensor->data, nullptr); - result->set_friendly_name(tensor->name); - - // Cache the quantized weight node for future reuse + OvWeight ov_weight; if (ggml_is_quantized(tensor->type)) { - std::lock_guard lock(s_quantized_weight_cache_mutex); - s_quantized_weight_cache[tensor->data] = result; - GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name); + // For quantized weights, copy raw data to a temp buffer first because + // process_weight_tensor reads from data and writes extracted results + // (weights/scales/zp) to output_base_ptr — they would overlap if both + // point to tensor->data. + size_t raw_size = ggml_nbytes(tensor); + std::vector tmp(raw_size); + memcpy(tmp.data(), tensor->data, raw_size); + ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data); + } else { + // For non-quantized weights (F16/F32/BF16), data is already in tensor->data. + // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly. 
+        ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
     }
-    return result;
+    ov_weight.weight_node->set_friendly_name(tensor->name);
+
+    ggml_openvino_extra_base * extra;
+    if (ov_weight.is_quantized()) {
+        extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
+                                                         std::move(ov_weight.zp), ov_weight.weight_node);
+    } else {
+        extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
+    }
+    ggml_openvino_buffer_register_extra(tensor, extra);
+
+    return ov_weight.weight_node;
 }
 
 void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
index 4584dc38d0..39bf7610eb 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp
@@ -319,6 +319,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
     layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
     layout.total_size = layout.zp_offset + layout.zp_size;
+    layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
 
     return layout;
 }
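For context on the `ggml-openvino-extra.cpp` change: the layout packs the extracted planes back-to-back with alignment padding, and the new `std::max` guard makes the allocation large enough to also hold the raw ggml bytes, which the decoder fallback needs for its in-place extraction. A minimal sketch of that arithmetic, assuming a hypothetical 64-byte alignment (the real constant is defined in `ggml-openvino-extra.cpp` and not shown in this hunk):

```cpp
// Sketch of ggml_openvino_get_extracted_layout's packing, with a hypothetical
// 64-byte alignment; only the offset arithmetic is shown, not per-type sizing.
#include <algorithm>
#include <cstddef>

static size_t align_up(size_t n, size_t a) { return ((n + a - 1) / a) * a; }

struct layout_sketch {
    size_t weights_offset, scales_offset, zp_offset, total_size;
};

layout_sketch pack(size_t weights_size, size_t scales_size, size_t zp_size, size_t raw_nbytes) {
    const size_t alignment = 64;  // hypothetical; the real value lives in ggml-openvino-extra.cpp
    layout_sketch l{};
    l.weights_offset = 0;
    l.scales_offset  = align_up(weights_size, alignment);
    l.zp_offset      = l.scales_offset + align_up(scales_size, alignment);
    l.total_size     = l.zp_offset + zp_size;
    // New in this patch: the buffer must also fit the raw ggml bytes, because the
    // decoder fallback first stores the original quantized data in tensor->data
    // and only then extracts weights/scales/zp in place over the same buffer.
    l.total_size = std::max(l.total_size, raw_nbytes);
    return l;
}
```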
diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h
index 726a90abb0..9ce4667154 100644
--- a/ggml/src/ggml-openvino/ggml-openvino-extra.h
+++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h
@@ -102,27 +102,30 @@ protected:
     explicit ggml_openvino_extra_base(Type t) : type(t) {}
 };
 
-// Extra data for F16/F32/BF16 weight tensors - stores the pre-built ov::Constant node
+// Extra data for F16/F32/BF16 weight tensors - stores the pre-built weight node
 struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
-    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO Constant node
+    ov::Tensor weights;                     // The underlying weight data tensor
+    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight node
 
-    explicit ggml_openvino_weight_extra(std::shared_ptr<ov::Node> c)
-        : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}
+    ggml_openvino_weight_extra(ov::Tensor w, std::shared_ptr<ov::Node> n) :
+        ggml_openvino_extra_base(Type::WEIGHT),
+        weights(std::move(w)),
+        weight_node(std::move(n)) {}
 };
 
-// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant
+// Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node
 struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
     ov::Tensor weights;  // U4 or U8 extracted weights
     ov::Tensor scales;   // F16 scales
     ov::Tensor zp;       // U4 or U8 zero points (same type as weights)
-    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO weight subgraph
+    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight subgraph
 
-    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> c) :
+    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> n) :
         ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
         weights(std::move(w)),
         scales(std::move(s)),
         zp(std::move(z)),
-        constant(std::move(c)) {}
+        weight_node(std::move(n)) {}
 };
 
 // Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
@@ -140,19 +143,19 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
 
 // Returns the total size needed in the buffer for extracted data.
 struct ggml_openvino_extracted_layout {
-    size_t total_size;      // Total bytes needed
-    size_t weights_offset;  // Offset to weights in buffer
-    size_t weights_size;    // Size of weights in bytes
-    size_t scales_offset;   // Offset to scales in buffer
-    size_t scales_size;     // Size of scales in bytes
-    size_t zp_offset;       // Offset to zero points in buffer
-    size_t zp_size;         // Size of zero points in bytes (U4 or U8)
-    bool is_u4;             // true for U4 weights, false for U8
+    size_t total_size = 0;      // Total bytes needed
+    size_t weights_offset = 0;  // Offset to weights in buffer
+    size_t weights_size = 0;    // Size of weights in bytes
+    size_t scales_offset = 0;   // Offset to scales in buffer
+    size_t scales_size = 0;     // Size of scales in bytes
+    size_t zp_offset = 0;       // Offset to zero points in buffer
+    size_t zp_size = 0;         // Size of zero points in bytes (U4 or U8)
+    bool is_u4;                 // true for U4 weights, false for U8
     int64_t weights_per_block;  // weights per scale/zp block
     bool is_symmetric;          // true for symmetric quantization
 
     // Requantization info
-    bool is_requant;                             // true if this tensor needs requantization
+    bool is_requant = false;                     // true if this tensor needs requantization
     std::optional<ExtraQuantType> requant_type;  // target requant type if is_requant
 };
 
@@ -160,3 +163,7 @@ struct ggml_openvino_extracted_layout {
 ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);
 
 ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
+
+// Register an extra with the tensor's OpenVINO buffer context for proper lifetime management.
+// This sets tensor->extra and tracks the extra in the buffer context for cleanup.
+void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra);
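The two extra types above share the `ggml_openvino_extra_base` tag. A hypothetical helper (not part of this patch) showing how a consumer would recover the weight node from `tensor->extra`, mirroring the dispatch in `GgmlOvDecoder::create_weight_node`; it assumes `ggml.h` and `ggml-openvino-extra.h` are included:

```cpp
// Hypothetical accessor (not in the patch) for the tagged extras above.
#include <memory>

std::shared_ptr<ov::Node> try_get_weight_node(const ggml_tensor * tensor) {
    if (tensor->extra == nullptr) {
        return nullptr;
    }
    auto * base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
    switch (base->type) {
        case ggml_openvino_extra_base::Type::WEIGHT:
            return static_cast<ggml_openvino_weight_extra *>(base)->weight_node;
        case ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT:
            return static_cast<ggml_openvino_quantized_weight_extra *>(base)->weight_node;
        default:
            return nullptr;  // some other extra kind, e.g. KV-cache / compute tensor extras
    }
}
```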
diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp
index e531a9c036..efd399fe3f 100644
--- a/ggml/src/ggml-openvino/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -230,80 +230,45 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
     // 2D tensor (typical weight shape)
     bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1);
 
-    // Check if this is a quantized weight tensor that needs extraction/requantization
-    ggml_openvino_extracted_layout layout = {};
-    if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized(tensor->type)) {
-        layout = ggml_openvino_get_extracted_layout(tensor);
-    }
-
-    if (layout.total_size > 0) {
-        // Quantized weight tensor with extraction/requantization
-        uint8_t * buf_base = (uint8_t *) tensor->data;
-
+    if (is_weight_buffer && is_full_tensor_set && is_2d) {
         try {
-            std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, buf_base);
-            constant->set_friendly_name(tensor->name);
+            auto result = process_weight_tensor(tensor, data, tensor->data);
+            result.weight_node->set_friendly_name(tensor->name);
 
-            // Store in tensor->extra
-            if (layout.is_requant && layout.requant_type.has_value() &&
-                layout.requant_type.value() == ExtraQuantType::F16) {
-                // F16 requant case - use weight_extra
-                auto * extra = new ggml_openvino_weight_extra(constant);
-                ctx->tensor_extras[tensor] = extra;
-                tensor->extra = extra;
-                GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
-            } else {
-                // Quantized case - use quantized_weight_extra
-                // Create tensors with external memory (already filled by process_weight_tensor)
-                ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-                ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
-                ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
-                                         static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
-                // zp shape: scalar for symmetric, per-block for asymmetric
-                ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
+            const auto & layout = result.layout;
+            ggml_openvino_extra_base * extra;
 
-                ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
-                ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-                ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset);
-
-                auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
-                                                                        std::move(zp), constant);
-                ctx->tensor_extras[tensor] = extra;
-                tensor->extra = extra;
+            // Quantized path with extracted weight/scale/zp tensors
+            if (result.is_quantized()) {
+                extra = new ggml_openvino_quantized_weight_extra(std::move(result.weights), std::move(result.scales),
+                                                                 std::move(result.zp), result.weight_node);
 
                 if (layout.is_requant) {
                     GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
-                                   layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
-                                   layout.is_u4 ? 4 : 8, layout.weights_per_block);
+                                   extra_quant_type_name(layout.requant_type.value()), layout.is_u4 ? 4 : 8,
+                                   layout.weights_per_block);
                 } else {
                     int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
-                    GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
-                                   tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
+                    GGML_LOG_DEBUG("%s: extracted quantized weight node for %s (u%d, %zu weights, %ld blocks)\n",
+                                   __func__, tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
+                }
+            } else {
+                // F16/F32/BF16 weight or F16-requant
+                extra = new ggml_openvino_weight_extra(std::move(result.weights), result.weight_node);
+
+                if (layout.total_size > 0) {
+                    GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
+                } else {
+                    GGML_LOG_DEBUG("%s: created shared-memory weight node for %s\n", __func__, tensor->name);
                 }
             }
-        } catch (const std::exception & e) {
-            GGML_LOG_ERROR("%s: failed to process quantized data for %s: %s\n", __func__, tensor->name, e.what());
-            // Fall back to storing raw data
-            memcpy((char *) tensor->data + offset, data, size);
-        }
-    } else if (is_weight_buffer && is_full_tensor_set && is_2d &&
-               (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) {
-        // F16/F32/BF16 weight tensor
-        try {
-            std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, tensor->data);
-            constant->set_friendly_name(tensor->name);
-
-            // Store in tensor->extra
-            ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant);
+
             ctx->tensor_extras[tensor] = extra;
             tensor->extra = extra;
-            GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name);
         } catch (const std::exception & e) {
-            GGML_LOG_DEBUG("%s: failed to create shared-memory constant for %s: %s\n", __func__, tensor->name,
-                           e.what());
+            GGML_LOG_ERROR("%s: failed to process weight tensor for %s: %s\n", __func__, tensor->name, e.what());
+            memcpy((char *) tensor->data + offset, data, size);
        }
     } else {
         // Non-weight tensor (KV cache, activations, etc.) - copy data
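The unified branch above replaces two near-identical `try` blocks. The discriminator is `OvWeight::is_quantized()`; a small sketch (not in the patch) of how it partitions the three possible outcomes of `process_weight_tensor`, using the layout fields declared in `ggml-openvino-extra.h`:

```cpp
// Sketch of the three cases the unified set_tensor path distinguishes.
enum class weight_kind { passthrough, requant_f16, quantized };

weight_kind classify(const OvWeight & w) {
    if (w.layout.scales_size > 0) {
        return weight_kind::quantized;    // extracted/requantized weights + scales (+ zp)
    }
    if (w.layout.total_size > 0) {
        return weight_kind::requant_f16;  // requantized to F16: weights plane only
    }
    return weight_kind::passthrough;      // F16/F32/BF16 node sharing tensor->data
}
```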
@@ -604,6 +569,22 @@ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
     return ctx->id;
 }
 
+void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra) {
+    GGML_ASSERT(tensor != nullptr);
+    GGML_ASSERT(tensor->buffer != nullptr);
+    GGML_ASSERT(ggml_backend_buffer_is_openvino(tensor->buffer));
+
+    auto * ctx = static_cast<ggml_backend_openvino_buffer_context *>(tensor->buffer->context);
+
+    auto it = ctx->tensor_extras.find(tensor);
+    if (it != ctx->tensor_extras.end()) {
+        delete it->second;
+    }
+
+    ctx->tensor_extras[tensor] = extra;
+    tensor->extra = extra;
+}
+
 bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
 }
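A hypothetical caller-side sketch of the new registration helper, mirroring the decoder fallback: the buffer context takes ownership of the extra, and re-registering replaces any previous one.

```cpp
// Hypothetical caller (not part of this patch): build an extra for a plain
// F16 weight and hand its lifetime to the buffer context.
#include <memory>
#include <utility>

void adopt_f16_weight(ggml_tensor * tensor, ov::Tensor weights, std::shared_ptr<ov::Node> node) {
    auto * extra = new ggml_openvino_weight_extra(std::move(weights), std::move(node));
    // Replaces and deletes any extra previously registered for this tensor;
    // the buffer context frees the extra when the buffer itself is destroyed.
    ggml_openvino_buffer_register_extra(tensor, extra);
}
```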
diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp
index 2de0494c91..10909cbc1e 100644
--- a/ggml/src/ggml-openvino/ggml-quants.cpp
+++ b/ggml/src/ggml-openvino/ggml-quants.cpp
@@ -576,10 +576,12 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
     return result;
 }
 
-std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
+OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
     GGML_ASSERT(tensor != nullptr);
     GGML_ASSERT(data != nullptr);
 
+    OvWeight result;
+
     // Get 2D shape for weights [rows, cols]
     ov::Shape node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
 
@@ -600,18 +602,16 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
         OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path");
     }
 
-    if (output_base_ptr) {
+    if (output_base_ptr && output_base_ptr != data) {
         // Using external buffer - copy data and create shared-memory constant
         size_t tensor_bytes = ggml_nbytes(tensor);
         memcpy(output_base_ptr, data, tensor_bytes);
-        ov::Tensor ov_tensor(element_type, node_shape, output_base_ptr);
-        return std::make_shared<ov::op::v0::Constant>(ov_tensor);
+        result.weights = ov::Tensor(element_type, node_shape, output_base_ptr);
     } else {
-        // Allocate internal buffer
-        ov::Tensor weights(element_type, node_shape);
-        memcpy(weights.data(), data, ggml_nelements(tensor) * element_type.size());
-        return std::make_shared<ov::op::v0::Constant>(weights);
+        result.weights = ov::Tensor(element_type, node_shape, data);
     }
+    result.weight_node = std::make_shared<ov::op::v0::Constant>(result.weights);
+    return result;
 }
 
 // Handle quantized weights
@@ -619,70 +619,48 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
         OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
     }
 
-    auto layout = ggml_openvino_get_extracted_layout(tensor);
+    result.layout = ggml_openvino_get_extracted_layout(tensor);
+    const auto & layout = result.layout;
     if (layout.total_size == 0) {
         OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
     }
 
-    std::shared_ptr<ov::Node> result;
+    // F16 requant path - no separate scales/zp needed in result
+    if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
+        if (output_base_ptr) {
+            result.weights = ov::Tensor(ov::element::f16, node_shape,
+                                        static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
+        } else {
+            result.weights = ov::Tensor(ov::element::f16, node_shape);
+        }
+        ov::Tensor dummy_scales, dummy_zp;  // Not used for F16
+        result.weight_node =
+            requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, result.weights, dummy_scales, dummy_zp);
+        return result;
+    }
+
+    // Quantized path (normal extraction or quantized requant)
+    // Create weight/scale/zp tensors - shared between both paths
+    ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+    ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
+    ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
+
+    if (output_base_ptr) {
+        uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
+        result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
+        result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
+        result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
+    } else {
+        result.weights = ov::Tensor(weight_type, node_shape);
+        result.scales = ov::Tensor(ov::element::f16, scale_shape);
+        result.zp = ov::Tensor(weight_type, zp_shape);
+    }
 
     if (layout.is_requant && layout.requant_type.has_value()) {
-        // Requantization path
-        if (layout.requant_type.value() == ExtraQuantType::F16) {
-            // Requant to F16
-            ov::Tensor weights;
-            if (output_base_ptr) {
-                weights = ov::Tensor(ov::element::f16, node_shape,
-                                     static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
-            } else {
-                weights = ov::Tensor(ov::element::f16, node_shape);
-            }
-            ov::Tensor dummy_scales, dummy_zp;  // Not used for F16
-            result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_zp);
-        } else {
-            // Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
-            ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-            ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
-            // For symmetric quantization, zp is a scalar value instead of per-block
-            // zp uses the same element type as weights (U4 or U8)
-            ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
-
-            ov::Tensor weights, scales, zp;
-            if (output_base_ptr) {
-                uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
-                weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
-                scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-                zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
-            } else {
-                weights = ov::Tensor(weight_type, node_shape);
-                scales = ov::Tensor(ov::element::f16, scale_shape);
-                zp = ov::Tensor(weight_type, zp_shape);
-            }
-
-            result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
-                                           scales, zp);
-        }
+        result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
+                                                   result.weights, result.scales, result.zp);
     } else {
-        // Normal extraction path (no requant)
-        ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-        ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
-        // For symmetric quantization, zp is a scalar value instead of per-block
-        // zp uses the same element type as weights (U4 or U8)
-        ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
-
-        ov::Tensor weights, scales, zp;
-        if (output_base_ptr) {
-            uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
-            weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
-            scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-            zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
-        } else {
-            weights = ov::Tensor(weight_type, node_shape);
-            scales = ov::Tensor(ov::element::f16, scale_shape);
-            zp = ov::Tensor(weight_type, zp_shape);
-        }
-
-        result = extract_quantized_weights(tensor, data, weights, scales, zp);
+        result.weight_node = extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp);
     }
 
     return result;
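The shape arithmetic that extraction and requantization now share can be illustrated with hypothetical dimensions: a 4096x4096 U8 weight with 32 weights per scale block and symmetric quantization (scalar zero point).

```cpp
// Worked example with hypothetical dimensions; mirrors the shared shape setup above.
#include <openvino/openvino.hpp>

void shape_example() {
    ov::Shape node_shape = {4096, 4096};  // [rows, cols] = [ne1, ne0]
    const size_t weights_per_block = 32;

    ov::Shape scale_shape = {node_shape[0], node_shape[1] / weights_per_block};  // {4096, 128}
    const bool is_symmetric = true;
    ov::Shape zp_shape = is_symmetric ? ov::Shape{} : scale_shape;  // scalar zp when symmetric

    // Internal-allocation mode (output_base_ptr == nullptr): ov::Tensor owns the memory.
    ov::Tensor weights(ov::element::u8, node_shape);
    ov::Tensor scales(ov::element::f16, scale_shape);
    ov::Tensor zp(ov::element::u8, zp_shape);
    (void) weights; (void) scales; (void) zp;
}
```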
diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp
index 6739689264..600b9c9f29 100644
--- a/ggml/src/ggml-openvino/ggml-quants.hpp
+++ b/ggml/src/ggml-openvino/ggml-quants.hpp
@@ -74,12 +74,43 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
                                                 ov::Tensor & scales,
                                                 ov::Tensor & zp);
 
-// Process weight tensor and create an OpenVINO constant node
+inline const char * extra_quant_type_name(ExtraQuantType t) {
+    switch (t) {
+        case ExtraQuantType::F16:
+            return "F16";
+        case ExtraQuantType::Q4_0_C:
+            return "Q4_0_C";
+        case ExtraQuantType::Q4_0_128:
+            return "Q4_0_128";
+        case ExtraQuantType::Q8_0_C:
+            return "Q8_0_C";
+        case ExtraQuantType::Q8_0_32:
+            return "Q8_0_32";
+        case ExtraQuantType::Q8_1_C:
+            return "Q8_1_C";
+        default:
+            return "unknown";
+    }
+}
+
+// Result from process_weight_tensor containing the weight node and tensors.
+// For quantized weights, also contains the extracted layout and scale/zp tensors.
+struct OvWeight {
+    std::shared_ptr<ov::Node> weight_node;
+    ggml_openvino_extracted_layout layout;  // Only meaningful for quantized (layout.total_size > 0)
+    ov::Tensor weights;
+    ov::Tensor scales;
+    ov::Tensor zp;
+
+    bool is_quantized() const { return layout.scales_size > 0; }
+};
+
+// Process weight tensor and create an OpenVINO weight node
 // Handles F16/F32/BF16 and quantized weights, with optional requantization
 // If output_base_ptr is nullptr, allocates internal buffers (for decoder use)
 // If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use)
-// Returns the weight constant node
-std::shared_ptr<ov::Node> process_weight_tensor(
+// Returns OvWeight with the weight node and optional quantized tensors
+OvWeight process_weight_tensor(
     const ggml_tensor * tensor,
    const void * data,                  // Source data pointer (may differ from tensor->data)
    void * output_base_ptr = nullptr);  // Base pointer for output buffers (or nullptr for internal allocation)
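An end-to-end sketch of the new return type from a caller's perspective, assuming a loaded `ggml_tensor * w` of a supported type; passing `nullptr` as `output_base_ptr` selects the internal-allocation mode described in the header comment above.

```cpp
// Sketch of a caller using the new API (not part of this patch). Assumes the
// ggml and ggml-openvino headers are included and `w` points at a loaded weight.
std::shared_ptr<ov::Node> build_weight(ggml_tensor * w) {
    OvWeight ow = process_weight_tensor(w, w->data, /*output_base_ptr=*/nullptr);
    ow.weight_node->set_friendly_name(w->name);
    if (ow.is_quantized()) {
        // ow.weights / ow.scales / ow.zp hold the extracted planes and
        // ow.layout records their sizes; they back the returned node's memory.
    }
    return ow.weight_node;
}
```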