Refactor weight tensor processing
commit 900dd76c24
parent 0ee7e05485
@@ -550,11 +550,6 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
     return model_weights;
 }
 
-// Static cache for quantized weight nodes (keyed by tensor data pointer)
-// This is a fallback for when tensors don't have pre-built constants in extra
-static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
-static std::mutex s_quantized_weight_cache_mutex;
-
 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
     // Check if we have a pre-built constant from the OpenVINO backend buffer
     // This is set during ggml_backend_openvino_buffer_set_tensor
@@ -569,51 +564,62 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
         if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
             // F16/F32/BF16 weight with shared-memory constant
             auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
-            if (weight_extra->constant) {
-                GGML_LOG_DEBUG("%s: using pre-built constant for %s\n", __func__, tensor->name);
-                return weight_extra->constant;
+            if (weight_extra->weight_node) {
+                GGML_LOG_DEBUG("%s: using pre-built weight node for %s\n", __func__, tensor->name);
+                return weight_extra->weight_node;
             }
         } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
             // Quantized weight with pre-extracted data
             auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
-            if (quant_extra->constant) {
-                GGML_LOG_DEBUG("%s: using pre-extracted quantized constant for %s\n", __func__, tensor->name);
-                return quant_extra->constant;
+            if (quant_extra->weight_node) {
+                GGML_LOG_DEBUG("%s: using pre-extracted quantized weight node for %s\n", __func__, tensor->name);
+                return quant_extra->weight_node;
             }
         }
     }
 
-    // Fallback: Check static cache for quantized weights (keyed by data pointer)
-    // This handles cases where tensors weren't loaded through OpenVINO buffer
-    if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        auto it = s_quantized_weight_cache.find(tensor->data);
-        if (it != s_quantized_weight_cache.end()) {
-            GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
-            return it->second;
-        }
-    }
+    // Fallback: tensor doesn't have a pre-built extra. The buffer type can only be
+    // openvino_host_buffer_type, which has enough space (get_alloc_size returns
+    // layout.total_size for quantized 2D tensors) to store extracted data in-place.
+    // Build the weight node and store it in tensor->extra for future reuse.
+    GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
 
-    GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra);
-
-    std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0,
-                                        GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
+    static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
+                                                     GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
+                                                     GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
     if (weight_types.find(tensor->type) == weight_types.end()) {
         throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " +
                                  ggml_type_name(tensor->type));
     }
 
-    std::shared_ptr<ov::Node> result = process_weight_tensor(tensor, tensor->data, nullptr);
-    result->set_friendly_name(tensor->name);
-
-    // Cache the quantized weight node for future reuse
+    OvWeight ov_weight;
     if (ggml_is_quantized(tensor->type)) {
-        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
-        s_quantized_weight_cache[tensor->data] = result;
-        GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
+        // For quantized weights, copy raw data to a temp buffer first because
+        // process_weight_tensor reads from data and writes extracted results
+        // (weights/scales/zp) to output_base_ptr — they would overlap if both
+        // point to tensor->data.
+        size_t raw_size = ggml_nbytes(tensor);
+        std::vector<uint8_t> tmp(raw_size);
+        memcpy(tmp.data(), tensor->data, raw_size);
+        ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
+    } else {
+        // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
+        // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
+        ov_weight = process_weight_tensor(tensor, tensor->data, tensor->data);
     }
 
-    return result;
+    ov_weight.weight_node->set_friendly_name(tensor->name);
+
+    ggml_openvino_extra_base * extra;
+    if (ov_weight.is_quantized()) {
+        extra = new ggml_openvino_quantized_weight_extra(std::move(ov_weight.weights), std::move(ov_weight.scales),
+                                                         std::move(ov_weight.zp), ov_weight.weight_node);
+    } else {
+        extra = new ggml_openvino_weight_extra(std::move(ov_weight.weights), ov_weight.weight_node);
+    }
+    ggml_openvino_buffer_register_extra(tensor, extra);
+
+    return ov_weight.weight_node;
 }
 
 void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
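The temporary-buffer copy in create_weight_node above exists because, for quantized tensors, a single allocation is both the source of the raw GGUF blocks and the destination for the extracted weights/scales/zp. A minimal, self-contained sketch of that copy-then-extract pattern (the extract callback is a stand-in, not the backend's real routine):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Reading source blocks while writing unpacked output into the same buffer can
    // clobber blocks that have not been read yet, so a snapshot is taken first.
    void extract_in_place_safely(uint8_t * buf, size_t raw_size,
                                 void (*extract)(const uint8_t * src, uint8_t * dst)) {
        std::vector<uint8_t> tmp(raw_size);      // temporary copy of the raw quantized data
        std::memcpy(tmp.data(), buf, raw_size);  // snapshot before the buffer is overwritten
        extract(tmp.data(), buf);                // src and dst no longer alias
    }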
@@ -319,6 +319,7 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten
     layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
     layout.zp_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
     layout.total_size = layout.zp_offset + layout.zp_size;
+    layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
 
     return layout;
 }
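The offset computation above is plain round-up alignment, and total_size is then clamped to at least ggml_nbytes(tensor) so the raw tensor bytes always fit in the same allocation before extraction overwrites them in place. A self-contained sketch of the same arithmetic (align_up and compute_layout are illustrative names, not from the patch):

    #include <algorithm>
    #include <cstddef>

    // Round x up to the next multiple of a (a > 0).
    static size_t align_up(size_t x, size_t a) { return ((x + a - 1) / a) * a; }

    struct Layout { size_t weights_size, scales_size, zp_size, scales_offset, zp_offset, total_size; };

    // Buffer layout: weights | pad | scales | pad | zp, never smaller than the raw tensor.
    Layout compute_layout(size_t weights_size, size_t scales_size, size_t zp_size,
                          size_t raw_tensor_size, size_t alignment) {
        Layout l{weights_size, scales_size, zp_size, 0, 0, 0};
        l.scales_offset = align_up(l.weights_size, alignment);
        l.zp_offset     = l.scales_offset + align_up(l.scales_size, alignment);
        l.total_size    = std::max(l.zp_offset + l.zp_size, raw_tensor_size);
        return l;
    }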
@@ -102,27 +102,30 @@ protected:
     explicit ggml_openvino_extra_base(Type t) : type(t) {}
 };
 
-// Extra data for F16/F32/BF16 weight tensors - stores the pre-built ov::Constant node
+// Extra data for F16/F32/BF16 weight tensors - stores the pre-built weight node
 struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
-    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO Constant node
+    ov::Tensor weights;                     // The underlying weight data tensor
+    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight node
 
-    explicit ggml_openvino_weight_extra(std::shared_ptr<ov::Node> c)
-        : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}
+    ggml_openvino_weight_extra(ov::Tensor w, std::shared_ptr<ov::Node> n) :
+        ggml_openvino_extra_base(Type::WEIGHT),
+        weights(std::move(w)),
+        weight_node(std::move(n)) {}
 };
 
-// Extra data for quantized weight tensors - stores extracted weights/scales/zp and ov::Constant
+// Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node
 struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
     ov::Tensor weights;  // U4 or U8 extracted weights
     ov::Tensor scales;   // F16 scales
     ov::Tensor zp;       // U4 or U8 zero points (same type as weights)
-    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO weight subgraph
+    std::shared_ptr<ov::Node> weight_node;  // Pre-built OpenVINO weight subgraph
 
-    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> c) :
+    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr<ov::Node> n) :
         ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
         weights(std::move(w)),
         scales(std::move(s)),
         zp(std::move(z)),
-        constant(std::move(c)) {}
+        weight_node(std::move(n)) {}
 };
 
 // Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
@@ -140,19 +143,19 @@ struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
 // Returns the total size needed in the buffer for extracted data.
 
 struct ggml_openvino_extracted_layout {
-    size_t total_size;      // Total bytes needed
-    size_t weights_offset;  // Offset to weights in buffer
-    size_t weights_size;    // Size of weights in bytes
-    size_t scales_offset;   // Offset to scales in buffer
-    size_t scales_size;     // Size of scales in bytes
-    size_t zp_offset;       // Offset to zero points in buffer
-    size_t zp_size;         // Size of zero points in bytes (U4 or U8)
-    bool is_u4;             // true for U4 weights, false for U8
+    size_t total_size = 0;      // Total bytes needed
+    size_t weights_offset = 0;  // Offset to weights in buffer
+    size_t weights_size = 0;    // Size of weights in bytes
+    size_t scales_offset = 0;   // Offset to scales in buffer
+    size_t scales_size = 0;     // Size of scales in bytes
+    size_t zp_offset = 0;       // Offset to zero points in buffer
+    size_t zp_size = 0;         // Size of zero points in bytes (U4 or U8)
+    bool is_u4;                 // true for U4 weights, false for U8
     int64_t weights_per_block;  // weights per scale/zp block
     bool is_symmetric;          // true for symmetric quantization
 
     // Requantization info
-    bool is_requant;  // true if this tensor needs requantization
+    bool is_requant = false;                     // true if this tensor needs requantization
     std::optional<ExtraQuantType> requant_type;  // target requant type if is_requant
 };
 
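With the new in-class initializers, a default-constructed layout is an all-zero "nothing to extract" value, which is what callers in this commit rely on: process_weight_tensor throws when total_size == 0, and OvWeight::is_quantized() tests scales_size > 0. A small standalone illustration of why the initializers matter:

    #include <cstddef>
    #include <cstdio>

    struct LayoutNoInit   { size_t total_size; };      // old style: indeterminate until every path assigns it
    struct LayoutWithInit { size_t total_size = 0; };  // new style: zero unless a code path fills it in

    int main() {
        LayoutWithInit layout;
        if (layout.total_size == 0) {
            std::puts("treated as 'nothing to extract'");  // safe sentinel check
        }
    }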
@@ -160,3 +163,7 @@ struct ggml_openvino_extracted_layout {
 ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);
 
 ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote);
+
+// Register an extra with the tensor's OpenVINO buffer context for proper lifetime management.
+// This sets tensor->extra and tracks the extra in the buffer context for cleanup.
+void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra);
@@ -230,80 +230,45 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
     // 2D tensor (typical weight shape)
     bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1);
 
-    // Check if this is a quantized weight tensor that needs extraction/requantization
-    ggml_openvino_extracted_layout layout = {};
-    if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized(tensor->type)) {
-        layout = ggml_openvino_get_extracted_layout(tensor);
-    }
-
-    if (layout.total_size > 0) {
-        // Quantized weight tensor with extraction/requantization
-        uint8_t * buf_base = (uint8_t *) tensor->data;
-
+    if (is_weight_buffer && is_full_tensor_set && is_2d) {
         try {
-            std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, buf_base);
-            constant->set_friendly_name(tensor->name);
+            auto result = process_weight_tensor(tensor, data, tensor->data);
+            result.weight_node->set_friendly_name(tensor->name);
 
             // Store in tensor->extra
-            if (layout.is_requant && layout.requant_type.has_value() &&
-                layout.requant_type.value() == ExtraQuantType::F16) {
-                // F16 requant case - use weight_extra
-                auto * extra = new ggml_openvino_weight_extra(constant);
-                ctx->tensor_extras[tensor] = extra;
-                tensor->extra = extra;
-                GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
-            } else {
-                // Quantized case - use quantized_weight_extra
-                // Create tensors with external memory (already filled by process_weight_tensor)
-                ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-                ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
-                ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
-                                         static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
-                // zp shape: scalar for symmetric, per-block for asymmetric
-                ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
+            const auto & layout = result.layout;
+            ggml_openvino_extra_base * extra;
 
-                ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
-                ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-                ov::Tensor zp(weight_type, zp_shape, buf_base + layout.zp_offset);
-
-                auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
-                                                                        std::move(zp), constant);
-                ctx->tensor_extras[tensor] = extra;
-                tensor->extra = extra;
+            // Quantized path with extracted weight/scale/zp tensors
+            if (result.is_quantized()) {
+                extra = new ggml_openvino_quantized_weight_extra(std::move(result.weights), std::move(result.scales),
+                                                                 std::move(result.zp), result.weight_node);
 
                 if (layout.is_requant) {
                     GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
-                                   layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
-                                   layout.is_u4 ? 4 : 8, layout.weights_per_block);
+                                   extra_quant_type_name(layout.requant_type.value()), layout.is_u4 ? 4 : 8,
+                                   layout.weights_per_block);
                 } else {
                     int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
-                    GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
-                                   tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
+                    GGML_LOG_DEBUG("%s: extracted quantized weight node for %s (u%d, %zu weights, %ld blocks)\n",
+                                   __func__, tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
                 }
+            } else {
+                // F16/F32/BF16 weight or F16-requant
+                extra = new ggml_openvino_weight_extra(std::move(result.weights), result.weight_node);
+
+                if (layout.total_size > 0) {
+                    GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
+                } else {
+                    GGML_LOG_DEBUG("%s: created shared-memory weight node for %s\n", __func__, tensor->name);
+                }
             }
-
-        } catch (const std::exception & e) {
-            GGML_LOG_ERROR("%s: failed to process quantized data for %s: %s\n", __func__, tensor->name, e.what());
-            // Fall back to storing raw data
-            memcpy((char *) tensor->data + offset, data, size);
-        }
-    } else if (is_weight_buffer && is_full_tensor_set && is_2d &&
-               (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) {
-        // F16/F32/BF16 weight tensor
-        try {
-            std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, tensor->data);
-            constant->set_friendly_name(tensor->name);
-
-            // Store in tensor->extra
-            ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant);
-            ctx->tensor_extras[tensor] = extra;
-            tensor->extra = extra;
-
-            GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name);
-
         } catch (const std::exception & e) {
-            GGML_LOG_DEBUG("%s: failed to create shared-memory constant for %s: %s\n", __func__, tensor->name,
-                           e.what());
+            GGML_LOG_ERROR("%s: failed to process weight tensor for %s: %s\n", __func__, tensor->name, e.what());
             memcpy((char *) tensor->data + offset, data, size);
         }
     } else {
         // Non-weight tensor (KV cache, activations, etc.) - copy data
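The collapsed set_tensor path keeps the old failure contract: if building the weight node throws, the raw bytes are still copied so the tensor stays usable by the generic path. A compact, self-contained sketch of that pattern (the helper and its signature are illustrative):

    #include <cstring>
    #include <stdexcept>
    #include <vector>

    void set_bytes_with_fallback(std::vector<char> & dst, const char * src, size_t offset, size_t size,
                                 void (*try_build_weight_node)(const char * src, size_t size)) {
        try {
            try_build_weight_node(src, size);             // may throw on unsupported layouts
        } catch (const std::exception &) {
            std::memcpy(dst.data() + offset, src, size);  // fall back to storing raw data
        }
    }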
@@ -604,6 +569,22 @@ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) {
     return ctx->id;
 }
 
+void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra) {
+    GGML_ASSERT(tensor != nullptr);
+    GGML_ASSERT(tensor->buffer != nullptr);
+    GGML_ASSERT(ggml_backend_buffer_is_openvino(tensor->buffer));
+
+    auto * ctx = static_cast<ggml_backend_openvino_buffer_context *>(tensor->buffer->context);
+
+    auto it = ctx->tensor_extras.find(tensor);
+    if (it != ctx->tensor_extras.end()) {
+        delete it->second;
+    }
+
+    ctx->tensor_extras[tensor] = extra;
+    tensor->extra = extra;
+}
+
 bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
 }
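A sketch of the intended call pattern for the new helper, assuming the extra types from this commit; the raw new is deliberate, because ggml_openvino_buffer_register_extra takes ownership, deletes any extra previously registered for the same tensor, and frees the tracked extras when the buffer context is cleaned up.

    // Sketch only; attach_weight_extra is a hypothetical wrapper, not part of the patch.
    static void attach_weight_extra(ggml_tensor * tensor, OvWeight & w) {
        ggml_openvino_extra_base * extra =
            new ggml_openvino_weight_extra(std::move(w.weights), w.weight_node);
        ggml_openvino_buffer_register_extra(tensor, extra);  // also sets tensor->extra
    }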
@@ -576,10 +576,12 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
     return result;
 }
 
-std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
+OvWeight process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
     GGML_ASSERT(tensor != nullptr);
     GGML_ASSERT(data != nullptr);
 
+    OvWeight result;
+
     // Get 2D shape for weights [rows, cols]
     ov::Shape node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
 
@@ -600,18 +602,16 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
             OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path");
         }
 
-        if (output_base_ptr) {
+        if (output_base_ptr && output_base_ptr != data) {
             // Using external buffer - copy data and create shared-memory constant
             size_t tensor_bytes = ggml_nbytes(tensor);
             memcpy(output_base_ptr, data, tensor_bytes);
-            ov::Tensor ov_tensor(element_type, node_shape, output_base_ptr);
-            return std::make_shared<ov::op::v0::Constant>(ov_tensor);
+            result.weights = ov::Tensor(element_type, node_shape, output_base_ptr);
         } else {
-            // Allocate internal buffer
-            ov::Tensor weights(element_type, node_shape);
-            memcpy(weights.data(), data, ggml_nelements(tensor) * element_type.size());
-            return std::make_shared<ov::op::v0::Constant>(weights);
+            result.weights = ov::Tensor(element_type, node_shape, data);
         }
+        result.weight_node = std::make_shared<ov::op::v0::Constant>(result.weights);
+        return result;
     }
 
     // Handle quantized weights
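The added output_base_ptr != data guard makes the F16/F32/BF16 path zero-copy when the caller passes tensor->data for both arguments: the ov::Tensor wraps the existing memory and the Constant built from it shares that memory instead of owning a copy. A standalone sketch of the wrapping pattern (shapes and values are illustrative):

    #include <openvino/op/constant.hpp>
    #include <openvino/openvino.hpp>
    #include <vector>

    int main() {
        std::vector<float> host(4 * 8, 0.5f);                      // caller-owned weight data
        ov::Tensor view(ov::element::f32, {4, 8}, host.data());    // wraps, does not copy
        auto node = std::make_shared<ov::op::v0::Constant>(view);  // shares the same memory
        // host must outlive node; presumably why the extras keep the ov::Tensor next to the node
    }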
@@ -619,70 +619,48 @@ std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, cons
         OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
     }
 
-    auto layout = ggml_openvino_get_extracted_layout(tensor);
+    result.layout = ggml_openvino_get_extracted_layout(tensor);
+    const auto & layout = result.layout;
     if (layout.total_size == 0) {
         OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
     }
 
-    std::shared_ptr<ov::Node> result;
+    // F16 requant path - no separate scales/zp needed in result
+    if (layout.is_requant && layout.requant_type.has_value() && layout.requant_type.value() == ExtraQuantType::F16) {
+        if (output_base_ptr) {
+            result.weights = ov::Tensor(ov::element::f16, node_shape,
+                                        static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
+        } else {
+            result.weights = ov::Tensor(ov::element::f16, node_shape);
+        }
+        ov::Tensor dummy_scales, dummy_zp;  // Not used for F16
+        result.weight_node =
+            requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, result.weights, dummy_scales, dummy_zp);
+        return result;
+    }
+
+    // Quantized path (normal extraction or quantized requant)
+    // Create weight/scale/zp tensors - shared between both paths
+    ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
+    ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
+    ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
+
+    if (output_base_ptr) {
+        uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
+        result.weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
+        result.scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
+        result.zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
+    } else {
+        result.weights = ov::Tensor(weight_type, node_shape);
+        result.scales = ov::Tensor(ov::element::f16, scale_shape);
+        result.zp = ov::Tensor(weight_type, zp_shape);
+    }
+
     if (layout.is_requant && layout.requant_type.has_value()) {
         // Requantization path
-        if (layout.requant_type.value() == ExtraQuantType::F16) {
-            // Requant to F16
-            ov::Tensor weights;
-            if (output_base_ptr) {
-                weights = ov::Tensor(ov::element::f16, node_shape,
-                                     static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
-            } else {
-                weights = ov::Tensor(ov::element::f16, node_shape);
-            }
-            ov::Tensor dummy_scales, dummy_zp;  // Not used for F16
-            result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_zp);
-        } else {
-            // Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
-            ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-            ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
-            // For symmetric quantization, zp is a scalar value instead of per-block
-            // zp uses the same element type as weights (U4 or U8)
-            ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
-
-            ov::Tensor weights, scales, zp;
-            if (output_base_ptr) {
-                uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
-                weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
-                scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-                zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
-            } else {
-                weights = ov::Tensor(weight_type, node_shape);
-                scales = ov::Tensor(ov::element::f16, scale_shape);
-                zp = ov::Tensor(weight_type, zp_shape);
-            }
-
-            result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
-                                           scales, zp);
-        }
+        result.weight_node = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block,
+                                                   result.weights, result.scales, result.zp);
     } else {
         // Normal extraction path (no requant)
-        ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
-        ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
-        // For symmetric quantization, zp is a scalar value instead of per-block
-        // zp uses the same element type as weights (U4 or U8)
-        ov::Shape zp_shape = layout.is_symmetric ? ov::Shape{} : scale_shape;
-
-        ov::Tensor weights, scales, zp;
-        if (output_base_ptr) {
-            uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
-            weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
-            scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
-            zp = ov::Tensor(weight_type, zp_shape, buf_base + layout.zp_offset);
-        } else {
-            weights = ov::Tensor(weight_type, node_shape);
-            scales = ov::Tensor(ov::element::f16, scale_shape);
-            zp = ov::Tensor(weight_type, zp_shape);
-        }
-
-        result = extract_quantized_weights(tensor, data, weights, scales, zp);
+        result.weight_node = extract_quantized_weights(tensor, data, result.weights, result.scales, result.zp);
     }
 
     return result;
@@ -74,12 +74,43 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
                                                 ov::Tensor & scales,
                                                 ov::Tensor & zp);
 
-// Process weight tensor and create an OpenVINO constant node
+inline const char * extra_quant_type_name(ExtraQuantType t) {
+    switch (t) {
+        case ExtraQuantType::F16:
+            return "F16";
+        case ExtraQuantType::Q4_0_C:
+            return "Q4_0_C";
+        case ExtraQuantType::Q4_0_128:
+            return "Q4_0_128";
+        case ExtraQuantType::Q8_0_C:
+            return "Q8_0_C";
+        case ExtraQuantType::Q8_0_32:
+            return "Q8_0_32";
+        case ExtraQuantType::Q8_1_C:
+            return "Q8_1_C";
+        default:
+            return "unknown";
+    }
+}
+
+// Result from process_weight_tensor containing the weight node and tensors.
+// For quantized weights, also contains the extracted layout and scale/zp tensors.
+struct OvWeight {
+    std::shared_ptr<ov::Node> weight_node;
+    ggml_openvino_extracted_layout layout;  // Only meaningful for quantized (layout.total_size > 0)
+    ov::Tensor weights;
+    ov::Tensor scales;
+    ov::Tensor zp;
+
+    bool is_quantized() const { return layout.scales_size > 0; }
+};
+
+// Process weight tensor and create an OpenVINO weight node
 // Handles F16/F32/BF16 and quantized weights, with optional requantization
 // If output_base_ptr is nullptr, allocates internal buffers (for decoder use)
 // If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use)
-// Returns the weight constant node
-std::shared_ptr<ov::Node> process_weight_tensor(
+// Returns OvWeight with the weight node and optional quantized tensors
+OvWeight process_weight_tensor(
     const ggml_tensor * tensor,
     const void * data,  // Source data pointer (may differ from tensor->data)
     void * output_base_ptr = nullptr);  // Base pointer for output buffers (or nullptr for internal allocation)
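Putting the pieces together, a decoder-side caller consumes the new OvWeight result roughly as follows (a sketch assuming the declarations above; build_node is a hypothetical wrapper, not a function from the patch):

    static std::shared_ptr<ov::Node> build_node(ggml_tensor * tensor) {
        OvWeight w = process_weight_tensor(tensor, tensor->data);  // output_base_ptr defaults to nullptr
        w.weight_node->set_friendly_name(tensor->name);
        if (w.is_quantized()) {
            // w.weights / w.scales / w.zp hold the extracted U4/U8 and F16 data described by w.layout
        }
        return w.weight_node;
    }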