create_weight_node: accept non-OV backend buffers

Yu, Zijun 2026-02-11 14:20:14 +08:00
parent 7b3b65b04e
commit 1d4ec1b2ee
1 changed file with 19 additions and 17 deletions
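For orientation, here is a condensed sketch of the control flow create_weight_node ends up with after this change, reconstructed from the hunks below. The reuse path inside the tensor->extra branch, the non-quantized process_weight_tensor call, and the final caching step are cut off by the diff context and are marked as elided:

    std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
        const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);

        if (tensor->extra) {
            // Pre-built constant: only valid when the OpenVINO backend owns the buffer.
            OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: ...");
            // ... reuse the node cached in tensor->extra (elided) ...
        }

        OvWeight ov_weight;
        if (ggml_is_quantized(tensor->type)) {
            if (is_ov_buffer) {
                // OpenVINO-owned buffer: extract in place, reading from a temp copy.
                std::vector<uint8_t> tmp(ggml_nbytes(tensor));
                memcpy(tmp.data(), tensor->data, tmp.size());
                ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
            } else {
                // Foreign (non-OpenVINO) buffer: extract into freshly allocated output.
                ov_weight = process_weight_tensor(tensor, tensor->data, nullptr);
            }
        } else {
            // F32/F16/BF16: process_weight_tensor wraps tensor->data directly (elided).
        }

        ov_weight.weight_node->set_friendly_name(tensor->name);
        if (!is_ov_buffer) {
            return ov_weight.weight_node;   // foreign buffer: skip caching into tensor->extra
        }
        // ... wrap ov_weight in a ggml_openvino_extra_base, store it in tensor->extra,
        //     and return the node (elided) ...
    }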


@@ -551,13 +551,13 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
}
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
// Check if we have a pre-built constant from the OpenVINO backend buffer
// This is set during ggml_backend_openvino_buffer_set_tensor
if (tensor->extra) {
if (!ggml_backend_buffer_is_openvino(tensor->buffer)) {
OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a cpu backend repacked quantized weights");
}
OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a cpu backend repacked quantized weights");
// Cast to our extra base type and check the type
auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
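The comments at the top of this hunk describe a build-once, reuse-later scheme: the constant is created when the weight is uploaded through the OpenVINO buffer (ggml_backend_openvino_buffer_set_tensor) and stashed in tensor->extra for later graph builds. A generic, self-contained illustration of that caching pattern follows; the Tensor and Node types and the helper are invented for the example, not the backend's real ggml_openvino_extra_base:

    #include <memory>
    #include <string>

    // Illustration only: a node built the first time a weight is converted, then reused
    // via an opaque per-tensor pointer, mirroring what tensor->extra does in the backend.
    struct Node   { std::string name; };
    struct Tensor { std::string name; void * extra = nullptr; };   // extra ~ ggml_tensor::extra

    std::shared_ptr<Node> get_or_build_node(Tensor & t) {
        if (t.extra) {
            // Second and later calls: reuse the cached node instead of rebuilding it.
            return *static_cast<std::shared_ptr<Node> *>(t.extra);
        }
        auto node = std::make_shared<Node>(Node{t.name});
        // Cache for reuse; ownership and cleanup are elided here (the real backend frees
        // the extra together with its buffer).
        t.extra = new std::shared_ptr<Node>(node);
        return node;
    }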
@@ -578,12 +578,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
}
}
// Fallback: tensor doesn't have a pre-built extra. The buffer type can only be
// openvino_host_buffer_type, which has enough space (get_alloc_size returns
// layout.total_size for quantized 2D tensors) to store extracted data in-place.
// Build the weight node and store it in tensor->extra for future reuse.
GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
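The comment removed in this hunk explained why in-place extraction used to be safe: the only buffer type that could reach this path was openvino_host_buffer_type, whose get_alloc_size reserves the extracted layout's total size for quantized 2D weights. A minimal sketch of that sizing premise; the ggml queries are real, while extracted_layout_total_size is a stand-in with an arbitrary factor:

    #include "ggml.h"

    // Stand-in for the layout.total_size mentioned in the removed comment; the real value
    // comes from the backend's extraction layout, the factor here is arbitrary.
    static size_t extracted_layout_total_size(const ggml_tensor * tensor) {
        return 2 * ggml_nbytes(tensor);
    }

    // Premise of the removed comment: the OpenVINO host buffer type over-allocates
    // quantized 2D weights so the unpacked weights/scales/zero points fit where the
    // packed blocks used to live, making in-place extraction possible.
    static size_t host_alloc_size_sketch(const ggml_tensor * tensor) {
        if (ggml_is_quantized(tensor->type) && ggml_n_dims(tensor) == 2) {
            return extracted_layout_total_size(tensor);
        }
        return ggml_nbytes(tensor);
    }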
@@ -594,14 +589,18 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
OvWeight ov_weight;
if (ggml_is_quantized(tensor->type)) {
// For quantized weights, copy raw data to a temp buffer first because
// process_weight_tensor reads from data and writes extracted results
// (weights/scales/zp) to output_base_ptr — they would overlap if both
// point to tensor->data.
size_t raw_size = ggml_nbytes(tensor);
std::vector<uint8_t> tmp(raw_size);
memcpy(tmp.data(), tensor->data, raw_size);
ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
if (is_ov_buffer) {
// For quantized weights, copy raw data to a temp buffer first because
// process_weight_tensor reads from data and writes extracted results
// (weights/scales/zp) to output_base_ptr — they would overlap if both
// point to tensor->data.
size_t raw_size = ggml_nbytes(tensor);
std::vector<uint8_t> tmp(raw_size);
memcpy(tmp.data(), tensor->data, raw_size);
ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
} else {
ov_weight = process_weight_tensor(tensor, tensor->data, nullptr);
}
} else {
// For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
// process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
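The temp copy in the is_ov_buffer branch exists because process_weight_tensor reads packed blocks from its input pointer while writing the extracted results to its output pointer; if both pointed at tensor->data, the writes would clobber blocks that have not been read yet. A self-contained demonstration of that hazard with a toy expand-in-place pass (the 2-bytes-in, 4-bytes-out packing is invented for the example):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Toy "extraction": each 2-byte packed block expands to 4 output bytes, standing in
    // for dequantizing a block into weights + scales that need more room than the input.
    static void extract(const uint8_t * src, uint8_t * dst, size_t n_blocks) {
        for (size_t b = 0; b < n_blocks; ++b) {
            uint8_t lo = src[2 * b], hi = src[2 * b + 1];
            dst[4 * b + 0] = lo; dst[4 * b + 1] = lo;
            dst[4 * b + 2] = hi; dst[4 * b + 3] = hi;
        }
    }

    int main() {
        // Buffer already large enough for the expanded output (the host-buffer guarantee).
        std::vector<uint8_t> safe = {1, 2, 3, 4, 5, 6, 7, 8};   // 4 packed blocks
        safe.resize(16);

        // Pattern used in the diff: copy the raw bytes out first, then write in place.
        std::vector<uint8_t> tmp(8);
        memcpy(tmp.data(), safe.data(), 8);
        extract(tmp.data(), safe.data(), 4);                    // 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8

        // Without the copy, block 1 is overwritten by block 0's output before it is read.
        std::vector<uint8_t> unsafe = {1, 2, 3, 4, 5, 6, 7, 8};
        unsafe.resize(16);
        extract(unsafe.data(), unsafe.data(), 4);               // 1 1 2 2 2 2 2 2 ... corrupted

        for (uint8_t v : safe)   printf("%d ", v); printf("\n");
        for (uint8_t v : unsafe) printf("%d ", v); printf("\n");
        return 0;
    }

With a foreign buffer the in-place guarantee does not hold, which is presumably why the else branch passes nullptr as the output base instead of tensor->data.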
@@ -609,6 +608,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
}
ov_weight.weight_node->set_friendly_name(tensor->name);
if (!is_ov_buffer) {
return ov_weight.weight_node;
}
ggml_openvino_extra_base * extra;
if (ov_weight.is_quantized()) {