diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index da381e4fad..857aa3b850 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -551,13 +551,13 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
 }
 
 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
+    const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
+
     // Check if we have a pre-built constant from the OpenVINO backend buffer
     // This is set during ggml_backend_openvino_buffer_set_tensor
     if (tensor->extra) {
-        if (!ggml_backend_buffer_is_openvino(tensor->buffer)) {
-            OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) +
-                                       " Possibly this is a cpu backend repacked quantized weights");
-        }
+        OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) +
+                                          " Possibly this is a cpu backend repacked quantized weights");
 
         // Cast to our extra base type and check the type
         auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
@@ -578,12 +578,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
         }
     }
 
-    // Fallback: tensor doesn't have a pre-built extra. The buffer type can only be
-    // openvino_host_buffer_type, which has enough space (get_alloc_size returns
-    // layout.total_size for quantized 2D tensors) to store extracted data in-place.
-    // Build the weight node and store it in tensor->extra for future reuse.
     GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
-
     static const std::set<ggml_type> weight_types = {GGML_TYPE_F32,  GGML_TYPE_F16,  GGML_TYPE_BF16,
                                                      GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
                                                      GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
@@ -594,14 +589,18 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
 
     OvWeight ov_weight;
     if (ggml_is_quantized(tensor->type)) {
-        // For quantized weights, copy raw data to a temp buffer first because
-        // process_weight_tensor reads from data and writes extracted results
-        // (weights/scales/zp) to output_base_ptr — they would overlap if both
-        // point to tensor->data.
-        size_t raw_size = ggml_nbytes(tensor);
-        std::vector<uint8_t> tmp(raw_size);
-        memcpy(tmp.data(), tensor->data, raw_size);
-        ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
+        if (is_ov_buffer) {
+            // For quantized weights, copy raw data to a temp buffer first because
+            // process_weight_tensor reads from data and writes extracted results
+            // (weights/scales/zp) to output_base_ptr — they would overlap if both
+            // point to tensor->data.
+            size_t raw_size = ggml_nbytes(tensor);
+            std::vector<uint8_t> tmp(raw_size);
+            memcpy(tmp.data(), tensor->data, raw_size);
+            ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
+        } else {
+            ov_weight = process_weight_tensor(tensor, tensor->data, nullptr);
+        }
     } else {
         // For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
         // process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
@@ -609,6 +608,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
     }
 
     ov_weight.weight_node->set_friendly_name(tensor->name);
+    if (!is_ov_buffer) {
+        return ov_weight.weight_node;
+    }
 
     ggml_openvino_extra_base * extra;
     if (ov_weight.is_quantized()) {
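For reference, the aliasing hazard that the temp-copy branch guards against can be reproduced in isolation. The sketch below is a toy model, not the real quantized layout: `extract` is a hypothetical stand-in for the read-from-`data`, write-to-`output_base_ptr` pattern of `process_weight_tensor`, expanding each input byte into two output bytes the way dequantization writes more data than it reads.

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy "extraction": each input byte yields two output bytes, mimicking a
// dequantizer that writes more data than it reads per block. Hypothetical
// stand-in for process_weight_tensor, not the real routine.
static void extract(const uint8_t * src, uint8_t * dst, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        dst[2 * i]     = src[i];         // copied value
        dst[2 * i + 1] = src[i] ^ 0xFF;  // derived value
    }
}

int main() {
    const std::vector<uint8_t> raw = {10, 20, 30, 40, 0, 0, 0, 0};

    // Aliased: src == dst, so the outputs of block i clobber the inputs of
    // block i+1 before they are read.
    std::vector<uint8_t> aliased = raw;
    extract(aliased.data(), aliased.data(), 4);

    // Temp copy first (what the patch keeps doing for OpenVINO buffers).
    std::vector<uint8_t> safe = raw;
    std::vector<uint8_t> tmp(safe.begin(), safe.begin() + 4);
    extract(tmp.data(), safe.data(), 4);

    std::printf("aliased: %d %d   (expected 30 40)\n", aliased[4], aliased[6]);
    std::printf("safe:    %d %d\n", safe[4], safe[6]);
}

The new non-OV-buffer branch sidesteps the hazard without the copy: passing `nullptr` as `output_base_ptr` presumably tells `process_weight_tensor` to allocate its own output storage, so input and output never alias, and the resulting node is returned without being cached in `tensor->extra`.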