create_weight_node: accept non-OV backend buffers

Yu, Zijun 2026-02-11 14:20:14 +08:00
parent 7b3b65b04e
commit 1d4ec1b2ee
1 changed file with 19 additions and 17 deletions
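For orientation, here is a condensed sketch of the control flow create_weight_node ends up with after this change, reconstructed from the hunks below. The reuse path inside the tensor->extra branch, the non-quantized process_weight_tensor call, and the final caching step are cut off by the diff context and are marked as elided:

    std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
        const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);

        if (tensor->extra) {
            // Pre-built constant: only valid when the OpenVINO backend owns the buffer.
            OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: ...");
            // ... reuse the node cached in tensor->extra (elided) ...
        }

        OvWeight ov_weight;
        if (ggml_is_quantized(tensor->type)) {
            if (is_ov_buffer) {
                // OpenVINO-owned buffer: extract in place, reading from a temp copy.
                std::vector<uint8_t> tmp(ggml_nbytes(tensor));
                memcpy(tmp.data(), tensor->data, tmp.size());
                ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
            } else {
                // Foreign (non-OpenVINO) buffer: extract into freshly allocated output.
                ov_weight = process_weight_tensor(tensor, tensor->data, nullptr);
            }
        } else {
            // F32/F16/BF16: process_weight_tensor wraps tensor->data directly (elided).
        }

        ov_weight.weight_node->set_friendly_name(tensor->name);
        if (!is_ov_buffer) {
            return ov_weight.weight_node;   // foreign buffer: skip caching into tensor->extra
        }
        // ... wrap ov_weight in a ggml_openvino_extra_base, store it in tensor->extra,
        //     and return the node (elided) ...
    }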


@@ -551,13 +551,13 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
}
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
const bool is_ov_buffer = ggml_backend_buffer_is_openvino(tensor->buffer);
// Check if we have a pre-built constant from the OpenVINO backend buffer
// This is set during ggml_backend_openvino_buffer_set_tensor
if (tensor->extra) {
if (!ggml_backend_buffer_is_openvino(tensor->buffer)) {
OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a cpu backend repacked quantized weights");
}
OPENVINO_ASSERT(is_ov_buffer, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a cpu backend repacked quantized weights");
// Cast to our extra base type and check the type
auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
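The comments at the top of this hunk describe a build-once, reuse-later scheme: the constant is created when the weight is uploaded through the OpenVINO buffer (ggml_backend_openvino_buffer_set_tensor) and stashed in tensor->extra for later graph builds. A generic, self-contained illustration of that caching pattern follows; the Tensor and Node types and the helper are invented for the example, not the backend's real ggml_openvino_extra_base:

    #include <memory>
    #include <string>

    // Illustration only: a node built the first time a weight is converted, then reused
    // via an opaque per-tensor pointer, mirroring what tensor->extra does in the backend.
    struct Node   { std::string name; };
    struct Tensor { std::string name; void * extra = nullptr; };   // extra ~ ggml_tensor::extra

    std::shared_ptr<Node> get_or_build_node(Tensor & t) {
        if (t.extra) {
            // Second and later calls: reuse the cached node instead of rebuilding it.
            return *static_cast<std::shared_ptr<Node> *>(t.extra);
        }
        auto node = std::make_shared<Node>(Node{t.name});
        // Cache for reuse; ownership and cleanup are elided here (the real backend frees
        // the extra together with its buffer).
        t.extra = new std::shared_ptr<Node>(node);
        return node;
    }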
@@ -578,12 +578,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
}
}
// Fallback: tensor doesn't have a pre-built extra. The buffer type can only be
// openvino_host_buffer_type, which has enough space (get_alloc_size returns
// layout.total_size for quantized 2D tensors) to store extracted data in-place.
// Build the weight node and store it in tensor->extra for future reuse.
GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name);
static const std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
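The comment removed in this hunk explained why in-place extraction used to be safe: the only buffer type that could reach this path was openvino_host_buffer_type, whose get_alloc_size reserves the extracted layout's total size for quantized 2D weights. A minimal sketch of that sizing premise; the ggml queries are real, while extracted_layout_total_size is a stand-in with an arbitrary factor:

    #include "ggml.h"

    // Stand-in for the layout.total_size mentioned in the removed comment; the real value
    // comes from the backend's extraction layout, the factor here is arbitrary.
    static size_t extracted_layout_total_size(const ggml_tensor * tensor) {
        return 2 * ggml_nbytes(tensor);
    }

    // Premise of the removed comment: the OpenVINO host buffer type over-allocates
    // quantized 2D weights so the unpacked weights/scales/zero points fit where the
    // packed blocks used to live, making in-place extraction possible.
    static size_t host_alloc_size_sketch(const ggml_tensor * tensor) {
        if (ggml_is_quantized(tensor->type) && ggml_n_dims(tensor) == 2) {
            return extracted_layout_total_size(tensor);
        }
        return ggml_nbytes(tensor);
    }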
@@ -594,14 +589,18 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
OvWeight ov_weight;
if (ggml_is_quantized(tensor->type)) {
// For quantized weights, copy raw data to a temp buffer first because
// process_weight_tensor reads from data and writes extracted results
// (weights/scales/zp) to output_base_ptr — they would overlap if both
// point to tensor->data.
size_t raw_size = ggml_nbytes(tensor);
std::vector<uint8_t> tmp(raw_size);
memcpy(tmp.data(), tensor->data, raw_size);
ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
if (is_ov_buffer) {
// For quantized weights, copy raw data to a temp buffer first because
// process_weight_tensor reads from data and writes extracted results
// (weights/scales/zp) to output_base_ptr — they would overlap if both
// point to tensor->data.
size_t raw_size = ggml_nbytes(tensor);
std::vector<uint8_t> tmp(raw_size);
memcpy(tmp.data(), tensor->data, raw_size);
ov_weight = process_weight_tensor(tensor, tmp.data(), tensor->data);
} else {
ov_weight = process_weight_tensor(tensor, tensor->data, nullptr);
}
} else {
// For non-quantized weights (F16/F32/BF16), data is already in tensor->data.
// process_weight_tensor will create an ov::Tensor wrapping tensor->data directly.
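The temp copy in the is_ov_buffer branch exists because process_weight_tensor reads packed blocks from its input pointer while writing the extracted results to its output pointer; if both pointed at tensor->data, the writes would clobber blocks that have not been read yet. A self-contained demonstration of that hazard with a toy expand-in-place pass (the 2-bytes-in, 4-bytes-out packing is invented for the example):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // Toy "extraction": each 2-byte packed block expands to 4 output bytes, standing in
    // for dequantizing a block into weights + scales that need more room than the input.
    static void extract(const uint8_t * src, uint8_t * dst, size_t n_blocks) {
        for (size_t b = 0; b < n_blocks; ++b) {
            uint8_t lo = src[2 * b], hi = src[2 * b + 1];
            dst[4 * b + 0] = lo; dst[4 * b + 1] = lo;
            dst[4 * b + 2] = hi; dst[4 * b + 3] = hi;
        }
    }

    int main() {
        // Buffer already large enough for the expanded output (the host-buffer guarantee).
        std::vector<uint8_t> safe = {1, 2, 3, 4, 5, 6, 7, 8};   // 4 packed blocks
        safe.resize(16);

        // Pattern used in the diff: copy the raw bytes out first, then write in place.
        std::vector<uint8_t> tmp(8);
        memcpy(tmp.data(), safe.data(), 8);
        extract(tmp.data(), safe.data(), 4);                    // 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8 8

        // Without the copy, block 1 is overwritten by block 0's output before it is read.
        std::vector<uint8_t> unsafe = {1, 2, 3, 4, 5, 6, 7, 8};
        unsafe.resize(16);
        extract(unsafe.data(), unsafe.data(), 4);               // 1 1 2 2 2 2 2 2 ... corrupted

        for (uint8_t v : safe)   printf("%d ", v); printf("\n");
        for (uint8_t v : unsafe) printf("%d ", v); printf("\n");
        return 0;
    }

With a foreign buffer the in-place guarantee does not hold, which is presumably why the else branch passes nullptr as the output base instead of tensor->data.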
@@ -609,6 +608,9 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
}
ov_weight.weight_node->set_friendly_name(tensor->name);
if (!is_ov_buffer) {
return ov_weight.weight_node;
}
ggml_openvino_extra_base * extra;
if (ov_weight.is_quantized()) {