Use shared_buffer for GPU/NPU; Refactor

Authored by Yu, Zijun on 2025-12-18 17:03:03 +08:00; committed by Mustafa Cavus
parent 22d9c17a6f
commit 72bba828df
10 changed files with 389 additions and 326 deletions

View File

@@ -1,4 +1,5 @@
find_package(OpenVINO REQUIRED)
find_package(OpenCL REQUIRED)
include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")
@@ -10,7 +11,7 @@ ggml_add_backend_library(ggml-openvino
${GGML_HEADERS_OPENVINO}
)
target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb)
target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)
if (GGML_OPENVINO)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")

View File

@@ -3,6 +3,7 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino.h"
#include "ggml-quants.hpp"
#include <ggml-impl.h>
@@ -471,9 +472,7 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name
// return kv_param_res_names;
// }
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
ggml_cgraph * cgraph,
std::map<ggml_type, ExtraQuantType> types_to_requantize) {
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex;
auto * nodes = cgraph->nodes;
@@ -498,10 +497,7 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
}
}
if (should_create) {
auto requant_type = types_to_requantize.count(src->type) ?
std::optional<ExtraQuantType>(types_to_requantize.at(src->type)) :
std::nullopt;
auto weight_node = create_weight_node(src, requant_type);
auto weight_node = create_weight_node(src);
weight_node->set_friendly_name(src_name);
{
std::lock_guard<std::mutex> lock(weights_mutex);
@@ -520,11 +516,14 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
static std::mutex s_quantized_weight_cache_mutex;
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor,
std::optional<ExtraQuantType> requant_type) {
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
// Check if we have a pre-built constant from the OpenVINO backend buffer
// This is set during ggml_backend_openvino_buffer_set_tensor
if (tensor->extra != nullptr && !requant_type.has_value()) {
if (tensor->extra) {
if (!ggml_backend_buffer_is_openvino(tensor->buffer)) {
OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a cpu backend repacked quantized weights");
}
// Cast to our extra base type and check the type
auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
@@ -547,7 +546,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
// Fallback: Check static cache for quantized weights (keyed by data pointer)
// This handles cases where tensors weren't loaded through an OpenVINO buffer
if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) {
if (ggml_is_quantized(tensor->type)) {
std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
auto it = s_quantized_weight_cache.find(tensor->data);
if (it != s_quantized_weight_cache.end()) {
@@ -565,64 +564,11 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
ggml_type_name(tensor->type));
}
auto node_type = get_ov_type(tensor);
auto node_shape = get_shape(tensor);
auto ne_total = ggml_nelements(tensor);
OPENVINO_ASSERT(node_shape[0] == 1, "Got 4D weights, expect all weights to be 2D: ", tensor->name);
node_shape.erase(node_shape.begin());
OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name);
node_shape.erase(node_shape.begin());
// F16 and F32 case
if (node_type != ov::element::dynamic) {
ov::Tensor weights(node_type, node_shape);
memcpy(weights.data(), tensor->data, ne_total * node_type.size());
std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
// Disabled because it triggers a bug in NPUW; no performance impact on CPU/GPU
// if (node_type == ov::element::f16) {
// weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32);
// }
weight_node->set_friendly_name(tensor->name);
return weight_node;
}
// Quantized case - extra should be nullptr (not our type)
// Our ggml_openvino_weight_extra is only set for F16/F32 weights
if (tensor->extra != nullptr) {
// Check if it's our type - if so, something is wrong
auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT ||
extra_base->type == ggml_openvino_extra_base::Type::TENSOR) {
OPENVINO_ASSERT(false, "Quantized weight tensor has unexpected extra type: " + std::string(tensor->name));
}
// Otherwise it might be repacked quantized weights from another backend
OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a repacked quantized weights");
}
if (requant_type.has_value()) {
return requantize(tensor, requant_type.value());
}
// Extract quantized weights using the shared function
auto layout = ggml_openvino_get_extracted_layout(tensor);
if (layout.total_size == 0) {
OPENVINO_THROW("Unsupported quantized type for ", tensor->name, " type=", ggml_type_name(tensor->type));
}
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
ov::Tensor weights(weight_type, node_shape);
ov::Tensor scales(ov::element::f16, scale_shape);
ov::Tensor biases(ov::element::f16, scale_shape);
auto result = extract_quantized_weights(tensor, tensor->data, weights, scales, biases);
std::shared_ptr<ov::Node> result = process_weight_tensor(tensor, tensor->data, nullptr);
result->set_friendly_name(tensor->name);
// Cache the quantized weight node for future reuse
if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) {
if (ggml_is_quantized(tensor->type)) {
std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
s_quantized_weight_cache[tensor->data] = result;
GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);

View File

@@ -179,12 +179,9 @@ public:
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor,
std::optional<ExtraQuantType> requant_type = std::nullopt);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor);
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
ggml_cgraph * cgraph,
std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph);
const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;

View File

@@ -0,0 +1,177 @@
#include "ggml-openvino-extra.h"
#include "ggml-impl.h"
ov::Core & ov_singleton_core() {
static ov::Core core;
return core;
}
// =====================================================
// Device Configuration Implementations
// =====================================================
void ggml_openvino_device_config::init() {
if (initialized) {
return;
}
device_name = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
auto available_devices = ov_singleton_core().get_available_devices();
if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) {
GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str());
device_name = "CPU";
}
is_npu = (device_name == "NPU");
initialized = true;
}
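A hedged usage sketch of the env-driven selection and CPU fallback above (not part of the commit; setenv is POSIX-only, and the headers from this file are assumed to be on the include path):

// Hedged sketch: exercise the device-config singleton declared in ggml-openvino-extra.h.
#include <cstdio>
#include <cstdlib>
#include "ggml-openvino-extra.h"

int main() {
    setenv("GGML_OPENVINO_DEVICE", "NPU", 1);  // request NPU; init() falls back to CPU if unavailable
    ggml_openvino_init_device_config();        // one-time init, normally called during backend init
    std::printf("device = %s, is_npu = %d\n",
                ggml_openvino_get_device_name().c_str(),
                (int) ggml_openvino_is_npu());
    return 0;
}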
// Get the global device config singleton
ggml_openvino_device_config & ggml_openvino_get_device_config() {
static ggml_openvino_device_config config;
return config;
}
// Initialize device config (call during backend init)
void ggml_openvino_init_device_config() {
ggml_openvino_get_device_config().init();
}
// Get the device name
const std::string & ggml_openvino_get_device_name() {
return ggml_openvino_get_device_config().device_name;
}
// Check if running on NPU
bool ggml_openvino_is_npu() {
return ggml_openvino_get_device_config().is_npu;
}
// Get requantization type for a tensor type (returns nullopt if no requant needed)
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
if (!ggml_openvino_is_npu()) {
return std::nullopt;
}
// NPU requantization rules
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
return ExtraQuantType::Q4_0_128;
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q5_K:
return ExtraQuantType::F16;
default:
return std::nullopt;
}
}
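Once the device config is initialized, the mapping above can be queried per ggml tensor type; a hedged sketch of that lookup (again assuming the backend headers are available):

// Hedged sketch: query the NPU requantization rule for a few ggml tensor types.
// On non-NPU devices ggml_openvino_get_requant_type() returns std::nullopt for every type.
#include <cstdio>
#include <optional>
#include "ggml-openvino-extra.h"

static const char * describe(std::optional<ExtraQuantType> t) {
    if (!t) return "no requant";
    switch (*t) {
        case ExtraQuantType::Q4_0_128: return "Q4_0_128";
        case ExtraQuantType::F16:      return "F16";
        default:                       return "other";
    }
}

int main() {
    ggml_openvino_init_device_config();
    std::printf("Q4_K -> %s\n", describe(ggml_openvino_get_requant_type(GGML_TYPE_Q4_K)));
    std::printf("Q6_K -> %s\n", describe(ggml_openvino_get_requant_type(GGML_TYPE_Q6_K)));
    std::printf("Q8_0 -> %s\n", describe(ggml_openvino_get_requant_type(GGML_TYPE_Q8_0)));
    return 0;
}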
// =====================================================
// Extracted Layout Calculation
// =====================================================
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
ggml_openvino_extracted_layout layout = {};
if (!ggml_is_quantized(tensor->type)) {
return layout;
}
// Only handle 2D weight tensors
if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
return layout;
}
int64_t n_elements = ggml_nelements(tensor);
const size_t alignment = 64; // Good for SIMD
// Check if requantization is needed (NPU-specific)
auto requant_type = ggml_openvino_get_requant_type(tensor->type);
if (requant_type.has_value()) {
layout.is_requant = true;
layout.requant_type = requant_type;
// Special case: requant to F16 - just store F16 weights, no scales/biases
if (requant_type.value() == ExtraQuantType::F16) {
layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes
layout.total_size = layout.weights_size;
layout.weights_offset = 0;
// No scales/biases for F16
return layout;
}
// Requant to different quantized format (e.g., Q4_0_128)
switch (requant_type.value()) {
case ExtraQuantType::Q4_0_128:
layout.is_u4 = true;
layout.weights_per_block = 128;
break;
case ExtraQuantType::Q8_0_32:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported requant type - fall through to normal extraction
layout.is_requant = false;
layout.requant_type = std::nullopt;
break;
}
if (layout.is_requant) {
// Calculate sizes for requantized format
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
layout.biases_size = n_blocks * sizeof(uint16_t);
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset =
layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
return layout;
}
}
// Normal extraction (no requant) - determine format based on tensor type
switch (tensor->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
layout.is_u4 = true;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q8_0:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q6_K:
layout.is_u4 = false;
layout.weights_per_block = 16;
break;
case GGML_TYPE_Q5_K:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported quantization type
return layout;
}
// Calculate sizes
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
// Scales and biases: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// Layout in buffer: [weights | scales | biases] with alignment
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
return layout;
}
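The offset arithmetic above is plain align-up math over a [weights | scales | biases] layout; the following self-contained sketch reproduces it for a hypothetical 4096x4096 U4 weight with 32 weights per block, so the resulting offsets can be checked by hand:

// Self-contained check of the [weights | scales | biases] layout arithmetic.
// The numbers are illustrative (a hypothetical 4096x4096 U4 weight, 32 weights per block).
#include <cstdint>
#include <cstdio>

static size_t align_up(size_t x, size_t a) { return ((x + a - 1) / a) * a; }

int main() {
    const int64_t n_elements        = 4096LL * 4096LL;
    const int64_t weights_per_block = 32;
    const size_t  alignment         = 64;

    size_t  weights_size = n_elements / 2;               // U4: two weights per byte
    int64_t n_blocks     = n_elements / weights_per_block;
    size_t  scales_size  = n_blocks * sizeof(uint16_t);  // F16 scale per block
    size_t  biases_size  = n_blocks * sizeof(uint16_t);  // F16 bias per block

    size_t weights_offset = 0;
    size_t scales_offset  = align_up(weights_size, alignment);
    size_t biases_offset  = scales_offset + align_up(scales_size, alignment);
    size_t total_size     = biases_offset + biases_size;

    std::printf("weights @ %zu (%zu B), scales @ %zu (%zu B), biases @ %zu (%zu B), total %zu B\n",
                weights_offset, weights_size, scales_offset, scales_size,
                biases_offset, biases_size, total_size);
    return 0;
}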

View File

@@ -1,16 +1,20 @@
#pragma once
#include "ggml.h"
#include "openvino/runtime/core.hpp"
#include <cstdlib>
#include <memory>
#include <optional>
#include <openvino/core/node.hpp>
#include <openvino/runtime/tensor.hpp>
#include <optional>
#include <string>
#include "ggml.h"
// ExtraQuantType enum - defines requantization target formats
enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };
ov::Core & ov_singleton_core();
// =====================================================
// Global Device Configuration (singleton)
// =====================================================
@@ -21,56 +25,23 @@ struct ggml_openvino_device_config {
bool is_npu = false;
bool initialized = false;
void init() {
if (initialized) return;
const char* env = std::getenv("GGML_OPENVINO_DEVICE");
if (env) {
device_name = env;
is_npu = (device_name == "NPU");
}
initialized = true;
}
void init();
};
// Get the global device config singleton
inline ggml_openvino_device_config& ggml_openvino_get_device_config() {
static ggml_openvino_device_config config;
return config;
}
ggml_openvino_device_config & ggml_openvino_get_device_config();
// Initialize device config (call during backend init)
inline void ggml_openvino_init_device_config() {
ggml_openvino_get_device_config().init();
}
void ggml_openvino_init_device_config();
// Get the device name
inline const std::string& ggml_openvino_get_device_name() {
return ggml_openvino_get_device_config().device_name;
}
const std::string & ggml_openvino_get_device_name();
// Check if running on NPU
inline bool ggml_openvino_is_npu() {
return ggml_openvino_get_device_config().is_npu;
}
bool ggml_openvino_is_npu();
// Get requantization type for a tensor type (returns nullopt if no requant needed)
inline std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
if (!ggml_openvino_is_npu()) {
return std::nullopt;
}
// NPU requantization rules
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
return ExtraQuantType::Q4_0_128;
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q5_K:
return ExtraQuantType::F16;
default:
return std::nullopt;
}
}
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type);
// =====================================================
// OpenVINO Tensor Extra Types
@@ -140,108 +111,4 @@ struct ggml_openvino_extracted_layout {
};
// Calculate the buffer layout for extracted quantized data
inline ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
ggml_openvino_extracted_layout layout = {};
if (!ggml_is_quantized(tensor->type)) {
return layout;
}
// Only handle 2D weight tensors
if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
return layout;
}
int64_t n_elements = ggml_nelements(tensor);
const size_t alignment = 64; // Good for SIMD
// Check if requantization is needed (NPU-specific)
auto requant_type = ggml_openvino_get_requant_type(tensor->type);
if (requant_type.has_value()) {
layout.is_requant = true;
layout.requant_type = requant_type;
// Special case: requant to F16 - just store F16 weights, no scales/biases
if (requant_type.value() == ExtraQuantType::F16) {
layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes
layout.total_size = layout.weights_size;
layout.weights_offset = 0;
// No scales/biases for F16
return layout;
}
// Requant to different quantized format (e.g., Q4_0_128)
switch (requant_type.value()) {
case ExtraQuantType::Q4_0_128:
layout.is_u4 = true;
layout.weights_per_block = 128;
break;
case ExtraQuantType::Q8_0_32:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported requant type - fall through to normal extraction
layout.is_requant = false;
layout.requant_type = std::nullopt;
break;
}
if (layout.is_requant) {
// Calculate sizes for requantized format
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
layout.biases_size = n_blocks * sizeof(uint16_t);
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
return layout;
}
}
// Normal extraction (no requant) - determine format based on tensor type
switch (tensor->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
layout.is_u4 = true;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q8_0:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q6_K:
layout.is_u4 = false;
layout.weights_per_block = 16;
break;
case GGML_TYPE_Q5_K:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported quantization type
return layout;
}
// Calculate sizes
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
// Scales and biases: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// Layout in buffer: [weights | scales | biases] with alignment
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
return layout;
}
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);

View File

@@ -12,7 +12,11 @@
#include <cstring>
#include <memory>
#include <mutex>
#include <openvino/core/type/element_type.hpp>
#include <openvino/openvino.hpp>
#include <openvino/runtime/allocator.hpp>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
#include <openvino/runtime/tensor.hpp>
#include <set>
#include <string>
@@ -48,7 +52,8 @@ struct ggml_backend_openvino_buffer_context {
// For non-weight buffers (KV cache, compute), we still use contiguous allocation
void * data;
size_t size;
bool is_weight_buffer; // Set when buffer usage is set to WEIGHTS
std::shared_ptr<ov::Tensor> ov_tensor;
// Track all extras for cleanup
std::vector<ggml_openvino_extra_base *> tensor_extras;
@@ -57,18 +62,42 @@ struct ggml_backend_openvino_buffer_context {
device(device),
name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
data(nullptr),
size(size),
is_weight_buffer(false) {
// Allocate aligned contiguous memory
if (size > 0) {
size(size) {
if (size == 0) {
return;
}
const auto & device_name = ggml_openvino_get_device_name();
auto & core = ov_singleton_core();
if (device_name == "CPU") {
#ifdef _WIN32
data = _aligned_malloc(size, GGML_OPENVINO_BUFFER_ALIGNMENT);
data = _aligned_malloc(alloc_size, GGML_OPENVINO_BUFFER_ALIGNMENT);
#else
data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size);
#endif
if (data == nullptr) {
GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size);
}
ov_tensor = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
} else if (device_name == "GPU") {
auto gpu_context = core.get_default_context("GPU").as<ov::intel_gpu::ocl::ClContext>();
auto usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
data = usm_tensor.get();
ov_tensor = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
} else {
auto npu_context = core.get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size});
data = l0_tensor.get();
ov_tensor = std::make_shared<ov::intel_npu::level_zero::ZeroBufferTensor>(std::move(l0_tensor));
}
if (data == nullptr) {
GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size);
return;
}
if (reinterpret_cast<uintptr_t>(data) % GGML_OPENVINO_BUFFER_ALIGNMENT != 0) {
GGML_LOG_ERROR("%s: %s buffer is not aligned to %d bytes\n", __func__, device_name.c_str(),
GGML_OPENVINO_BUFFER_ALIGNMENT);
GGML_ABORT("fatal error");
}
}
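The point of the branches above is that every device returns a host-visible pointer that ggml can memcpy into, while OpenVINO can later wrap the same memory without a copy. A condensed, hedged sketch of just the GPU path, mirroring the calls used in this constructor (error handling and the CPU/NPU branches omitted):

// Condensed sketch of the GPU shared-buffer path: allocate a USM host tensor via the
// GPU remote context and hand its raw pointer to ggml, keeping the tensor alive alongside it.
#include <cstddef>
#include <memory>
#include <openvino/openvino.hpp>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>

void * alloc_gpu_shared(ov::Core & core, size_t size, std::shared_ptr<ov::Tensor> & keep_alive) {
    auto gpu_context = core.get_default_context("GPU").as<ov::intel_gpu::ocl::ClContext>();
    auto usm_tensor  = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
    void * data      = usm_tensor.get();  // host-visible pointer ggml writes to directly
    keep_alive = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
    return data;
}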
@@ -78,15 +107,12 @@ struct ggml_backend_openvino_buffer_context {
delete extra;
}
tensor_extras.clear();
// Free contiguous memory
if (data != nullptr) {
if (data && ggml_openvino_get_device_name() == "CPU") {
#ifdef _WIN32
_aligned_free(data);
#else
free(data);
#endif
data = nullptr;
}
}
};
@@ -156,57 +182,26 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
}
if (layout.total_size > 0) {
// Quantized weight tensor with extraction/requantization
uint8_t * buf_base = (uint8_t *) tensor->data;
// 2D shape for weights [rows, cols]
ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
try {
std::shared_ptr<ov::Node> constant;
std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, buf_base);
constant->set_friendly_name(tensor->name);
if (layout.is_requant && layout.requant_type.has_value()) {
// Requantization path
if (layout.requant_type.value() == ExtraQuantType::F16) {
// Requant to F16: create F16 tensor with external memory, requantize fills it
ov::Tensor weights(ov::element::f16, weight_shape, buf_base);
ov::Tensor dummy_scales, dummy_biases; // Not used for F16
// requantize_to_buffers fills weights and returns a Constant wrapping it
constant = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales,
dummy_biases);
// Store in tensor->extra (use weight_extra since it's F16)
auto * extra = new ggml_openvino_weight_extra(constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
} else {
// Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
constant = requantize_to_buffers(tensor, data, layout.requant_type.value(),
layout.weights_per_block, weights, scales, biases);
// Store in tensor->extra
auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
std::move(biases), constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
layout.is_u4 ? 4 : 8, layout.weights_per_block);
}
// Store in tensor->extra
if (layout.is_requant && layout.requant_type.has_value() &&
layout.requant_type.value() == ExtraQuantType::F16) {
// F16 requant case - use weight_extra
auto * extra = new ggml_openvino_weight_extra(constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
} else {
// Normal extraction path (no requant)
// Quantized case - use quantized_weight_extra
// Create tensors with external memory (already filled by process_weight_tensor)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
@@ -214,16 +209,20 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
constant = extract_quantized_weights(tensor, data, weights, scales, biases);
// Store in tensor->extra
auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
std::move(biases), constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
if (layout.is_requant) {
GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
layout.is_u4 ? 4 : 8, layout.weights_per_block);
} else {
int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
}
}
} catch (const std::exception & e) {
@@ -233,32 +232,9 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
}
} else if (is_weight_buffer && is_full_tensor_set && is_2d &&
(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) {
// F16/F32/BF16 weight tensor - copy data and create shared-memory constant
memcpy((char *) tensor->data + offset, data, size);
// F16/F32/BF16 weight tensor
try {
// Get OpenVINO element type
ov::element::Type element_type;
switch (tensor->type) {
case GGML_TYPE_F32:
element_type = ov::element::f32;
break;
case GGML_TYPE_F16:
element_type = ov::element::f16;
break;
case GGML_TYPE_BF16:
element_type = ov::element::bf16;
break;
default:
return; // Should not happen
}
// Create 2D shape (OpenVINO expects [rows, cols])
ov::Shape shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
// Create ov::Tensor with external memory, then wrap with Constant
ov::Tensor ov_tensor(element_type, shape, tensor->data);
auto constant = std::make_shared<ov::op::v0::Constant>(ov_tensor);
std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, tensor->data);
constant->set_friendly_name(tensor->name);
// Store in tensor->extra
@@ -418,7 +394,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(in
}
// Check if a buffer is an OpenVINO buffer
static bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
}

View File

@@ -569,6 +569,112 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType
return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases);
}
std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
GGML_ASSERT(tensor != nullptr);
GGML_ASSERT(data != nullptr);
// Get 2D shape for weights [rows, cols]
ov::Shape node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
// Handle F16/F32/BF16 weights
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
ov::element::Type element_type;
switch (tensor->type) {
case GGML_TYPE_F32:
element_type = ov::element::f32;
break;
case GGML_TYPE_F16:
element_type = ov::element::f16;
break;
case GGML_TYPE_BF16:
element_type = ov::element::bf16;
break;
default:
OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path");
}
if (output_base_ptr) {
// Using external buffer - copy data and create shared-memory constant
size_t tensor_bytes = ggml_nbytes(tensor);
memcpy(output_base_ptr, data, tensor_bytes);
ov::Tensor ov_tensor(element_type, node_shape, output_base_ptr);
return std::make_shared<ov::op::v0::Constant>(ov_tensor);
} else {
// Allocate internal buffer
ov::Tensor weights(element_type, node_shape);
memcpy(weights.data(), data, ggml_nelements(tensor) * element_type.size());
return std::make_shared<ov::op::v0::Constant>(weights);
}
}
// Handle quantized weights
if (!ggml_is_quantized(tensor->type)) {
OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
}
auto layout = ggml_openvino_get_extracted_layout(tensor);
if (layout.total_size == 0) {
OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
}
std::shared_ptr<ov::Node> result;
if (layout.is_requant && layout.requant_type.has_value()) {
// Requantization path
if (layout.requant_type.value() == ExtraQuantType::F16) {
// Requant to F16
ov::Tensor weights;
if (output_base_ptr) {
weights = ov::Tensor(ov::element::f16, node_shape,
static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
} else {
weights = ov::Tensor(ov::element::f16, node_shape);
}
ov::Tensor dummy_scales, dummy_biases; // Not used for F16
result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_biases);
} else {
// Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
ov::Tensor weights, scales, biases;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
} else {
weights = ov::Tensor(weight_type, node_shape);
scales = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, scale_shape);
}
result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
scales, biases);
}
} else {
// Normal extraction path (no requant)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
ov::Tensor weights, scales, biases;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
} else {
weights = ov::Tensor(weight_type, node_shape);
scales = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, scale_shape);
}
result = extract_quantized_weights(tensor, data, weights, scales, biases);
}
return result;
}
void quantize_q4_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,

View File

@@ -78,6 +78,16 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
ov::Tensor & scales,
ov::Tensor & biases);
// Process weight tensor and create an OpenVINO constant node
// Handles F16/F32/BF16 and quantized weights, with optional requantization
// If output_base_ptr is nullptr, allocates internal buffers (for decoder use)
// If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use)
// Returns the weight constant node
std::shared_ptr<ov::Node> process_weight_tensor(
const ggml_tensor * tensor,
const void * data, // Source data pointer (may differ from tensor->data)
void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation)
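A hedged usage sketch of the two calling modes described above (not part of the commit; the header name and a loaded 2D ggml weight tensor `t` are assumptions):

// Hedged sketch: the two ways to call process_weight_tensor().
// Assumes this header (ggml-quants.hpp) and a 2D ggml weight tensor `t` whose source bytes are in `src`.
#include "ggml-openvino-extra.h"
#include "ggml-quants.hpp"

std::shared_ptr<ov::Node> build_weight_constant(ggml_tensor * t, const void * src, bool use_shared_buffer) {
    if (!use_shared_buffer) {
        // Decoder path: the helper allocates its own ov::Tensor storage.
        return process_weight_tensor(t, src, /*output_base_ptr=*/nullptr);
    }
    // Backend-buffer path: extract/requantize straight into the tensor's backend allocation,
    // laid out as computed by ggml_openvino_get_extracted_layout(t).
    return process_weight_tensor(t, src, /*output_base_ptr=*/t->data);
}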
void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk);
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,

View File

@@ -107,7 +107,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
infer_request_cache.erase(key);
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static);
decoder_end_time = ggml_time_us();
@@ -255,7 +255,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
infer_request_cache_prefill.erase(key);
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
is_static, true, prefill_chunk_size);
@@ -404,21 +404,6 @@ ov::AnyMap get_ov_compile_config(const std::string & device) {
return config;
}
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device) {
// Use singleton to check if NPU (device param kept for API compatibility)
if (ggml_openvino_is_npu()) {
return {
{GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q6_K, ExtraQuantType::F16 },
{GGML_TYPE_Q5_K, ExtraQuantType::F16 },
};
}
return {};
GGML_UNUSED(device);
}
bool is_naive(ggml_cgraph * cgraph) {
constexpr int naive_graph_size_threshold = 20;
return cgraph->n_nodes < naive_graph_size_threshold;

View File

@@ -73,8 +73,6 @@ graph_key compute_graph_key(struct ggml_cgraph * cgraph);
ov::AnyMap get_ov_compile_config(const std::string & device);
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device);
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
const std::string & param_name);