From 72bba828dfc539ec1ed979d2acf587b3325219bc Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 18 Dec 2025 17:03:03 +0800 Subject: [PATCH] Use shared_buffer for GPU NPU; Refactor --- ggml/src/ggml-openvino/CMakeLists.txt | 3 +- ggml/src/ggml-openvino/ggml-decoder.cpp | 78 ++------ ggml/src/ggml-openvino/ggml-decoder.h | 7 +- .../src/ggml-openvino/ggml-openvino-extra.cpp | 177 ++++++++++++++++++ ggml/src/ggml-openvino/ggml-openvino-extra.h | 159 ++-------------- ggml/src/ggml-openvino/ggml-openvino.cpp | 154 +++++++-------- ggml/src/ggml-openvino/ggml-quants.cpp | 106 +++++++++++ ggml/src/ggml-openvino/ggml-quants.hpp | 10 + ggml/src/ggml-openvino/utils.cpp | 19 +- ggml/src/ggml-openvino/utils.h | 2 - 10 files changed, 389 insertions(+), 326 deletions(-) create mode 100644 ggml/src/ggml-openvino/ggml-openvino-extra.cpp diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt index 3051a8b240..175b585661 100644 --- a/ggml/src/ggml-openvino/CMakeLists.txt +++ b/ggml/src/ggml-openvino/CMakeLists.txt @@ -1,4 +1,5 @@ find_package(OpenVINO REQUIRED) +find_package(OpenCL REQUIRED) include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake") @@ -10,7 +11,7 @@ ggml_add_backend_library(ggml-openvino ${GGML_HEADERS_OPENVINO} ) -target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb) +target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL) if (GGML_OPENVINO) if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 409a16e816..2d6437f069 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -3,6 +3,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" #include "ggml-openvino-extra.h" +#include "ggml-openvino.h" #include "ggml-quants.hpp" #include @@ -471,9 +472,7 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name // return kv_param_res_names; // } -std::map> GgmlOvDecoder::create_weight_nodes( - ggml_cgraph * cgraph, - std::map types_to_requantize) { +std::map> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) { std::map> model_weights; static std::mutex weights_mutex; auto * nodes = cgraph->nodes; @@ -498,10 +497,7 @@ std::map> GgmlOvDecoder::create_weight_no } } if (should_create) { - auto requant_type = types_to_requantize.count(src->type) ? 
- std::optional(types_to_requantize.at(src->type)) : - std::nullopt; - auto weight_node = create_weight_node(src, requant_type); + auto weight_node = create_weight_node(src); weight_node->set_friendly_name(src_name); { std::lock_guard lock(weights_mutex); @@ -520,11 +516,14 @@ std::map> GgmlOvDecoder::create_weight_no static std::unordered_map> s_quantized_weight_cache; static std::mutex s_quantized_weight_cache_mutex; -std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, - std::optional requant_type) { +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) { // Check if we have a pre-built constant from the OpenVINO backend buffer // This is set during ggml_backend_openvino_buffer_set_tensor - if (tensor->extra != nullptr && !requant_type.has_value()) { + if (tensor->extra) { + if (!ggml_backend_buffer_is_openvino(tensor->buffer)) { + OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) + + " Possibly this is a cpu backend repacked quantized weights"); + } // Cast to our extra base type and check the type auto * extra_base = static_cast(tensor->extra); @@ -547,7 +546,7 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor // Fallback: Check static cache for quantized weights (keyed by data pointer) // This handles cases where tensors weren't loaded through OpenVINO buffer - if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) { + if (ggml_is_quantized(tensor->type)) { std::lock_guard lock(s_quantized_weight_cache_mutex); auto it = s_quantized_weight_cache.find(tensor->data); if (it != s_quantized_weight_cache.end()) { @@ -565,64 +564,11 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor ggml_type_name(tensor->type)); } - auto node_type = get_ov_type(tensor); - auto node_shape = get_shape(tensor); - auto ne_total = ggml_nelements(tensor); - - OPENVINO_ASSERT(node_shape[0] == 1, "Got 4D weights, expect all weights to be 2D: ", tensor->name); - node_shape.erase(node_shape.begin()); - OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name); - node_shape.erase(node_shape.begin()); - - // F16 and F32 case - if (node_type != ov::element::dynamic) { - ov::Tensor weights(node_type, node_shape); - memcpy(weights.data(), tensor->data, ne_total * node_type.size()); - std::shared_ptr weight_node = std::make_shared(weights); - // Disabled because it triggers a bug in NPUW, no performance impact on CPU GPU - // if (node_type == ov::element::f16) { - // weight_node = std::make_shared(weight_node, ov::element::f32); - // } - weight_node->set_friendly_name(tensor->name); - return weight_node; - } - - // Quantized case - extra should be nullptr (not our type) - // Our ggml_openvino_weight_extra is only set for F16/F32 weights - if (tensor->extra != nullptr) { - // Check if it's our type - if so, something is wrong - auto * extra_base = static_cast(tensor->extra); - if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT || - extra_base->type == ggml_openvino_extra_base::Type::TENSOR) { - OPENVINO_ASSERT(false, "Quantized weight tensor has unexpected extra type: " + std::string(tensor->name)); - } - // Otherwise it might be repacked quantized weights from another backend - OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) + - " Possibly this is a repacked quantized weights"); - } - - if (requant_type.has_value()) { - return requantize(tensor, requant_type.value()); - } - - // Extract quantized weights using 
the shared function - auto layout = ggml_openvino_get_extracted_layout(tensor); - if (layout.total_size == 0) { - OPENVINO_THROW("Unsupported quantized type for ", tensor->name, " type=", ggml_type_name(tensor->type)); - } - - ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; - ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; - - ov::Tensor weights(weight_type, node_shape); - ov::Tensor scales(ov::element::f16, scale_shape); - ov::Tensor biases(ov::element::f16, scale_shape); - - auto result = extract_quantized_weights(tensor, tensor->data, weights, scales, biases); + std::shared_ptr result = process_weight_tensor(tensor, tensor->data, nullptr); result->set_friendly_name(tensor->name); // Cache the quantized weight node for future reuse - if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) { + if (ggml_is_quantized(tensor->type)) { std::lock_guard lock(s_quantized_weight_cache_mutex); s_quantized_weight_cache[tensor->data] = result; GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name); diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index edcd036785..0b302b9320 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -179,12 +179,9 @@ public: static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); - static std::shared_ptr create_weight_node(ggml_tensor * tensor, - std::optional requant_type = std::nullopt); + static std::shared_ptr create_weight_node(ggml_tensor * tensor); - static std::map> create_weight_nodes( - ggml_cgraph * cgraph, - std::map types_to_requantize = {}); + static std::map> create_weight_nodes(ggml_cgraph * cgraph); const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp new file mode 100644 index 0000000000..75b27c8fa8 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -0,0 +1,177 @@ +#include "ggml-openvino-extra.h" + +#include "ggml-impl.h" + +ov::Core & ov_singleton_core() { + static ov::Core core; + return core; +} + +// ===================================================== +// Device Configuration Implementations +// ===================================================== + +void ggml_openvino_device_config::init() { + if (initialized) { + return; + } + device_name = getenv("GGML_OPENVINO_DEVICE") ? 
getenv("GGML_OPENVINO_DEVICE") : "CPU"; + auto available_devices = ov_singleton_core().get_available_devices(); + if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) { + GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str()); + device_name = "CPU"; + } + is_npu = (device_name == "NPU"); + initialized = true; +} + +// Get the global device config singleton +ggml_openvino_device_config & ggml_openvino_get_device_config() { + static ggml_openvino_device_config config; + return config; +} + +// Initialize device config (call during backend init) +void ggml_openvino_init_device_config() { + ggml_openvino_get_device_config().init(); +} + +// Get the device name +const std::string & ggml_openvino_get_device_name() { + return ggml_openvino_get_device_config().device_name; +} + +// Check if running on NPU +bool ggml_openvino_is_npu() { + return ggml_openvino_get_device_config().is_npu; +} + +// Get requantization type for a tensor type (returns nullopt if no requant needed) +std::optional ggml_openvino_get_requant_type(ggml_type type) { + if (!ggml_openvino_is_npu()) { + return std::nullopt; + } + // NPU requantization rules + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + return ExtraQuantType::Q4_0_128; + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q5_K: + return ExtraQuantType::F16; + default: + return std::nullopt; + } +} + +// ===================================================== +// Extracted Layout Calculation +// ===================================================== + +ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { + ggml_openvino_extracted_layout layout = {}; + + if (!ggml_is_quantized(tensor->type)) { + return layout; + } + + // Only handle 2D weight tensors + if (tensor->ne[2] != 1 || tensor->ne[3] != 1) { + return layout; + } + + int64_t n_elements = ggml_nelements(tensor); + const size_t alignment = 64; // Good for SIMD + + // Check if requantization is needed (NPU-specific) + auto requant_type = ggml_openvino_get_requant_type(tensor->type); + if (requant_type.has_value()) { + layout.is_requant = true; + layout.requant_type = requant_type; + + // Special case: requant to F16 - just store F16 weights, no scales/biases + if (requant_type.value() == ExtraQuantType::F16) { + layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes + layout.total_size = layout.weights_size; + layout.weights_offset = 0; + // No scales/biases for F16 + return layout; + } + + // Requant to different quantized format (e.g., Q4_0_128) + switch (requant_type.value()) { + case ExtraQuantType::Q4_0_128: + layout.is_u4 = true; + layout.weights_per_block = 128; + break; + case ExtraQuantType::Q8_0_32: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + default: + // Unsupported requant type - fall through to normal extraction + layout.is_requant = false; + layout.requant_type = std::nullopt; + break; + } + + if (layout.is_requant) { + // Calculate sizes for requantized format + layout.weights_size = layout.is_u4 ? 
(n_elements / 2) : n_elements; + int64_t n_blocks = n_elements / layout.weights_per_block; + layout.scales_size = n_blocks * sizeof(uint16_t); + layout.biases_size = n_blocks * sizeof(uint16_t); + + layout.weights_offset = 0; + layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; + layout.biases_offset = + layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.biases_offset + layout.biases_size; + layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor)); + return layout; + } + } + + // Normal extraction (no requant) - determine format based on tensor type + switch (tensor->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + layout.is_u4 = true; + layout.weights_per_block = 32; + break; + case GGML_TYPE_Q8_0: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + case GGML_TYPE_Q6_K: + layout.is_u4 = false; + layout.weights_per_block = 16; + break; + case GGML_TYPE_Q5_K: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + default: + // Unsupported quantization type + return layout; + } + + // Calculate sizes + // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes + layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; + + // Scales and biases: F16 per block + int64_t n_blocks = n_elements / layout.weights_per_block; + layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes + layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes + + // Layout in buffer: [weights | scales | biases] with alignment + layout.weights_offset = 0; + layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; + layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.biases_offset + layout.biases_size; + + return layout; +} diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 99db870412..7e0138388f 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -1,16 +1,20 @@ #pragma once +#include "ggml.h" +#include "openvino/runtime/core.hpp" + #include #include -#include #include #include +#include #include -#include "ggml.h" // ExtraQuantType enum - defines requantization target formats enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; +ov::Core & ov_singleton_core(); + // ===================================================== // Global Device Configuration (singleton) // ===================================================== @@ -21,56 +25,23 @@ struct ggml_openvino_device_config { bool is_npu = false; bool initialized = false; - void init() { - if (initialized) return; - const char* env = std::getenv("GGML_OPENVINO_DEVICE"); - if (env) { - device_name = env; - is_npu = (device_name == "NPU"); - } - initialized = true; - } + void init(); }; // Get the global device config singleton -inline ggml_openvino_device_config& ggml_openvino_get_device_config() { - static ggml_openvino_device_config config; - return config; -} +ggml_openvino_device_config & ggml_openvino_get_device_config(); // Initialize device config (call during backend init) -inline void ggml_openvino_init_device_config() { - ggml_openvino_get_device_config().init(); -} +void ggml_openvino_init_device_config(); // Get the device name -inline const std::string& ggml_openvino_get_device_name() { - return 
ggml_openvino_get_device_config().device_name; -} +const std::string & ggml_openvino_get_device_name(); // Check if running on NPU -inline bool ggml_openvino_is_npu() { - return ggml_openvino_get_device_config().is_npu; -} +bool ggml_openvino_is_npu(); // Get requantization type for a tensor type (returns nullopt if no requant needed) -inline std::optional ggml_openvino_get_requant_type(ggml_type type) { - if (!ggml_openvino_is_npu()) { - return std::nullopt; - } - // NPU requantization rules - switch (type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_K: - return ExtraQuantType::Q4_0_128; - case GGML_TYPE_Q6_K: - case GGML_TYPE_Q5_K: - return ExtraQuantType::F16; - default: - return std::nullopt; - } -} +std::optional ggml_openvino_get_requant_type(ggml_type type); // ===================================================== // OpenVINO Tensor Extra Types @@ -140,108 +111,4 @@ struct ggml_openvino_extracted_layout { }; // Calculate the buffer layout for extracted quantized data -inline ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { - ggml_openvino_extracted_layout layout = {}; - - if (!ggml_is_quantized(tensor->type)) { - return layout; - } - - // Only handle 2D weight tensors - if (tensor->ne[2] != 1 || tensor->ne[3] != 1) { - return layout; - } - - int64_t n_elements = ggml_nelements(tensor); - const size_t alignment = 64; // Good for SIMD - - // Check if requantization is needed (NPU-specific) - auto requant_type = ggml_openvino_get_requant_type(tensor->type); - if (requant_type.has_value()) { - layout.is_requant = true; - layout.requant_type = requant_type; - - // Special case: requant to F16 - just store F16 weights, no scales/biases - if (requant_type.value() == ExtraQuantType::F16) { - layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes - layout.total_size = layout.weights_size; - layout.weights_offset = 0; - // No scales/biases for F16 - return layout; - } - - // Requant to different quantized format (e.g., Q4_0_128) - switch (requant_type.value()) { - case ExtraQuantType::Q4_0_128: - layout.is_u4 = true; - layout.weights_per_block = 128; - break; - case ExtraQuantType::Q8_0_32: - layout.is_u4 = false; - layout.weights_per_block = 32; - break; - default: - // Unsupported requant type - fall through to normal extraction - layout.is_requant = false; - layout.requant_type = std::nullopt; - break; - } - - if (layout.is_requant) { - // Calculate sizes for requantized format - layout.weights_size = layout.is_u4 ? 
(n_elements / 2) : n_elements; - int64_t n_blocks = n_elements / layout.weights_per_block; - layout.scales_size = n_blocks * sizeof(uint16_t); - layout.biases_size = n_blocks * sizeof(uint16_t); - - layout.weights_offset = 0; - layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; - layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; - layout.total_size = layout.biases_offset + layout.biases_size; - layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor)); - return layout; - } - } - - // Normal extraction (no requant) - determine format based on tensor type - switch (tensor->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_K: - layout.is_u4 = true; - layout.weights_per_block = 32; - break; - case GGML_TYPE_Q8_0: - layout.is_u4 = false; - layout.weights_per_block = 32; - break; - case GGML_TYPE_Q6_K: - layout.is_u4 = false; - layout.weights_per_block = 16; - break; - case GGML_TYPE_Q5_K: - layout.is_u4 = false; - layout.weights_per_block = 32; - break; - default: - // Unsupported quantization type - return layout; - } - - // Calculate sizes - // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes - layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; - - // Scales and biases: F16 per block - int64_t n_blocks = n_elements / layout.weights_per_block; - layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes - layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes - - // Layout in buffer: [weights | scales | biases] with alignment - layout.weights_offset = 0; - layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; - layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; - layout.total_size = layout.biases_offset + layout.biases_size; - - return layout; -} +ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 747d1b8a30..e20ae71e40 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -12,7 +12,11 @@ #include #include #include +#include #include +#include +#include +#include #include #include #include @@ -48,7 +52,8 @@ struct ggml_backend_openvino_buffer_context { // For non-weight buffers (KV cache, compute), we still use contiguous allocation void * data; size_t size; - bool is_weight_buffer; // Set when buffer usage is set to WEIGHTS + + std::shared_ptr ov_tensor; // Track all extras for cleanup std::vector tensor_extras; @@ -57,18 +62,42 @@ struct ggml_backend_openvino_buffer_context { device(device), name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)), data(nullptr), - size(size), - is_weight_buffer(false) { - // Allocate aligned contiguous memory - if (size > 0) { + size(size) { + if (size == 0) { + return; + } + + const auto & device_name = ggml_openvino_get_device_name(); + auto & core = ov_singleton_core(); + + if (device_name == "CPU") { #ifdef _WIN32 - data = _aligned_malloc(size, GGML_OPENVINO_BUFFER_ALIGNMENT); + data = _aligned_malloc(alloc_size, GGML_OPENVINO_BUFFER_ALIGNMENT); #else data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size); #endif - if (data == nullptr) { - GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size); - } + ov_tensor = std::make_shared(ov::element::u8, ov::Shape{size}, data); + } 
else if (device_name == "GPU") { + auto gpu_context = core.get_default_context("GPU").as(); + auto usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size}); + data = usm_tensor.get(); + ov_tensor = std::make_shared(std::move(usm_tensor)); + } else { + auto npu_context = core.get_default_context("NPU").as(); + auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size}); + data = l0_tensor.get(); + ov_tensor = std::make_shared(std::move(l0_tensor)); + } + + if (data == nullptr) { + GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size); + return; + } + + if (reinterpret_cast(data) % GGML_OPENVINO_BUFFER_ALIGNMENT != 0) { + GGML_LOG_ERROR("%s: %s buffer is not aligned to %d bytes\n", __func__, device_name.c_str(), + GGML_OPENVINO_BUFFER_ALIGNMENT); + GGML_ABORT("fatal error"); } } @@ -78,15 +107,12 @@ struct ggml_backend_openvino_buffer_context { delete extra; } tensor_extras.clear(); - - // Free contiguous memory - if (data != nullptr) { + if (data && ggml_openvino_get_device_name() == "CPU") { #ifdef _WIN32 _aligned_free(data); #else free(data); #endif - data = nullptr; } } }; @@ -156,57 +182,26 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer } if (layout.total_size > 0) { + // Quantized weight tensor with extraction/requantization uint8_t * buf_base = (uint8_t *) tensor->data; - // 2D shape for weights [rows, cols] - ov::Shape weight_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; - try { - std::shared_ptr constant; + std::shared_ptr constant = process_weight_tensor(tensor, data, buf_base); + constant->set_friendly_name(tensor->name); - if (layout.is_requant && layout.requant_type.has_value()) { - // Requantization path - if (layout.requant_type.value() == ExtraQuantType::F16) { - // Requant to F16: create F16 tensor with external memory, requantize fills it - ov::Tensor weights(ov::element::f16, weight_shape, buf_base); - ov::Tensor dummy_scales, dummy_biases; // Not used for F16 - // requantize_to_buffers fills weights and returns a Constant wrapping it - constant = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, - dummy_biases); - - // Store in tensor->extra (use weight_extra since it's F16) - auto * extra = new ggml_openvino_weight_extra(constant); - ctx->tensor_extras.push_back(extra); - tensor->extra = extra; - - GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); - } else { - // Requant to quantized format (Q4_0_128, Q8_0_32, etc.) - ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; - ov::Shape scale_shape = {static_cast(tensor->ne[1]), - static_cast(tensor->ne[0] / layout.weights_per_block)}; - - ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset); - ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); - ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset); - - constant = requantize_to_buffers(tensor, data, layout.requant_type.value(), - layout.weights_per_block, weights, scales, biases); - - // Store in tensor->extra - auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), - std::move(biases), constant); - ctx->tensor_extras.push_back(extra); - tensor->extra = extra; - - GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name, - layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? 
"Q4_0_128" : "Q8_0_32", - layout.is_u4 ? 4 : 8, layout.weights_per_block); - } + // Store in tensor->extra + if (layout.is_requant && layout.requant_type.has_value() && + layout.requant_type.value() == ExtraQuantType::F16) { + // F16 requant case - use weight_extra + auto * extra = new ggml_openvino_weight_extra(constant); + ctx->tensor_extras.push_back(extra); + tensor->extra = extra; + GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); } else { - // Normal extraction path (no requant) + // Quantized case - use quantized_weight_extra + // Create tensors with external memory (already filled by process_weight_tensor) ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; - int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block; + ov::Shape weight_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; ov::Shape scale_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0] / layout.weights_per_block)}; @@ -214,16 +209,20 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset); - constant = extract_quantized_weights(tensor, data, weights, scales, biases); - - // Store in tensor->extra auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), std::move(biases), constant); ctx->tensor_extras.push_back(extra); tensor->extra = extra; - GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__, - tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks); + if (layout.is_requant) { + GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name, + layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32", + layout.is_u4 ? 4 : 8, layout.weights_per_block); + } else { + int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block; + GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__, + tensor->name, layout.is_u4 ? 
4 : 8, layout.weights_size, n_blocks); + } } } catch (const std::exception & e) { @@ -233,32 +232,9 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer } } else if (is_weight_buffer && is_full_tensor_set && is_2d && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) { - // F16/F32/BF16 weight tensor - copy data and create shared-memory constant - memcpy((char *) tensor->data + offset, data, size); - + // F16/F32/BF16 weight tensor try { - // Get OpenVINO element type - ov::element::Type element_type; - switch (tensor->type) { - case GGML_TYPE_F32: - element_type = ov::element::f32; - break; - case GGML_TYPE_F16: - element_type = ov::element::f16; - break; - case GGML_TYPE_BF16: - element_type = ov::element::bf16; - break; - default: - return; // Should not happen - } - - // Create 2D shape (OpenVINO expects [rows, cols]) - ov::Shape shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; - - // Create ov::Tensor with external memory, then wrap with Constant - ov::Tensor ov_tensor(element_type, shape, tensor->data); - auto constant = std::make_shared(ov_tensor); + std::shared_ptr constant = process_weight_tensor(tensor, data, tensor->data); constant->set_friendly_name(tensor->name); // Store in tensor->extra @@ -418,7 +394,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(in } // Check if a buffer is an OpenVINO buffer -static bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) { +bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) { return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer; } diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 662f27be7a..6cacc7b034 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -569,6 +569,112 @@ std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases); } +std::shared_ptr process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) { + GGML_ASSERT(tensor != nullptr); + GGML_ASSERT(data != nullptr); + + // Get 2D shape for weights [rows, cols] + ov::Shape node_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; + + // Handle F16/F32/BF16 weights + if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { + ov::element::Type element_type; + switch (tensor->type) { + case GGML_TYPE_F32: + element_type = ov::element::f32; + break; + case GGML_TYPE_F16: + element_type = ov::element::f16; + break; + case GGML_TYPE_BF16: + element_type = ov::element::bf16; + break; + default: + OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path"); + } + + if (output_base_ptr) { + // Using external buffer - copy data and create shared-memory constant + size_t tensor_bytes = ggml_nbytes(tensor); + memcpy(output_base_ptr, data, tensor_bytes); + ov::Tensor ov_tensor(element_type, node_shape, output_base_ptr); + return std::make_shared(ov_tensor); + } else { + // Allocate internal buffer + ov::Tensor weights(element_type, node_shape); + memcpy(weights.data(), data, ggml_nelements(tensor) * element_type.size()); + return std::make_shared(weights); + } + } + + // Handle quantized weights + if (!ggml_is_quantized(tensor->type)) { + OPENVINO_THROW("Unsupported weight tensor type: ", 
ggml_type_name(tensor->type)); + } + + auto layout = ggml_openvino_get_extracted_layout(tensor); + if (layout.total_size == 0) { + OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type)); + } + + std::shared_ptr result; + + if (layout.is_requant && layout.requant_type.has_value()) { + // Requantization path + if (layout.requant_type.value() == ExtraQuantType::F16) { + // Requant to F16 + ov::Tensor weights; + if (output_base_ptr) { + weights = ov::Tensor(ov::element::f16, node_shape, + static_cast(output_base_ptr) + layout.weights_offset); + } else { + weights = ov::Tensor(ov::element::f16, node_shape); + } + ov::Tensor dummy_scales, dummy_biases; // Not used for F16 + result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_biases); + } else { + // Requant to quantized format (Q4_0_128, Q8_0_32, etc.) + ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; + + ov::Tensor weights, scales, biases; + if (output_base_ptr) { + uint8_t * buf_base = static_cast(output_base_ptr); + weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); + scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); + biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + } else { + weights = ov::Tensor(weight_type, node_shape); + scales = ov::Tensor(ov::element::f16, scale_shape); + biases = ov::Tensor(ov::element::f16, scale_shape); + } + + result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights, + scales, biases); + } + } else { + // Normal extraction path (no requant) + ov::element::Type weight_type = layout.is_u4 ? 
ov::element::u4 : ov::element::u8; + ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; + + ov::Tensor weights, scales, biases; + if (output_base_ptr) { + uint8_t * buf_base = static_cast(output_base_ptr); + weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset); + scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset); + biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + } else { + weights = ov::Tensor(weight_type, node_shape); + scales = ov::Tensor(ov::element::f16, scale_shape); + biases = ov::Tensor(ov::element::f16, scale_shape); + } + + result = extract_quantized_weights(tensor, data, weights, scales, biases); + } + + return result; +} + void quantize_q4_0(const float * x, ov::Tensor & weights_arr, ov::Tensor & scales_arr, diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 0f14a6ed2d..b1d286f1b8 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -78,6 +78,16 @@ std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, ov::Tensor & scales, ov::Tensor & biases); +// Process weight tensor and create an OpenVINO constant node +// Handles F16/F32/BF16 and quantized weights, with optional requantization +// If output_base_ptr is nullptr, allocates internal buffers (for decoder use) +// If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use) +// Returns the weight constant node +std::shared_ptr process_weight_tensor( + const ggml_tensor * tensor, + const void * data, // Source data pointer (may differ from tensor->data) + void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation) + void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, int64_t qk); void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 251fb82361..6d56af9318 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -107,7 +107,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin infer_request_cache.erase(key); std::shared_ptr model; - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static); decoder_end_time = ggml_time_us(); @@ -255,7 +255,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { infer_request_cache_prefill.erase(key); std::shared_ptr model; - auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); auto ggml_decoder_prefill = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, true, prefill_chunk_size); @@ -404,21 +404,6 @@ ov::AnyMap get_ov_compile_config(const std::string & device) { return config; } -std::map get_types_to_requant(const std::string & device) { - // Use singleton to check if NPU (device param kept for API compatibility) - if (ggml_openvino_is_npu()) { - return { - {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, - {GGML_TYPE_Q4_K, 
ExtraQuantType::Q4_0_128},
-            {GGML_TYPE_Q6_K, ExtraQuantType::F16 },
-            {GGML_TYPE_Q5_K, ExtraQuantType::F16 },
-        };
-    }
-    return {};
-    GGML_UNUSED(device);
-}
-
 bool is_naive(ggml_cgraph * cgraph) {
     constexpr int naive_graph_size_threshold = 20;
     return cgraph->n_nodes < naive_graph_size_threshold;
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 85bb3a2f88..81fb2c2035 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -73,8 +73,6 @@ graph_key compute_graph_key(struct ggml_cgraph * cgraph);
 
 ov::AnyMap get_ov_compile_config(const std::string & device);
 
-std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device);
-
 ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
 
 ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
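
Notes on the mechanisms above, as illustrative sketches rather than part of the diff; anything not taken directly from the patch is called out as an assumption.

The [weights | scales | biases] layout produced by ggml_openvino_get_extracted_layout is plain offset arithmetic with 64-byte alignment between sections. A minimal standalone sketch, assuming a hypothetical 4096x4096 Q4_0 weight (u4 packing, 32 weights per block, F16 scales and biases):

    #include <cstdint>
    #include <cstdio>

    // Round x up to the next multiple of a.
    static size_t round_up(size_t x, size_t a) { return ((x + a - 1) / a) * a; }

    int main() {
        const int64_t ne0 = 4096, ne1 = 4096;          // hypothetical 2D weight
        const int64_t n_elements = ne0 * ne1;          // 16'777'216
        const bool    is_u4 = true;                    // Q4_0 -> u4 weights
        const int64_t weights_per_block = 32;          // Q4_0 block size
        const size_t  alignment = 64;                  // same constant as the patch

        const size_t  weights_size = is_u4 ? n_elements / 2 : n_elements;  // 8'388'608 bytes
        const int64_t n_blocks     = n_elements / weights_per_block;       // 524'288 blocks
        const size_t  scales_size  = n_blocks * sizeof(uint16_t);          // F16 scales
        const size_t  biases_size  = n_blocks * sizeof(uint16_t);          // F16 biases

        const size_t weights_offset = 0;
        const size_t scales_offset  = round_up(weights_size, alignment);
        const size_t biases_offset  = scales_offset + round_up(scales_size, alignment);
        const size_t total_size     = biases_offset + biases_size;

        // Prints: weights @0 scales @8388608 biases @9437184 total 10485760
        printf("weights @%zu scales @%zu biases @%zu total %zu\n",
               weights_offset, scales_offset, biases_offset, total_size);
        return 0;
    }

The requantized Q4_0_128 / Q8_0_32 layouts use the same arithmetic with a different block size, and additionally clamp total_size to at least ggml_nbytes(tensor).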
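
Requantization for NPU is now decided centrally in ggml_openvino_get_requant_type: Q4_0, Q4_1 and Q4_K are regrouped into Q4_0_128 blocks, while Q5_K and Q6_K are expanded to F16, which skips scales and biases entirely. A small self-contained sketch of that decision and of the F16 special case, with the ggml and ExtraQuantType enums stubbed out so it compiles on its own:

    #include <cstdint>
    #include <cstdio>
    #include <optional>

    // Stand-ins for the ggml_type and ExtraQuantType enums used by the patch.
    enum class SrcType { Q4_0, Q4_1, Q4_K, Q5_K, Q6_K, Q8_0 };
    enum class RequantType { F16, Q4_0_128 };

    // Mirrors the NPU mapping in ggml_openvino_get_requant_type.
    static std::optional<RequantType> npu_requant_type(SrcType t) {
        switch (t) {
            case SrcType::Q4_0:
            case SrcType::Q4_1:
            case SrcType::Q4_K: return RequantType::Q4_0_128;
            case SrcType::Q5_K:
            case SrcType::Q6_K: return RequantType::F16;
            default:            return std::nullopt;  // e.g. Q8_0 is extracted as-is
        }
    }

    int main() {
        // F16 special case: a requantized 4096x4096 Q6_K weight stores only F16 weights,
        // no scales or biases, so total_size == n_elements * sizeof(uint16_t).
        const int64_t n_elements = 4096LL * 4096;
        if (npu_requant_type(SrcType::Q6_K) == RequantType::F16) {
            printf("F16 requant buffer: %lld bytes\n", (long long) (n_elements * 2));  // 33554432
        }
        return 0;
    }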
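
The buffer context now allocates per device: a plain aligned host allocation on CPU, a USM host tensor from the GPU remote context, and a Level Zero host tensor from the NPU remote context, with ov_tensor keeping the allocation alive. A condensed sketch of that dispatch; the concrete remote-context types in the .as<...>() casts (ov::intel_gpu::ocl::ClContext and ov::intel_npu::level_zero::ZeroContext) and their header paths are assumptions, while the create_usm_host_tensor / create_l0_host_tensor calls are as in the patch:

    #include <openvino/runtime/core.hpp>
    #include <openvino/runtime/intel_gpu/ocl/ocl.hpp>                // assumed header for ClContext
    #include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>  // assumed header for ZeroContext
    #include <cstdlib>
    #include <memory>
    #include <string>

    // Returns a tensor that owns the allocation and sets `data` to its host-visible pointer.
    static std::shared_ptr<ov::Tensor> alloc_backend_buffer(ov::Core & core, const std::string & device,
                                                            size_t size, void *& data) {
        if (device == "CPU") {
            data = std::aligned_alloc(64, size);  // assumes size is a multiple of 64; _aligned_malloc on Windows
            return std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
        }
        if (device == "GPU") {
            auto ctx = core.get_default_context("GPU").as<ov::intel_gpu::ocl::ClContext>();
            auto t   = ctx.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
            data = t.get();  // host-visible USM pointer, also reachable by GPU kernels
            return std::make_shared<ov::Tensor>(std::move(t));
        }
        auto ctx = core.get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
        auto t   = ctx.create_l0_host_tensor(ov::element::u8, ov::Shape{size});
        data = t.get();      // Level Zero host pointer
        return std::make_shared<ov::Tensor>(std::move(t));
    }

Only the CPU allocation is freed explicitly in the destructor; for GPU and NPU the memory belongs to the remote tensor held in ov_tensor.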
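
process_weight_tensor is now the single conversion path for both callers. The decoder passes output_base_ptr = nullptr and gets internally allocated ov::Tensor storage; the backend buffer path passes the tensor's slot inside the shared buffer, so the extracted or requantized data is written straight into device-visible memory at the offsets from the layout. Roughly how the two call sites use it; the return type is assumed here to be std::shared_ptr<ov::Node>:

    #include "ggml.h"
    #include "ggml-quants.hpp"  // process_weight_tensor (added by this patch)

    // Decoder path: no pre-allocated buffer, the constant owns freshly allocated storage.
    static std::shared_ptr<ov::Node> weight_for_decoder(ggml_tensor * t) {
        return process_weight_tensor(t, t->data, /*output_base_ptr=*/nullptr);
    }

    // Backend-buffer path: `src` is the data passed to set_tensor, `t->data` points into the
    // shared (aligned host / USM / Level Zero) buffer that receives the converted weights.
    static std::shared_ptr<ov::Node> weight_for_buffer(ggml_tensor * t, const void * src) {
        return process_weight_tensor(t, src, /*output_base_ptr=*/t->data);
    }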
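
create_weight_node also keeps a process-wide cache of quantized weight constants keyed by the tensor's data pointer, guarded by a mutex, so rebuilding a graph over the same weights does not re-extract them. The lookup / build / insert pattern, reduced to a generic sketch (the WeightConstant type and get_or_build helper are hypothetical):

    #include <functional>
    #include <memory>
    #include <mutex>
    #include <unordered_map>

    struct WeightConstant {};  // hypothetical stand-in for the cached constant

    static std::shared_ptr<WeightConstant> get_or_build(
            const void * key, const std::function<std::shared_ptr<WeightConstant>()> & build) {
        static std::unordered_map<const void *, std::shared_ptr<WeightConstant>> cache;
        static std::mutex mtx;
        {
            std::lock_guard<std::mutex> lock(mtx);
            auto it = cache.find(key);
            if (it != cache.end()) {
                return it->second;  // reuse the previously built constant
            }
        }
        auto value = build();       // build outside the lock; extraction is expensive
        std::lock_guard<std::mutex> lock(mtx);
        return cache.emplace(key, std::move(value)).first->second;
    }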