Use shared_buffer for GPU/NPU; Refactor

Authored by Yu, Zijun on 2025-12-18 17:03:03 +08:00; committed by Mustafa Cavus
parent 22d9c17a6f
commit 72bba828df
10 changed files with 389 additions and 326 deletions

View File

@@ -1,4 +1,5 @@
find_package(OpenVINO REQUIRED)
find_package(OpenCL REQUIRED)
include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")
@@ -10,7 +11,7 @@ ggml_add_backend_library(ggml-openvino
${GGML_HEADERS_OPENVINO}
)
target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb)
target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)
if (GGML_OPENVINO)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")

View File

@@ -3,6 +3,7 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino.h"
#include "ggml-quants.hpp"
#include <ggml-impl.h>
@@ -471,9 +472,7 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name
// return kv_param_res_names;
// }
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
ggml_cgraph * cgraph,
std::map<ggml_type, ExtraQuantType> types_to_requantize) {
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex;
auto * nodes = cgraph->nodes;
@@ -498,10 +497,7 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
}
}
if (should_create) {
auto requant_type = types_to_requantize.count(src->type) ?
std::optional<ExtraQuantType>(types_to_requantize.at(src->type)) :
std::nullopt;
auto weight_node = create_weight_node(src, requant_type);
auto weight_node = create_weight_node(src);
weight_node->set_friendly_name(src_name);
{
std::lock_guard<std::mutex> lock(weights_mutex);
@@ -520,11 +516,14 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
static std::mutex s_quantized_weight_cache_mutex;
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor,
std::optional<ExtraQuantType> requant_type) {
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
// Check if we have a pre-built constant from the OpenVINO backend buffer
// This is set during ggml_backend_openvino_buffer_set_tensor
if (tensor->extra != nullptr && !requant_type.has_value()) {
if (tensor->extra) {
if (!ggml_backend_buffer_is_openvino(tensor->buffer)) {
OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a cpu backend repacked quantized weights");
}
// Cast to our extra base type and check the type
auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
@@ -547,7 +546,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
// Fallback: Check static cache for quantized weights (keyed by data pointer)
// This handles cases where tensors weren't loaded through an OpenVINO buffer
if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) {
if (ggml_is_quantized(tensor->type)) {
std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
auto it = s_quantized_weight_cache.find(tensor->data);
if (it != s_quantized_weight_cache.end()) {
@@ -565,64 +564,11 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
ggml_type_name(tensor->type));
}
auto node_type = get_ov_type(tensor);
auto node_shape = get_shape(tensor);
auto ne_total = ggml_nelements(tensor);
OPENVINO_ASSERT(node_shape[0] == 1, "Got 4D weights, expect all weights to be 2D: ", tensor->name);
node_shape.erase(node_shape.begin());
OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name);
node_shape.erase(node_shape.begin());
// F16 and F32 case
if (node_type != ov::element::dynamic) {
ov::Tensor weights(node_type, node_shape);
memcpy(weights.data(), tensor->data, ne_total * node_type.size());
std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
// Disabled because it triggers a bug in NPUW; no performance impact on CPU/GPU
// if (node_type == ov::element::f16) {
// weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32);
// }
weight_node->set_friendly_name(tensor->name);
return weight_node;
}
// Quantized case - extra should be nullptr (not our type)
// Our ggml_openvino_weight_extra is only set for F16/F32 weights
if (tensor->extra != nullptr) {
// Check if it's our type - if so, something is wrong
auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT ||
extra_base->type == ggml_openvino_extra_base::Type::TENSOR) {
OPENVINO_ASSERT(false, "Quantized weight tensor has unexpected extra type: " + std::string(tensor->name));
}
// Otherwise it might be repacked quantized weights from another backend
OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a repacked quantized weights");
}
if (requant_type.has_value()) {
return requantize(tensor, requant_type.value());
}
// Extract quantized weights using the shared function
auto layout = ggml_openvino_get_extracted_layout(tensor);
if (layout.total_size == 0) {
OPENVINO_THROW("Unsupported quantized type for ", tensor->name, " type=", ggml_type_name(tensor->type));
}
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
ov::Tensor weights(weight_type, node_shape);
ov::Tensor scales(ov::element::f16, scale_shape);
ov::Tensor biases(ov::element::f16, scale_shape);
auto result = extract_quantized_weights(tensor, tensor->data, weights, scales, biases);
std::shared_ptr<ov::Node> result = process_weight_tensor(tensor, tensor->data, nullptr);
result->set_friendly_name(tensor->name);
// Cache the quantized weight node for future reuse
if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) {
if (ggml_is_quantized(tensor->type)) {
std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
s_quantized_weight_cache[tensor->data] = result;
GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);

View File

@@ -179,12 +179,9 @@ public:
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor,
std::optional<ExtraQuantType> requant_type = std::nullopt);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor);
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
ggml_cgraph * cgraph,
std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph);
const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;

View File

@@ -0,0 +1,177 @@
#include "ggml-openvino-extra.h"
#include "ggml-impl.h"
ov::Core & ov_singleton_core() {
static ov::Core core;
return core;
}
// =====================================================
// Device Configuration Implementations
// =====================================================
void ggml_openvino_device_config::init() {
if (initialized) {
return;
}
device_name = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
auto available_devices = ov_singleton_core().get_available_devices();
if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) {
GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str());
device_name = "CPU";
}
is_npu = (device_name == "NPU");
initialized = true;
}
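A hedged usage sketch of the env-driven selection and CPU fallback above (not part of the commit; setenv is POSIX-only, and the headers from this file are assumed to be on the include path):

// Hedged sketch: exercise the device-config singleton declared in ggml-openvino-extra.h.
#include <cstdio>
#include <cstdlib>
#include "ggml-openvino-extra.h"

int main() {
    setenv("GGML_OPENVINO_DEVICE", "NPU", 1);  // request NPU; init() falls back to CPU if unavailable
    ggml_openvino_init_device_config();        // one-time init, normally called during backend init
    std::printf("device = %s, is_npu = %d\n",
                ggml_openvino_get_device_name().c_str(),
                (int) ggml_openvino_is_npu());
    return 0;
}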
// Get the global device config singleton
ggml_openvino_device_config & ggml_openvino_get_device_config() {
static ggml_openvino_device_config config;
return config;
}
// Initialize device config (call during backend init)
void ggml_openvino_init_device_config() {
ggml_openvino_get_device_config().init();
}
// Get the device name
const std::string & ggml_openvino_get_device_name() {
return ggml_openvino_get_device_config().device_name;
}
// Check if running on NPU
bool ggml_openvino_is_npu() {
return ggml_openvino_get_device_config().is_npu;
}
// Get requantization type for a tensor type (returns nullopt if no requant needed)
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
if (!ggml_openvino_is_npu()) {
return std::nullopt;
}
// NPU requantization rules
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
return ExtraQuantType::Q4_0_128;
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q5_K:
return ExtraQuantType::F16;
default:
return std::nullopt;
}
}
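Once the device config is initialized, the mapping above can be queried per ggml tensor type; a hedged sketch of that lookup (again assuming the backend headers are available):

// Hedged sketch: query the NPU requantization rule for a few ggml tensor types.
// On non-NPU devices ggml_openvino_get_requant_type() returns std::nullopt for every type.
#include <cstdio>
#include <optional>
#include "ggml-openvino-extra.h"

static const char * describe(std::optional<ExtraQuantType> t) {
    if (!t) return "no requant";
    switch (*t) {
        case ExtraQuantType::Q4_0_128: return "Q4_0_128";
        case ExtraQuantType::F16:      return "F16";
        default:                       return "other";
    }
}

int main() {
    ggml_openvino_init_device_config();
    std::printf("Q4_K -> %s\n", describe(ggml_openvino_get_requant_type(GGML_TYPE_Q4_K)));
    std::printf("Q6_K -> %s\n", describe(ggml_openvino_get_requant_type(GGML_TYPE_Q6_K)));
    std::printf("Q8_0 -> %s\n", describe(ggml_openvino_get_requant_type(GGML_TYPE_Q8_0)));
    return 0;
}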
// =====================================================
// Extracted Layout Calculation
// =====================================================
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
ggml_openvino_extracted_layout layout = {};
if (!ggml_is_quantized(tensor->type)) {
return layout;
}
// Only handle 2D weight tensors
if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
return layout;
}
int64_t n_elements = ggml_nelements(tensor);
const size_t alignment = 64; // Good for SIMD
// Check if requantization is needed (NPU-specific)
auto requant_type = ggml_openvino_get_requant_type(tensor->type);
if (requant_type.has_value()) {
layout.is_requant = true;
layout.requant_type = requant_type;
// Special case: requant to F16 - just store F16 weights, no scales/biases
if (requant_type.value() == ExtraQuantType::F16) {
layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes
layout.total_size = layout.weights_size;
layout.weights_offset = 0;
// No scales/biases for F16
return layout;
}
// Requant to different quantized format (e.g., Q4_0_128)
switch (requant_type.value()) {
case ExtraQuantType::Q4_0_128:
layout.is_u4 = true;
layout.weights_per_block = 128;
break;
case ExtraQuantType::Q8_0_32:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported requant type - fall through to normal extraction
layout.is_requant = false;
layout.requant_type = std::nullopt;
break;
}
if (layout.is_requant) {
// Calculate sizes for requantized format
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
layout.biases_size = n_blocks * sizeof(uint16_t);
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset =
layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
return layout;
}
}
// Normal extraction (no requant) - determine format based on tensor type
switch (tensor->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
layout.is_u4 = true;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q8_0:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q6_K:
layout.is_u4 = false;
layout.weights_per_block = 16;
break;
case GGML_TYPE_Q5_K:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported quantization type
return layout;
}
// Calculate sizes
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
// Scales and biases: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// Layout in buffer: [weights | scales | biases] with alignment
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
return layout;
}
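The offset arithmetic above is plain align-up math over a [weights | scales | biases] layout; the following self-contained sketch reproduces it for a hypothetical 4096x4096 U4 weight with 32 weights per block, so the resulting offsets can be checked by hand:

// Self-contained check of the [weights | scales | biases] layout arithmetic.
// The numbers are illustrative (a hypothetical 4096x4096 U4 weight, 32 weights per block).
#include <cstdint>
#include <cstdio>

static size_t align_up(size_t x, size_t a) { return ((x + a - 1) / a) * a; }

int main() {
    const int64_t n_elements        = 4096LL * 4096LL;
    const int64_t weights_per_block = 32;
    const size_t  alignment         = 64;

    size_t  weights_size = n_elements / 2;               // U4: two weights per byte
    int64_t n_blocks     = n_elements / weights_per_block;
    size_t  scales_size  = n_blocks * sizeof(uint16_t);  // F16 scale per block
    size_t  biases_size  = n_blocks * sizeof(uint16_t);  // F16 bias per block

    size_t weights_offset = 0;
    size_t scales_offset  = align_up(weights_size, alignment);
    size_t biases_offset  = scales_offset + align_up(scales_size, alignment);
    size_t total_size     = biases_offset + biases_size;

    std::printf("weights @ %zu (%zu B), scales @ %zu (%zu B), biases @ %zu (%zu B), total %zu B\n",
                weights_offset, weights_size, scales_offset, scales_size,
                biases_offset, biases_size, total_size);
    return 0;
}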

View File

@@ -1,16 +1,20 @@
#pragma once
#include "ggml.h"
#include "openvino/runtime/core.hpp"
#include <cstdlib>
#include <memory>
#include <optional>
#include <openvino/core/node.hpp>
#include <openvino/runtime/tensor.hpp>
#include <optional>
#include <string>
#include "ggml.h"
// ExtraQuantType enum - defines requantization target formats
enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };
ov::Core & ov_singleton_core();
// =====================================================
// Global Device Configuration (singleton)
// =====================================================
@@ -21,56 +25,23 @@ struct ggml_openvino_device_config {
bool is_npu = false;
bool initialized = false;
void init() {
if (initialized) return;
const char* env = std::getenv("GGML_OPENVINO_DEVICE");
if (env) {
device_name = env;
is_npu = (device_name == "NPU");
}
initialized = true;
}
void init();
};
// Get the global device config singleton
inline ggml_openvino_device_config& ggml_openvino_get_device_config() {
static ggml_openvino_device_config config;
return config;
}
ggml_openvino_device_config & ggml_openvino_get_device_config();
// Initialize device config (call during backend init)
inline void ggml_openvino_init_device_config() {
ggml_openvino_get_device_config().init();
}
void ggml_openvino_init_device_config();
// Get the device name
inline const std::string& ggml_openvino_get_device_name() {
return ggml_openvino_get_device_config().device_name;
}
const std::string & ggml_openvino_get_device_name();
// Check if running on NPU
inline bool ggml_openvino_is_npu() {
return ggml_openvino_get_device_config().is_npu;
}
bool ggml_openvino_is_npu();
// Get requantization type for a tensor type (returns nullopt if no requant needed)
inline std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
if (!ggml_openvino_is_npu()) {
return std::nullopt;
}
// NPU requantization rules
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
return ExtraQuantType::Q4_0_128;
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q5_K:
return ExtraQuantType::F16;
default:
return std::nullopt;
}
}
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type);
// =====================================================
// OpenVINO Tensor Extra Types
@@ -140,108 +111,4 @@ struct ggml_openvino_extracted_layout {
};
// Calculate the buffer layout for extracted quantized data
inline ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
ggml_openvino_extracted_layout layout = {};
if (!ggml_is_quantized(tensor->type)) {
return layout;
}
// Only handle 2D weight tensors
if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
return layout;
}
int64_t n_elements = ggml_nelements(tensor);
const size_t alignment = 64; // Good for SIMD
// Check if requantization is needed (NPU-specific)
auto requant_type = ggml_openvino_get_requant_type(tensor->type);
if (requant_type.has_value()) {
layout.is_requant = true;
layout.requant_type = requant_type;
// Special case: requant to F16 - just store F16 weights, no scales/biases
if (requant_type.value() == ExtraQuantType::F16) {
layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes
layout.total_size = layout.weights_size;
layout.weights_offset = 0;
// No scales/biases for F16
return layout;
}
// Requant to different quantized format (e.g., Q4_0_128)
switch (requant_type.value()) {
case ExtraQuantType::Q4_0_128:
layout.is_u4 = true;
layout.weights_per_block = 128;
break;
case ExtraQuantType::Q8_0_32:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported requant type - fall through to normal extraction
layout.is_requant = false;
layout.requant_type = std::nullopt;
break;
}
if (layout.is_requant) {
// Calculate sizes for requantized format
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
layout.biases_size = n_blocks * sizeof(uint16_t);
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
return layout;
}
}
// Normal extraction (no requant) - determine format based on tensor type
switch (tensor->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
layout.is_u4 = true;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q8_0:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q6_K:
layout.is_u4 = false;
layout.weights_per_block = 16;
break;
case GGML_TYPE_Q5_K:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported quantization type
return layout;
}
// Calculate sizes
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
// Scales and biases: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// Layout in buffer: [weights | scales | biases] with alignment
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
return layout;
}
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);

View File

@@ -12,7 +12,11 @@
#include <cstring>
#include <memory>
#include <mutex>
#include <openvino/core/type/element_type.hpp>
#include <openvino/openvino.hpp>
#include <openvino/runtime/allocator.hpp>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
#include <openvino/runtime/tensor.hpp>
#include <set>
#include <string>
@@ -48,7 +52,8 @@ struct ggml_backend_openvino_buffer_context {
// For non-weight buffers (KV cache, compute), we still use contiguous allocation
void * data;
size_t size;
bool is_weight_buffer; // Set when buffer usage is set to WEIGHTS
std::shared_ptr<ov::Tensor> ov_tensor;
// Track all extras for cleanup
std::vector<ggml_openvino_extra_base *> tensor_extras;
@@ -57,18 +62,42 @@ struct ggml_backend_openvino_buffer_context {
device(device),
name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
data(nullptr),
size(size),
is_weight_buffer(false) {
// Allocate aligned contiguous memory
if (size > 0) {
size(size) {
if (size == 0) {
return;
}
const auto & device_name = ggml_openvino_get_device_name();
auto & core = ov_singleton_core();
if (device_name == "CPU") {
#ifdef _WIN32
data = _aligned_malloc(size, GGML_OPENVINO_BUFFER_ALIGNMENT);
data = _aligned_malloc(alloc_size, GGML_OPENVINO_BUFFER_ALIGNMENT);
#else
data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size);
#endif
if (data == nullptr) {
GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size);
}
ov_tensor = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
} else if (device_name == "GPU") {
auto gpu_context = core.get_default_context("GPU").as<ov::intel_gpu::ocl::ClContext>();
auto usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
data = usm_tensor.get();
ov_tensor = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
} else {
auto npu_context = core.get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size});
data = l0_tensor.get();
ov_tensor = std::make_shared<ov::intel_npu::level_zero::ZeroBufferTensor>(std::move(l0_tensor));
}
if (data == nullptr) {
GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size);
return;
}
if (reinterpret_cast<uintptr_t>(data) % GGML_OPENVINO_BUFFER_ALIGNMENT != 0) {
GGML_LOG_ERROR("%s: %s buffer is not aligned to %d bytes\n", __func__, device_name.c_str(),
GGML_OPENVINO_BUFFER_ALIGNMENT);
GGML_ABORT("fatal error");
}
}
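The point of the branches above is that every device returns a host-visible pointer that ggml can memcpy into, while OpenVINO can later wrap the same memory without a copy. A condensed, hedged sketch of just the GPU path, mirroring the calls used in this constructor (error handling and the CPU/NPU branches omitted):

// Condensed sketch of the GPU shared-buffer path: allocate a USM host tensor via the
// GPU remote context and hand its raw pointer to ggml, keeping the tensor alive alongside it.
#include <cstddef>
#include <memory>
#include <openvino/openvino.hpp>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>

void * alloc_gpu_shared(ov::Core & core, size_t size, std::shared_ptr<ov::Tensor> & keep_alive) {
    auto gpu_context = core.get_default_context("GPU").as<ov::intel_gpu::ocl::ClContext>();
    auto usm_tensor  = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
    void * data      = usm_tensor.get();  // host-visible pointer ggml writes to directly
    keep_alive = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
    return data;
}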
@@ -78,15 +107,12 @@ struct ggml_backend_openvino_buffer_context {
delete extra;
}
tensor_extras.clear();
// Free contiguous memory
if (data != nullptr) {
if (data && ggml_openvino_get_device_name() == "CPU") {
#ifdef _WIN32
_aligned_free(data);
#else
free(data);
#endif
data = nullptr;
}
}
};
@@ -156,57 +182,26 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
}
if (layout.total_size > 0) {
// Quantized weight tensor with extraction/requantization
uint8_t * buf_base = (uint8_t *) tensor->data;
// 2D shape for weights [rows, cols]
ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
try {
std::shared_ptr<ov::Node> constant;
std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, buf_base);
constant->set_friendly_name(tensor->name);
if (layout.is_requant && layout.requant_type.has_value()) {
// Requantization path
if (layout.requant_type.value() == ExtraQuantType::F16) {
// Requant to F16: create F16 tensor with external memory, requantize fills it
ov::Tensor weights(ov::element::f16, weight_shape, buf_base);
ov::Tensor dummy_scales, dummy_biases; // Not used for F16
// requantize_to_buffers fills weights and returns a Constant wrapping it
constant = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales,
dummy_biases);
// Store in tensor->extra (use weight_extra since it's F16)
auto * extra = new ggml_openvino_weight_extra(constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
} else {
// Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
constant = requantize_to_buffers(tensor, data, layout.requant_type.value(),
layout.weights_per_block, weights, scales, biases);
// Store in tensor->extra
auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
std::move(biases), constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
layout.is_u4 ? 4 : 8, layout.weights_per_block);
}
// Store in tensor->extra
if (layout.is_requant && layout.requant_type.has_value() &&
layout.requant_type.value() == ExtraQuantType::F16) {
// F16 requant case - use weight_extra
auto * extra = new ggml_openvino_weight_extra(constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
} else {
// Normal extraction path (no requant)
// Quantized case - use quantized_weight_extra
// Create tensors with external memory (already filled by process_weight_tensor)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
@@ -214,16 +209,20 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
constant = extract_quantized_weights(tensor, data, weights, scales, biases);
// Store in tensor->extra
auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
std::move(biases), constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
if (layout.is_requant) {
GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
layout.is_u4 ? 4 : 8, layout.weights_per_block);
} else {
int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
}
}
} catch (const std::exception & e) {
@@ -233,32 +232,9 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
}
} else if (is_weight_buffer && is_full_tensor_set && is_2d &&
(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) {
// F16/F32/BF16 weight tensor - copy data and create shared-memory constant
memcpy((char *) tensor->data + offset, data, size);
// F16/F32/BF16 weight tensor
try {
// Get OpenVINO element type
ov::element::Type element_type;
switch (tensor->type) {
case GGML_TYPE_F32:
element_type = ov::element::f32;
break;
case GGML_TYPE_F16:
element_type = ov::element::f16;
break;
case GGML_TYPE_BF16:
element_type = ov::element::bf16;
break;
default:
return; // Should not happen
}
// Create 2D shape (OpenVINO expects [rows, cols])
ov::Shape shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
// Create ov::Tensor with external memory, then wrap with Constant
ov::Tensor ov_tensor(element_type, shape, tensor->data);
auto constant = std::make_shared<ov::op::v0::Constant>(ov_tensor);
std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, tensor->data);
constant->set_friendly_name(tensor->name);
// Store in tensor->extra
@@ -418,7 +394,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(in
}
// Check if a buffer is an OpenVINO buffer
static bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
}

View File

@@ -569,6 +569,112 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType
return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases);
}
std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
GGML_ASSERT(tensor != nullptr);
GGML_ASSERT(data != nullptr);
// Get 2D shape for weights [rows, cols]
ov::Shape node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
// Handle F16/F32/BF16 weights
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
ov::element::Type element_type;
switch (tensor->type) {
case GGML_TYPE_F32:
element_type = ov::element::f32;
break;
case GGML_TYPE_F16:
element_type = ov::element::f16;
break;
case GGML_TYPE_BF16:
element_type = ov::element::bf16;
break;
default:
OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path");
}
if (output_base_ptr) {
// Using external buffer - copy data and create shared-memory constant
size_t tensor_bytes = ggml_nbytes(tensor);
memcpy(output_base_ptr, data, tensor_bytes);
ov::Tensor ov_tensor(element_type, node_shape, output_base_ptr);
return std::make_shared<ov::op::v0::Constant>(ov_tensor);
} else {
// Allocate internal buffer
ov::Tensor weights(element_type, node_shape);
memcpy(weights.data(), data, ggml_nelements(tensor) * element_type.size());
return std::make_shared<ov::op::v0::Constant>(weights);
}
}
// Handle quantized weights
if (!ggml_is_quantized(tensor->type)) {
OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
}
auto layout = ggml_openvino_get_extracted_layout(tensor);
if (layout.total_size == 0) {
OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
}
std::shared_ptr<ov::Node> result;
if (layout.is_requant && layout.requant_type.has_value()) {
// Requantization path
if (layout.requant_type.value() == ExtraQuantType::F16) {
// Requant to F16
ov::Tensor weights;
if (output_base_ptr) {
weights = ov::Tensor(ov::element::f16, node_shape,
static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
} else {
weights = ov::Tensor(ov::element::f16, node_shape);
}
ov::Tensor dummy_scales, dummy_biases; // Not used for F16
result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_biases);
} else {
// Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
ov::Tensor weights, scales, biases;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
} else {
weights = ov::Tensor(weight_type, node_shape);
scales = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, scale_shape);
}
result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
scales, biases);
}
} else {
// Normal extraction path (no requant)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
ov::Tensor weights, scales, biases;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
} else {
weights = ov::Tensor(weight_type, node_shape);
scales = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, scale_shape);
}
result = extract_quantized_weights(tensor, data, weights, scales, biases);
}
return result;
}
void quantize_q4_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,

View File

@@ -78,6 +78,16 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
ov::Tensor & scales,
ov::Tensor & biases);
// Process weight tensor and create an OpenVINO constant node
// Handles F16/F32/BF16 and quantized weights, with optional requantization
// If output_base_ptr is nullptr, allocates internal buffers (for decoder use)
// If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use)
// Returns the weight constant node
std::shared_ptr<ov::Node> process_weight_tensor(
const ggml_tensor * tensor,
const void * data, // Source data pointer (may differ from tensor->data)
void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation)
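A hedged usage sketch of the two calling modes described above (not part of the commit; the header name and a loaded 2D ggml weight tensor `t` are assumptions):

// Hedged sketch: the two ways to call process_weight_tensor().
// Assumes this header (ggml-quants.hpp) and a 2D ggml weight tensor `t` whose source bytes are in `src`.
#include "ggml-openvino-extra.h"
#include "ggml-quants.hpp"

std::shared_ptr<ov::Node> build_weight_constant(ggml_tensor * t, const void * src, bool use_shared_buffer) {
    if (!use_shared_buffer) {
        // Decoder path: the helper allocates its own ov::Tensor storage.
        return process_weight_tensor(t, src, /*output_base_ptr=*/nullptr);
    }
    // Backend-buffer path: extract/requantize straight into the tensor's backend allocation,
    // laid out as computed by ggml_openvino_get_extracted_layout(t).
    return process_weight_tensor(t, src, /*output_base_ptr=*/t->data);
}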
void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk);
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,

View File

@@ -107,7 +107,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
infer_request_cache.erase(key);
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static);
decoder_end_time = ggml_time_us();
@@ -255,7 +255,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
infer_request_cache_prefill.erase(key);
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
is_static, true, prefill_chunk_size);
@@ -404,21 +404,6 @@ ov::AnyMap get_ov_compile_config(const std::string & device) {
return config;
}
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device) {
// Use singleton to check if NPU (device param kept for API compatibility)
if (ggml_openvino_is_npu()) {
return {
{GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q6_K, ExtraQuantType::F16 },
{GGML_TYPE_Q5_K, ExtraQuantType::F16 },
};
}
return {};
GGML_UNUSED(device);
}
bool is_naive(ggml_cgraph * cgraph) {
constexpr int naive_graph_size_threshold = 20;
return cgraph->n_nodes < naive_graph_size_threshold;

View File

@@ -73,8 +73,6 @@ graph_key compute_graph_key(struct ggml_cgraph * cgraph);
ov::AnyMap get_ov_compile_config(const std::string & device);
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device);
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
const std::string & param_name);