diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 275a8a216a..409a16e816 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -2,6 +2,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" +#include "ggml-openvino-extra.h" #include "ggml-quants.hpp" #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +35,7 @@ #include #include #include +#include #include GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, @@ -512,8 +515,49 @@ std::map> GgmlOvDecoder::create_weight_no return model_weights; } +// Static cache for quantized weight nodes (keyed by tensor data pointer) +// This is a fallback for when tensors don't have pre-built constants in extra +static std::unordered_map> s_quantized_weight_cache; +static std::mutex s_quantized_weight_cache_mutex; + std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor, std::optional requant_type) { + // Check if we have a pre-built constant from the OpenVINO backend buffer + // This is set during ggml_backend_openvino_buffer_set_tensor + if (tensor->extra != nullptr && !requant_type.has_value()) { + // Cast to our extra base type and check the type + auto * extra_base = static_cast(tensor->extra); + + if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) { + // F16/F32/BF16 weight with shared-memory constant + auto * weight_extra = static_cast(tensor->extra); + if (weight_extra->constant) { + GGML_LOG_DEBUG("%s: using pre-built constant for %s\n", __func__, tensor->name); + return weight_extra->constant; + } + } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) { + // Quantized weight with pre-extracted data + auto * quant_extra = static_cast(tensor->extra); + if (quant_extra->constant) { + GGML_LOG_DEBUG("%s: using pre-extracted quantized constant for %s\n", __func__, tensor->name); + return quant_extra->constant; + } + } + } + + // Fallback: Check static cache for quantized weights (keyed by data pointer) + // This handles cases where tensors weren't loaded through OpenVINO buffer + if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) { + std::lock_guard lock(s_quantized_weight_cache_mutex); + auto it = s_quantized_weight_cache.find(tensor->data); + if (it != s_quantized_weight_cache.end()) { + GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name); + return it->second; + } + } + + GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra); + std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { @@ -543,63 +587,48 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor return weight_node; } - // Quantized case - OPENVINO_ASSERT(tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) + - " Possibly this is a repacked quantized weights"); + // Quantized case - extra should be nullptr (not our type) + // Our ggml_openvino_weight_extra is only set for F16/F32 weights + if (tensor->extra != nullptr) { + // Check if it's our type - if so, something is wrong + auto * extra_base = static_cast(tensor->extra); + if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT || + extra_base->type == ggml_openvino_extra_base::Type::TENSOR) { + 
OPENVINO_ASSERT(false, "Quantized weight tensor has unexpected extra type: " + std::string(tensor->name)); + } + // Otherwise it might be repacked quantized weights from another backend + OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) + + " Possibly this is a repacked quantized weights"); + } if (requant_type.has_value()) { return requantize(tensor, requant_type.value()); } - ov::element::Type weight_type; - if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) { - weight_type = ov::element::u4; - } else { // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K || tensor.type == GGUF_TYPE_Q5_K - weight_type = ov::element::u8; + // Extract quantized weights using the shared function + auto layout = ggml_openvino_get_extracted_layout(tensor); + if (layout.total_size == 0) { + OPENVINO_THROW("Unsupported quantized type for ", tensor->name, " type=", ggml_type_name(tensor->type)); } - uint64_t weights_per_block; - // here we only consider sub block, q6k:16 q4k:32 q5k:32 - if (tensor->type == GGML_TYPE_Q6_K) { - weights_per_block = 16; - } else { - weights_per_block = 32; - } - - OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, "[load_gguf] tensor ", tensor->name, - " has incompatible last dim shape: ", node_shape.back()); + ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block}; ov::Tensor weights(weight_type, node_shape); - // For scales and biases - node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block; - ov::Tensor scales(ov::element::f16, node_shape); - ov::Tensor biases(ov::element::f16, node_shape); + ov::Tensor scales(ov::element::f16, scale_shape); + ov::Tensor biases(ov::element::f16, scale_shape); - ov::Output weight_node; - if (tensor->type == GGML_TYPE_Q4_0) { - extract_q4_0_data(tensor, weights, scales, biases); - weight_node = make_int4_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q4_1) { - extract_q4_1_data(tensor, weights, scales, biases); - weight_node = make_int4_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q8_0) { - extract_q8_0_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q6_K) { - extract_q6_k_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q4_K) { - extract_q4_k_data(tensor, weights, scales, biases); - weight_node = make_int4_weights(weights, scales, biases, weights_per_block); - } else if (tensor->type == GGML_TYPE_Q5_K) { - extract_q5_k_data(tensor, weights, scales, biases); - weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + auto result = extract_quantized_weights(tensor, tensor->data, weights, scales, biases); + result->set_friendly_name(tensor->name); + + // Cache the quantized weight node for future reuse + if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) { + std::lock_guard lock(s_quantized_weight_cache_mutex); + s_quantized_weight_cache[tensor->data] = result; + GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name); } - OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D"); - - 
weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name); - return weight_node.get_node_shared_ptr(); + return result; } void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) { diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h new file mode 100644 index 0000000000..99db870412 --- /dev/null +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -0,0 +1,247 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "ggml.h" + +// ExtraQuantType enum - defines requantization target formats +enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; + +// ===================================================== +// Global Device Configuration (singleton) +// ===================================================== +// Initialized once during backend init from GGML_OPENVINO_DEVICE env var + +struct ggml_openvino_device_config { + std::string device_name = "CPU"; + bool is_npu = false; + bool initialized = false; + + void init() { + if (initialized) return; + const char* env = std::getenv("GGML_OPENVINO_DEVICE"); + if (env) { + device_name = env; + is_npu = (device_name == "NPU"); + } + initialized = true; + } +}; + +// Get the global device config singleton +inline ggml_openvino_device_config& ggml_openvino_get_device_config() { + static ggml_openvino_device_config config; + return config; +} + +// Initialize device config (call during backend init) +inline void ggml_openvino_init_device_config() { + ggml_openvino_get_device_config().init(); +} + +// Get the device name +inline const std::string& ggml_openvino_get_device_name() { + return ggml_openvino_get_device_config().device_name; +} + +// Check if running on NPU +inline bool ggml_openvino_is_npu() { + return ggml_openvino_get_device_config().is_npu; +} + +// Get requantization type for a tensor type (returns nullopt if no requant needed) +inline std::optional ggml_openvino_get_requant_type(ggml_type type) { + if (!ggml_openvino_is_npu()) { + return std::nullopt; + } + // NPU requantization rules + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + return ExtraQuantType::Q4_0_128; + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q5_K: + return ExtraQuantType::F16; + default: + return std::nullopt; + } +} + +// ===================================================== +// OpenVINO Tensor Extra Types +// ===================================================== +// These types are stored in tensor->extra by the OpenVINO backend buffer. +// They allow: +// 1. Pre-built ov::Constant nodes for weights (avoiding memcpy during graph construction) +// 2. 
ov::Tensor wrappers for KV cache / compute tensors (for direct use with infer_request) + +// Base class for OpenVINO tensor extra data +struct ggml_openvino_extra_base { + enum class Type { WEIGHT, QUANTIZED_WEIGHT, TENSOR }; + Type type; + virtual ~ggml_openvino_extra_base() = default; +protected: + explicit ggml_openvino_extra_base(Type t) : type(t) {} +}; + +// Extra data for F16/F32/BF16 weight tensors - stores the pre-built ov::Constant node +struct ggml_openvino_weight_extra : public ggml_openvino_extra_base { + std::shared_ptr constant; // Pre-built OpenVINO Constant node + + explicit ggml_openvino_weight_extra(std::shared_ptr c) + : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {} +}; + +// Extra data for quantized weight tensors - stores extracted weights/scales/biases and ov::Constant +struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base { + ov::Tensor weights; // U4 or U8 extracted weights + ov::Tensor scales; // F16 scales + ov::Tensor biases; // F16 biases (zero points) + std::shared_ptr constant; // Pre-built OpenVINO weight subgraph + + ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor b, std::shared_ptr c) + : ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT), + weights(std::move(w)), scales(std::move(s)), biases(std::move(b)), constant(std::move(c)) {} +}; + +// Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request +struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base { + std::shared_ptr tensor; // For direct use with infer_request + + explicit ggml_openvino_tensor_extra(std::shared_ptr t) + : ggml_openvino_extra_base(Type::TENSOR), tensor(std::move(t)) {} +}; + +// ===================================================== +// Extracted Size Calculation for Quantized Tensors +// ===================================================== +// For quantized tensors, we need extra space to store extracted weights, scales, and biases. +// Returns the total size needed in the buffer for extracted data. 
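+// Worked example (illustrative only; the 4096 x 4096 Q4_0 weight below is hypothetical):
+//   n_elements = 16,777,216
+//   weights (u4)  : n_elements / 2        =  8,388,608 bytes at offset 0
+//   scales  (f16) : (n_elements / 32) * 2 =  1,048,576 bytes at offset 8,388,608 (64-byte aligned)
+//   biases  (f16) : (n_elements / 32) * 2 =  1,048,576 bytes at offset 9,437,184
+//   total_size    = 10,485,760 bytes, compared to ggml_nbytes() = 9,437,184 bytes of raw Q4_0 blocks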
+ +struct ggml_openvino_extracted_layout { + size_t total_size; // Total bytes needed + size_t weights_offset; // Offset to weights in buffer + size_t weights_size; // Size of weights in bytes + size_t scales_offset; // Offset to scales in buffer + size_t scales_size; // Size of scales in bytes + size_t biases_offset; // Offset to biases in buffer + size_t biases_size; // Size of biases in bytes + bool is_u4; // true for U4 weights, false for U8 + int64_t weights_per_block;// weights per scale/bias block + + // Requantization info + bool is_requant; // true if this tensor needs requantization + std::optional requant_type; // target requant type if is_requant +}; + +// Calculate the buffer layout for extracted quantized data +inline ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) { + ggml_openvino_extracted_layout layout = {}; + + if (!ggml_is_quantized(tensor->type)) { + return layout; + } + + // Only handle 2D weight tensors + if (tensor->ne[2] != 1 || tensor->ne[3] != 1) { + return layout; + } + + int64_t n_elements = ggml_nelements(tensor); + const size_t alignment = 64; // Good for SIMD + + // Check if requantization is needed (NPU-specific) + auto requant_type = ggml_openvino_get_requant_type(tensor->type); + if (requant_type.has_value()) { + layout.is_requant = true; + layout.requant_type = requant_type; + + // Special case: requant to F16 - just store F16 weights, no scales/biases + if (requant_type.value() == ExtraQuantType::F16) { + layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes + layout.total_size = layout.weights_size; + layout.weights_offset = 0; + // No scales/biases for F16 + return layout; + } + + // Requant to different quantized format (e.g., Q4_0_128) + switch (requant_type.value()) { + case ExtraQuantType::Q4_0_128: + layout.is_u4 = true; + layout.weights_per_block = 128; + break; + case ExtraQuantType::Q8_0_32: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + default: + // Unsupported requant type - fall through to normal extraction + layout.is_requant = false; + layout.requant_type = std::nullopt; + break; + } + + if (layout.is_requant) { + // Calculate sizes for requantized format + layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements; + int64_t n_blocks = n_elements / layout.weights_per_block; + layout.scales_size = n_blocks * sizeof(uint16_t); + layout.biases_size = n_blocks * sizeof(uint16_t); + + layout.weights_offset = 0; + layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; + layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.biases_offset + layout.biases_size; + layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor)); + return layout; + } + } + + // Normal extraction (no requant) - determine format based on tensor type + switch (tensor->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + layout.is_u4 = true; + layout.weights_per_block = 32; + break; + case GGML_TYPE_Q8_0: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + case GGML_TYPE_Q6_K: + layout.is_u4 = false; + layout.weights_per_block = 16; + break; + case GGML_TYPE_Q5_K: + layout.is_u4 = false; + layout.weights_per_block = 32; + break; + default: + // Unsupported quantization type + return layout; + } + + // Calculate sizes + // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes + layout.weights_size = layout.is_u4 ? 
(n_elements / 2) : n_elements; + + // Scales and biases: F16 per block + int64_t n_blocks = n_elements / layout.weights_per_block; + layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes + layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes + + // Layout in buffer: [weights | scales | biases] with alignment + layout.weights_offset = 0; + layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment; + layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment; + layout.total_size = layout.biases_offset + layout.biases_size; + + return layout; +} diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e809d250f7..747d1b8a30 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -3,18 +3,429 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" #include "ggml-impl.h" +#include "ggml-openvino-extra.h" #include "ggml-openvino/utils.h" +#include "ggml-quants.hpp" #include "ggml.h" #include +#include +#include #include #include +#include #include #include #include #define GGML_OPENVINO_MAX_STREAMS 8 +// OpenVINO buffer alignment (same as CPU for compatibility) +#define GGML_OPENVINO_BUFFER_ALIGNMENT 64 + +// ===================================================== +// OpenVINO Buffer Implementation using ov::Tensor +// ===================================================== +// +// Design: This implementation uses a hybrid approach: +// 1. For weight tensors: Store a pre-built ov::op::v0::Constant in tensor->extra +// - This avoids the memcpy during graph construction +// - For quantized weights, the constant is already converted to OpenVINO format +// 2. For KV cache / compute tensors: Store an ov::Tensor in tensor->extra +// - This can be directly passed to infer_request +// - Future: can be changed to ov::RemoteTensor for GPU/NPU +// +// This design is similar to: +// - CUDA split buffer: tensor->extra stores device pointers +// - CPU repack buffer: tensor->extra stores tensor_traits with repacked data +// ===================================================== + +// Buffer context that manages per-tensor allocations (no contiguous buffer for weights) +struct ggml_backend_openvino_buffer_context { + int device; + std::string name; + + // For non-weight buffers (KV cache, compute), we still use contiguous allocation + void * data; + size_t size; + bool is_weight_buffer; // Set when buffer usage is set to WEIGHTS + + // Track all extras for cleanup + std::vector tensor_extras; + + ggml_backend_openvino_buffer_context(int device, size_t size) : + device(device), + name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)), + data(nullptr), + size(size), + is_weight_buffer(false) { + // Allocate aligned contiguous memory + if (size > 0) { +#ifdef _WIN32 + data = _aligned_malloc(size, GGML_OPENVINO_BUFFER_ALIGNMENT); +#else + data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size); +#endif + if (data == nullptr) { + GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size); + } + } + } + + ~ggml_backend_openvino_buffer_context() { + // Clean up all tensor extras + for (auto * extra : tensor_extras) { + delete extra; + } + tensor_extras.clear(); + + // Free contiguous memory + if (data != nullptr) { +#ifdef _WIN32 + _aligned_free(data); +#else + free(data); +#endif + data = nullptr; + } + } +}; + +// Buffer type context (per-device) +struct ggml_backend_openvino_buffer_type_context { + 
int device; + std::string name; +}; + +// Buffer interface functions +static void ggml_backend_openvino_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + delete ctx; +} + +static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + return ctx->data; +} + +static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + // Views share the extra from view_src + if (tensor->view_src != nullptr) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + if (tensor->view_src->extra != nullptr) { + tensor->extra = tensor->view_src->extra; + } + return GGML_STATUS_SUCCESS; + } + + // For non-view tensors, tensor->extra will be set in set_tensor + // when the actual weight data is loaded + GGML_UNUSED(buffer); + return GGML_STATUS_SUCCESS; +} + +static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + uint8_t value, + size_t offset, + size_t size) { + GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); + memset((char *) tensor->data + offset, value, size); + GGML_UNUSED(buffer); +} + +static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + // Check if this is a weight buffer (usage is set BEFORE set_tensor is called) + bool is_weight_buffer = (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + // Full tensor set: offset=0, full size, not a view + bool is_full_tensor_set = (offset == 0 && size == ggml_nbytes(tensor) && tensor->view_src == nullptr); + // 2D tensor (typical weight shape) + bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1); + + // Check if this is a quantized weight tensor that needs extraction/requantization + ggml_openvino_extracted_layout layout = {}; + if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized(tensor->type)) { + layout = ggml_openvino_get_extracted_layout(tensor); + } + + if (layout.total_size > 0) { + uint8_t * buf_base = (uint8_t *) tensor->data; + + // 2D shape for weights [rows, cols] + ov::Shape weight_shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; + + try { + std::shared_ptr constant; + + if (layout.is_requant && layout.requant_type.has_value()) { + // Requantization path + if (layout.requant_type.value() == ExtraQuantType::F16) { + // Requant to F16: create F16 tensor with external memory, requantize fills it + ov::Tensor weights(ov::element::f16, weight_shape, buf_base); + ov::Tensor dummy_scales, dummy_biases; // Not used for F16 + // requantize_to_buffers fills weights and returns a Constant wrapping it + constant = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, + dummy_biases); + + // Store in tensor->extra (use weight_extra since it's F16) + auto * extra = new ggml_openvino_weight_extra(constant); + ctx->tensor_extras.push_back(extra); + tensor->extra = extra; + + GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); + } else { + // Requant to quantized format (Q4_0_128, Q8_0_32, etc.) 
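+                    // Illustrative example (hypothetical shape): a 4096 x 4096 Q4_K weight requantized
+                    // to Q4_0_128 is stored as u4 weights of shape {4096, 4096} plus f16 scales/biases
+                    // of shape {4096, 4096 / 128} = {4096, 32}, all placed in this tensor's buffer at
+                    // the offsets computed by ggml_openvino_get_extracted_layout().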
+ ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + ov::Shape scale_shape = {static_cast(tensor->ne[1]), + static_cast(tensor->ne[0] / layout.weights_per_block)}; + + ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset); + ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); + ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + + constant = requantize_to_buffers(tensor, data, layout.requant_type.value(), + layout.weights_per_block, weights, scales, biases); + + // Store in tensor->extra + auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), + std::move(biases), constant); + ctx->tensor_extras.push_back(extra); + tensor->extra = extra; + + GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name, + layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32", + layout.is_u4 ? 4 : 8, layout.weights_per_block); + } + } else { + // Normal extraction path (no requant) + ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8; + int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block; + ov::Shape scale_shape = {static_cast(tensor->ne[1]), + static_cast(tensor->ne[0] / layout.weights_per_block)}; + + ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset); + ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset); + ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset); + + constant = extract_quantized_weights(tensor, data, weights, scales, biases); + + // Store in tensor->extra + auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), + std::move(biases), constant); + ctx->tensor_extras.push_back(extra); + tensor->extra = extra; + + GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__, + tensor->name, layout.is_u4 ? 
4 : 8, layout.weights_size, n_blocks); + } + + } catch (const std::exception & e) { + GGML_LOG_ERROR("%s: failed to process quantized data for %s: %s\n", __func__, tensor->name, e.what()); + // Fall back to storing raw data + memcpy((char *) tensor->data + offset, data, size); + } + } else if (is_weight_buffer && is_full_tensor_set && is_2d && + (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) { + // F16/F32/BF16 weight tensor - copy data and create shared-memory constant + memcpy((char *) tensor->data + offset, data, size); + + try { + // Get OpenVINO element type + ov::element::Type element_type; + switch (tensor->type) { + case GGML_TYPE_F32: + element_type = ov::element::f32; + break; + case GGML_TYPE_F16: + element_type = ov::element::f16; + break; + case GGML_TYPE_BF16: + element_type = ov::element::bf16; + break; + default: + return; // Should not happen + } + + // Create 2D shape (OpenVINO expects [rows, cols]) + ov::Shape shape = {static_cast(tensor->ne[1]), static_cast(tensor->ne[0])}; + + // Create ov::Tensor with external memory, then wrap with Constant + ov::Tensor ov_tensor(element_type, shape, tensor->data); + auto constant = std::make_shared(ov_tensor); + constant->set_friendly_name(tensor->name); + + // Store in tensor->extra + ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant); + ctx->tensor_extras.push_back(extra); + tensor->extra = extra; + + GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name); + + } catch (const std::exception & e) { + GGML_LOG_DEBUG("%s: failed to create shared-memory constant for %s: %s\n", __func__, tensor->name, + e.what()); + } + } else { + // Non-weight tensor (KV cache, activations, etc.) - just copy data + memcpy((char *) tensor->data + offset, data, size); + } +} + +static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + GGML_ASSERT(tensor != nullptr && tensor->data != nullptr); + memcpy(data, (const char *) tensor->data + offset, size); + GGML_UNUSED(buffer); +} + +static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * src, + ggml_tensor * dst) { + GGML_ASSERT(src != nullptr && dst != nullptr); + // Can copy from any host buffer (including other OpenVINO buffers) + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + return false; + GGML_UNUSED(buffer); +} + +static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + if (ctx->data != nullptr) { + memset(ctx->data, value, ctx->size); + } +} + +static const ggml_backend_buffer_i ggml_backend_openvino_buffer_interface = { + /* .free_buffer = */ ggml_backend_openvino_buffer_free_buffer, + /* .get_base = */ ggml_backend_openvino_buffer_get_base, + /* .init_tensor = */ ggml_backend_openvino_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_openvino_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_openvino_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_openvino_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_openvino_buffer_cpy_tensor, + /* .clear = */ ggml_backend_openvino_buffer_clear, + /* .reset = */ NULL, +}; + +// Buffer type interface functions +static const char * 
ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; + return ctx->name.c_str(); +} + +static ggml_backend_buffer_t ggml_backend_openvino_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + ggml_backend_openvino_buffer_type_context * buft_ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; + + // Create buffer context with contiguous memory allocation + ggml_backend_openvino_buffer_context * ctx = new ggml_backend_openvino_buffer_context(buft_ctx->device, size); + + if (ctx->data == nullptr && size > 0) { + GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size); + delete ctx; + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_openvino_buffer_interface, ctx, size); +} + +static size_t ggml_backend_openvino_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return GGML_OPENVINO_BUFFER_ALIGNMENT; +} + +static size_t ggml_backend_openvino_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return SIZE_MAX; +} + +static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const ggml_tensor * tensor) { + GGML_UNUSED(buft); + + // For quantized 2D tensors (weights), we need extra space for extracted data + if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) { + ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor); + if (layout.total_size > 0) { + GGML_LOG_DEBUG( + "%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu biases=%zu)\n", + __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, layout.scales_size, + layout.biases_size); + return layout.total_size; + } + } + + return ggml_nbytes(tensor); +} + +static bool ggml_backend_openvino_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + // Currently using host memory via ov::Tensor + // This will be false when using GPU/NPU remote tensors + return true; +} + +static const ggml_backend_buffer_type_i ggml_backend_openvino_buffer_type_interface = { + /* .get_name = */ ggml_backend_openvino_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_openvino_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_openvino_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_openvino_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size, + /* .is_host = */ ggml_backend_openvino_buffer_type_is_host, +}; + +// Get buffer type for a specific device +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) { + GGML_ASSERT(device >= 0 && device < ggml_backend_openvino_get_device_count()); + + static std::mutex mutex; + std::lock_guard lock(mutex); + + static std::vector buffer_types; + static std::vector buffer_type_contexts; + + if (buffer_types.empty()) { + int device_count = ggml_backend_openvino_get_device_count(); + buffer_types.resize(device_count); + buffer_type_contexts.resize(device_count); + + for (int i = 0; i < device_count; i++) { + buffer_type_contexts[i].device = i; + buffer_type_contexts[i].name = std::string(GGML_OPENVINO_NAME) + std::to_string(i); + + buffer_types[i] = ggml_backend_buffer_type{ + /* .iface = */ ggml_backend_openvino_buffer_type_interface, + /* .device = */ 
ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), i), + /* .context = */ &buffer_type_contexts[i], + }; + } + } + + return &buffer_types[device]; +} + +// Check if a buffer is an OpenVINO buffer +static bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) { + return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer; +} + +// ===================================================== +// OpenVINO Backend Context and Interface +// ===================================================== + struct ggml_backend_openvino_context { int device; // the device ID currently in use std::string name; // context Name @@ -111,13 +522,6 @@ GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_openvino_guid()); } -// device buffer -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) { - GGML_ASSERT(device >= 0); - return ggml_backend_cpu_buffer_type(); - GGML_UNUSED(device); -} - struct ggml_backend_openvino_device_context { int device; std::string name; @@ -350,7 +754,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft); + // Support our own buffer type and any host buffer (for mmap'd files, etc.) + return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name || ggml_backend_buft_is_host(buft); GGML_UNUSED(dev); } @@ -410,6 +815,10 @@ static int get_openvino_device_count() { } static ggml_openvino_device_info ggml_openvino_init() { + // Initialize device config singleton from env var + ggml_openvino_init_device_config(); + GGML_LOG_INFO("OpenVINO: using device %s\n", ggml_openvino_get_device_name().c_str()); + ggml_openvino_device_info info = {}; info.device_count = get_openvino_device_count(); return info; diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 2076c3c75d..662f27be7a 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -418,11 +418,124 @@ ov::Output make_int4_weights(ov::Tensor & weight, return std::make_shared(w_zp_s, ov::element::f32); } -std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) { - std::vector weights_f32(tensor->ne[0] * tensor->ne[1]); - ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor)); +// Extract quantized weights from tensor and create weight subgraph +std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, + const void * data, + ov::Tensor & weights, + ov::Tensor & scales, + ov::Tensor & biases) { + // Create a temporary tensor for extraction functions that read from tensor->data + ggml_tensor temp_tensor = *tensor; + temp_tensor.data = const_cast(data); - std::shared_ptr weight_node; + // Determine block size based on tensor type + int64_t weights_per_block; + bool is_u4; + switch (tensor->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q4_K: + is_u4 = true; + weights_per_block = 32; + break; + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q5_K: + is_u4 = false; + weights_per_block = 32; + break; + case GGML_TYPE_Q6_K: + is_u4 = false; + weights_per_block = 16; + break; + default: + throw std::runtime_error("Unsupported quantized type for extraction: " + + std::string(ggml_type_name(tensor->type))); + } + + // 
Extract quantized data + switch (tensor->type) { + case GGML_TYPE_Q4_0: + extract_q4_0_data(&temp_tensor, weights, scales, biases); + break; + case GGML_TYPE_Q4_1: + extract_q4_1_data(&temp_tensor, weights, scales, biases); + break; + case GGML_TYPE_Q4_K: + extract_q4_k_data(&temp_tensor, weights, scales, biases); + break; + case GGML_TYPE_Q8_0: + extract_q8_0_data(&temp_tensor, weights, scales, biases); + break; + case GGML_TYPE_Q6_K: + extract_q6_k_data(&temp_tensor, weights, scales, biases); + break; + case GGML_TYPE_Q5_K: + extract_q5_k_data(&temp_tensor, weights, scales, biases); + break; + default: + throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type))); + } + + // Create the OpenVINO weight subgraph + ov::Output weight_node; + if (is_u4) { + weight_node = make_int4_weights(weights, scales, biases, weights_per_block); + } else { + weight_node = make_int8_weights(weights, scales, biases, weights_per_block); + } + + auto result = weight_node.get_node_shared_ptr(); + result->set_friendly_name(tensor->name); + return result; +} + +// Requantize weights to target format, writing to provided buffers +std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, + const void * data, + ExtraQuantType requant_type, + int64_t block_size, + ov::Tensor & weights, + ov::Tensor & scales, + ov::Tensor & biases) { + int64_t n_elements = ggml_nelements(tensor); + + // First dequantize to F32 + std::vector weights_f32(n_elements); + ggml_get_type_traits(tensor->type)->to_float(data, weights_f32.data(), n_elements); + + // Handle F16 case - just convert and create constant + if (requant_type == ExtraQuantType::F16) { + ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), n_elements); + auto result = std::make_shared(weights); + result->set_friendly_name(tensor->name); + return result; + } + + // Requantize to target quantized format + bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128); + + if (is_u4) { + quantize_q4_0(weights_f32.data(), weights, scales, biases, n_elements, block_size); + } else if (requant_type == ExtraQuantType::Q8_1_C) { + quantize_q8_1(weights_f32.data(), weights, scales, biases, n_elements, block_size); + } else { + quantize_q8_0(weights_f32.data(), weights, scales, biases, n_elements, block_size); + } + + // Create the OpenVINO weight subgraph + ov::Output weight_node; + if (is_u4) { + weight_node = make_int4_weights(weights, scales, biases, block_size); + } else { + weight_node = make_int8_weights(weights, scales, biases, block_size); + } + + auto result = weight_node.get_node_shared_ptr(); + result->set_friendly_name(tensor->name); + return result; +} + +std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) { ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])}; // FIXME hardcoded workaround to fix the case where token_emb.weight is q4_0 (instead of q6_k) @@ -432,42 +545,28 @@ std::shared_ptr requantize(const ggml_tensor * tensor, ExtraQuantType requant_type = ExtraQuantType::F16; } - if (requant_type == ExtraQuantType::F16) { - ov::Tensor weights(ov::element::f16, node_shape); - ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor)); - std::shared_ptr weight_node = std::make_shared(weights); - weight_node->set_friendly_name(tensor->name); - return weight_node; - } - + // Determine block size int64_t block_size = node_shape[1]; if 
(requant_type == ExtraQuantType::Q4_0_128) { block_size = 128; } else if (requant_type == ExtraQuantType::Q8_0_32) { block_size = 32; } - auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size}; - ov::Tensor weights; - ov::Tensor scales(ov::element::f16, scales_shape); - ov::Tensor bias(ov::element::f16, scales_shape); - - if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) { - weights = ov::Tensor(ov::element::u4, node_shape); - quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); - weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr(); - } else if (requant_type == ExtraQuantType::Q8_1_C) { - weights = ov::Tensor(ov::element::u8, node_shape); - quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); - weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr(); - } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) { - weights = ov::Tensor(ov::element::u8, node_shape); - quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size); - weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr(); + // Allocate tensors + ov::Tensor weights, scales, biases; + if (requant_type == ExtraQuantType::F16) { + weights = ov::Tensor(ov::element::f16, node_shape); + } else { + bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128); + ov::element::Type weight_type = is_u4 ? ov::element::u4 : ov::element::u8; + ov::Shape scales_shape = {node_shape[0], node_shape[1] / block_size}; + weights = ov::Tensor(weight_type, node_shape); + scales = ov::Tensor(ov::element::f16, scales_shape); + biases = ov::Tensor(ov::element::f16, scales_shape); } - weight_node->set_friendly_name(tensor->name); - return weight_node; + return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases); } void quantize_q4_0(const float * x, diff --git a/ggml/src/ggml-openvino/ggml-quants.hpp b/ggml/src/ggml-openvino/ggml-quants.hpp index 71ae317a39..0f14a6ed2d 100644 --- a/ggml/src/ggml-openvino/ggml-quants.hpp +++ b/ggml/src/ggml-openvino/ggml-quants.hpp @@ -1,10 +1,11 @@ #pragma once +#include "ggml-openvino-extra.h" // For ExtraQuantType +#include "ggml.h" + #include #include #include -#include "ggml.h" - void unpack_32_4(const uint8_t* data, uint8_t* dst); void extract_q4_0_data(const ggml_tensor* tensor, @@ -51,10 +52,32 @@ ov::Output make_int4_weights(ov::Tensor& weight, ov::Tensor& biases, size_t group_size = GGML_QUANTIZATION_GROUP_SIZE); -enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; +// ExtraQuantType is defined in ggml-openvino-extra.h std::shared_ptr requantize(const ggml_tensor* tensor, ExtraQuantType requant_type); +// Extract quantized weights from tensor and create weight subgraph +// If weights/scales/biases are provided (non-empty), uses them as output buffers +// Otherwise allocates new ov::Tensors internally +// Returns the weight node (make_int4_weights or make_int8_weights result) +std::shared_ptr extract_quantized_weights( + const ggml_tensor * tensor, + const void * data, // Source data pointer (may differ from tensor->data) + ov::Tensor & weights, + ov::Tensor & scales, + ov::Tensor & biases); + +// Requantize weights from tensor to target format, writing to provided buffers +// For F16 target, only weights buffer is 
used (scales/biases ignored) +// Returns the weight node +std::shared_ptr requantize_to_buffers(const ggml_tensor * tensor, + const void * data, // Source data pointer + ExtraQuantType requant_type, + int64_t block_size, + ov::Tensor & weights, + ov::Tensor & scales, + ov::Tensor & biases); + void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, int64_t qk); void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 836e366fd7..251fb82361 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,6 +1,7 @@ #include "utils.h" #include "ggml-impl.h" +#include "ggml-openvino-extra.h" #include "ggml-openvino/ggml-decoder.h" #include "ggml.h" #include "openvino/frontend.hpp" @@ -39,23 +40,14 @@ static ov::Core core; enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { - auto get_device = [&] { - std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU"; - auto available_devices = core.get_available_devices(); - if (std::find(available_devices.begin(), available_devices.end(), device) == available_devices.end()) { - GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device.c_str()); - device = "CPU"; - } - return device; - }; - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph.txt"; GgmlOvDecoder::dump_cgraph(cgraph, filename); } - static const auto device = get_device(); - static const auto is_static = device == "NPU" ? true : false; + // Use device from singleton (initialized during backend init) + const auto & device = ggml_openvino_get_device_name(); + const auto is_static = ggml_openvino_is_npu(); return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device); } @@ -413,7 +405,8 @@ ov::AnyMap get_ov_compile_config(const std::string & device) { } std::map get_types_to_requant(const std::string & device) { - if (device == "NPU") { + // Use singleton to check if NPU (device param kept for API compatibility) + if (ggml_openvino_is_npu()) { return { {GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128}, {GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128}, @@ -423,6 +416,7 @@ std::map get_types_to_requant(const std::string & dev }; } return {}; + GGML_UNUSED(device); } bool is_naive(ggml_cgraph * cgraph) {
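// ---------------------------------------------------------------------------
// Minimal consumption sketch (not part of the patch above): it illustrates how
// graph-construction code is expected to read the tensor->extra objects that
// ggml_backend_openvino_buffer_set_tensor installs. The helper name
// lookup_prebuilt_weight is hypothetical; the types come from the
// ggml-openvino-extra.h header added in this diff.

#include <memory>

#include "ggml.h"
#include "ggml-openvino-extra.h"

static std::shared_ptr<ov::Node> lookup_prebuilt_weight(const ggml_tensor * tensor) {
    if (tensor->extra == nullptr) {
        return nullptr;  // not loaded through the OpenVINO buffer: caller falls back to extraction
    }
    auto * base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
    switch (base->type) {
        case ggml_openvino_extra_base::Type::WEIGHT:
            // F16/F32/BF16 weight: Constant built over the tensor's own memory
            return static_cast<ggml_openvino_weight_extra *>(base)->constant;
        case ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT:
            // quantized weight: pre-extracted u4/u8 data wrapped in a dequantizing subgraph
            return static_cast<ggml_openvino_quantized_weight_extra *>(base)->constant;
        default:
            // TENSOR extras wrap an ov::Tensor for infer_request I/O, not a weight
            return nullptr;
    }
}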