backend buffer: allocate on host
This commit is contained in:
parent ae5336386f
commit 22d9c17a6f
@@ -2,6 +2,7 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-openvino-extra.h"
#include "ggml-quants.hpp"

#include <ggml-impl.h>
@@ -17,6 +18,7 @@
#include <iomanip>
#include <map>
#include <memory>
#include <mutex>
#include <openvino/core/dimension.hpp>
#include <openvino/core/except.hpp>
#include <openvino/core/node.hpp>
@@ -33,6 +35,7 @@
#include <set>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
@@ -512,8 +515,49 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
    return model_weights;
}

// Static cache for quantized weight nodes (keyed by tensor data pointer)
// This is a fallback for when tensors don't have pre-built constants in extra
static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
static std::mutex s_quantized_weight_cache_mutex;

std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor,
                                                            std::optional<ExtraQuantType> requant_type) {
    // Check if we have a pre-built constant from the OpenVINO backend buffer
    // This is set during ggml_backend_openvino_buffer_set_tensor
    if (tensor->extra != nullptr && !requant_type.has_value()) {
        // Cast to our extra base type and check the type
        auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);

        if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT) {
            // F16/F32/BF16 weight with shared-memory constant
            auto * weight_extra = static_cast<ggml_openvino_weight_extra *>(tensor->extra);
            if (weight_extra->constant) {
                GGML_LOG_DEBUG("%s: using pre-built constant for %s\n", __func__, tensor->name);
                return weight_extra->constant;
            }
        } else if (extra_base->type == ggml_openvino_extra_base::Type::QUANTIZED_WEIGHT) {
            // Quantized weight with pre-extracted data
            auto * quant_extra = static_cast<ggml_openvino_quantized_weight_extra *>(tensor->extra);
            if (quant_extra->constant) {
                GGML_LOG_DEBUG("%s: using pre-extracted quantized constant for %s\n", __func__, tensor->name);
                return quant_extra->constant;
            }
        }
    }

    // Fallback: Check static cache for quantized weights (keyed by data pointer)
    // This handles cases where tensors weren't loaded through OpenVINO buffer
    if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) {
        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
        auto it = s_quantized_weight_cache.find(tensor->data);
        if (it != s_quantized_weight_cache.end()) {
            GGML_LOG_DEBUG("%s: using cached quantized constant for %s\n", __func__, tensor->name);
            return it->second;
        }
    }

    GGML_LOG_DEBUG("%s: creating new constant for %s (extra=%p)\n", __func__, tensor->name, tensor->extra);

    std::set<ggml_type> weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0,
                                        GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K};
    if (weight_types.find(tensor->type) == weight_types.end()) {
@@ -543,63 +587,48 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
        return weight_node;
    }

    // Quantized case
    OPENVINO_ASSERT(tensor->extra == nullptr, "Unsupported weight tensor: " + std::string(tensor->name) +
                                                  " Possibly this is a repacked quantized weights");
    // Quantized case - extra should be nullptr (not our type)
    // Our ggml_openvino_weight_extra is only set for F16/F32 weights
    if (tensor->extra != nullptr) {
        // Check if it's our type - if so, something is wrong
        auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
        if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT ||
            extra_base->type == ggml_openvino_extra_base::Type::TENSOR) {
            OPENVINO_ASSERT(false, "Quantized weight tensor has unexpected extra type: " + std::string(tensor->name));
        }
        // Otherwise it might be repacked quantized weights from another backend
        OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) +
                                   " Possibly this is a repacked quantized weights");
    }

    if (requant_type.has_value()) {
        return requantize(tensor, requant_type.value());
    }

    ov::element::Type weight_type;
    if (tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_1 || tensor->type == GGML_TYPE_Q4_K) {
        weight_type = ov::element::u4;
    } else {  // tensor.type == GGUF_TYPE_Q8_0 || tensor.type == GGUF_TYPE_Q6_K || tensor.type == GGUF_TYPE_Q5_K
        weight_type = ov::element::u8;
    // Extract quantized weights using the shared function
    auto layout = ggml_openvino_get_extracted_layout(tensor);
    if (layout.total_size == 0) {
        OPENVINO_THROW("Unsupported quantized type for ", tensor->name, " type=", ggml_type_name(tensor->type));
    }

    uint64_t weights_per_block;
    // here we only consider sub block, q6k:16 q4k:32 q5k:32
    if (tensor->type == GGML_TYPE_Q6_K) {
        weights_per_block = 16;
    } else {
        weights_per_block = 32;
    }

    OPENVINO_ASSERT(node_shape.back() % weights_per_block == 0, "[load_gguf] tensor ", tensor->name,
                    " has incompatible last dim shape: ", node_shape.back());
    ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
    ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};

    ov::Tensor weights(weight_type, node_shape);
    // For scales and biases
    node_shape[node_shape.size() - 1] = node_shape[node_shape.size() - 1] / weights_per_block;
    ov::Tensor scales(ov::element::f16, node_shape);
    ov::Tensor biases(ov::element::f16, node_shape);
    ov::Tensor scales(ov::element::f16, scale_shape);
    ov::Tensor biases(ov::element::f16, scale_shape);

    ov::Output<ov::Node> weight_node;
    if (tensor->type == GGML_TYPE_Q4_0) {
        extract_q4_0_data(tensor, weights, scales, biases);
        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
    } else if (tensor->type == GGML_TYPE_Q4_1) {
        extract_q4_1_data(tensor, weights, scales, biases);
        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
    } else if (tensor->type == GGML_TYPE_Q8_0) {
        extract_q8_0_data(tensor, weights, scales, biases);
        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
    } else if (tensor->type == GGML_TYPE_Q6_K) {
        extract_q6_k_data(tensor, weights, scales, biases);
        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
    } else if (tensor->type == GGML_TYPE_Q4_K) {
        extract_q4_k_data(tensor, weights, scales, biases);
        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
    } else if (tensor->type == GGML_TYPE_Q5_K) {
        extract_q5_k_data(tensor, weights, scales, biases);
        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
    auto result = extract_quantized_weights(tensor, tensor->data, weights, scales, biases);
    result->set_friendly_name(tensor->name);

    // Cache the quantized weight node for future reuse
    if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) {
        std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
        s_quantized_weight_cache[tensor->data] = result;
        GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
    }

    OPENVINO_ASSERT(weight_node.get_shape().size() == 2, "Weight should be 2D");

    weight_node.get_node_shared_ptr()->set_friendly_name(tensor->name);
    return weight_node.get_node_shared_ptr();
    return result;
}

void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename) {
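As a reading aid (not part of the diff): a minimal sketch of how a pre-built weight constant is consumed during graph construction. create_weight_node() returns a shared_ptr<ov::Node>, which the translation code can wire into ops such as MatMul. The shape and the transpose_b convention below are illustrative assumptions, not taken from this commit.

#include <memory>
#include <openvino/core/model.hpp>
#include <openvino/op/matmul.hpp>
#include <openvino/op/parameter.hpp>

// weight: e.g. the node returned by create_weight_node(), assumed to be shaped [n_out, n_in]
std::shared_ptr<ov::Model> tiny_matmul_model(const std::shared_ptr<ov::Node> & weight, int64_t n_in) {
    auto input = std::make_shared<ov::op::v0::Parameter>(
        ov::element::f32, ov::PartialShape{ov::Dimension::dynamic(), ov::Dimension(n_in)});
    // ggml stores weights as [n_out, n_in], so the weight operand is used transposed here
    auto matmul = std::make_shared<ov::op::v0::MatMul>(input, weight, false, true);
    return std::make_shared<ov::Model>(ov::OutputVector{matmul}, ov::ParameterVector{input});
}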
@@ -0,0 +1,247 @@
#pragma once

#include <cstdlib>
#include <memory>
#include <optional>
#include <openvino/core/node.hpp>
#include <openvino/runtime/tensor.hpp>
#include <string>
#include "ggml.h"

// ExtraQuantType enum - defines requantization target formats
enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };

// =====================================================
// Global Device Configuration (singleton)
// =====================================================
// Initialized once during backend init from GGML_OPENVINO_DEVICE env var

struct ggml_openvino_device_config {
    std::string device_name = "CPU";
    bool is_npu = false;
    bool initialized = false;

    void init() {
        if (initialized) return;
        const char* env = std::getenv("GGML_OPENVINO_DEVICE");
        if (env) {
            device_name = env;
            is_npu = (device_name == "NPU");
        }
        initialized = true;
    }
};

// Get the global device config singleton
inline ggml_openvino_device_config& ggml_openvino_get_device_config() {
    static ggml_openvino_device_config config;
    return config;
}

// Initialize device config (call during backend init)
inline void ggml_openvino_init_device_config() {
    ggml_openvino_get_device_config().init();
}

// Get the device name
inline const std::string& ggml_openvino_get_device_name() {
    return ggml_openvino_get_device_config().device_name;
}

// Check if running on NPU
inline bool ggml_openvino_is_npu() {
    return ggml_openvino_get_device_config().is_npu;
}

// Get requantization type for a tensor type (returns nullopt if no requant needed)
inline std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
    if (!ggml_openvino_is_npu()) {
        return std::nullopt;
    }
    // NPU requantization rules
    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q4_K:
            return ExtraQuantType::Q4_0_128;
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_Q5_K:
            return ExtraQuantType::F16;
        default:
            return std::nullopt;
    }
}

// =====================================================
// OpenVINO Tensor Extra Types
// =====================================================
// These types are stored in tensor->extra by the OpenVINO backend buffer.
// They allow:
// 1. Pre-built ov::Constant nodes for weights (avoiding memcpy during graph construction)
// 2. ov::Tensor wrappers for KV cache / compute tensors (for direct use with infer_request)

// Base class for OpenVINO tensor extra data
struct ggml_openvino_extra_base {
    enum class Type { WEIGHT, QUANTIZED_WEIGHT, TENSOR };
    Type type;
    virtual ~ggml_openvino_extra_base() = default;
  protected:
    explicit ggml_openvino_extra_base(Type t) : type(t) {}
};

// Extra data for F16/F32/BF16 weight tensors - stores the pre-built ov::Constant node
struct ggml_openvino_weight_extra : public ggml_openvino_extra_base {
    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO Constant node

    explicit ggml_openvino_weight_extra(std::shared_ptr<ov::Node> c)
        : ggml_openvino_extra_base(Type::WEIGHT), constant(std::move(c)) {}
};

// Extra data for quantized weight tensors - stores extracted weights/scales/biases and ov::Constant
struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base {
    ov::Tensor weights;                  // U4 or U8 extracted weights
    ov::Tensor scales;                   // F16 scales
    ov::Tensor biases;                   // F16 biases (zero points)
    std::shared_ptr<ov::Node> constant;  // Pre-built OpenVINO weight subgraph

    ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor b, std::shared_ptr<ov::Node> c)
        : ggml_openvino_extra_base(Type::QUANTIZED_WEIGHT),
          weights(std::move(w)), scales(std::move(s)), biases(std::move(b)), constant(std::move(c)) {}
};

// Extra data for KV cache / compute tensors - stores ov::Tensor for infer_request
struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base {
    std::shared_ptr<ov::Tensor> tensor;  // For direct use with infer_request

    explicit ggml_openvino_tensor_extra(std::shared_ptr<ov::Tensor> t)
        : ggml_openvino_extra_base(Type::TENSOR), tensor(std::move(t)) {}
};

// =====================================================
// Extracted Size Calculation for Quantized Tensors
// =====================================================
// For quantized tensors, we need extra space to store extracted weights, scales, and biases.
// Returns the total size needed in the buffer for extracted data.

struct ggml_openvino_extracted_layout {
    size_t total_size;          // Total bytes needed
    size_t weights_offset;      // Offset to weights in buffer
    size_t weights_size;        // Size of weights in bytes
    size_t scales_offset;       // Offset to scales in buffer
    size_t scales_size;         // Size of scales in bytes
    size_t biases_offset;       // Offset to biases in buffer
    size_t biases_size;         // Size of biases in bytes
    bool is_u4;                 // true for U4 weights, false for U8
    int64_t weights_per_block;  // weights per scale/bias block

    // Requantization info
    bool is_requant;            // true if this tensor needs requantization
    std::optional<ExtraQuantType> requant_type;  // target requant type if is_requant
};

// Calculate the buffer layout for extracted quantized data
inline ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
    ggml_openvino_extracted_layout layout = {};

    if (!ggml_is_quantized(tensor->type)) {
        return layout;
    }

    // Only handle 2D weight tensors
    if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
        return layout;
    }

    int64_t n_elements = ggml_nelements(tensor);
    const size_t alignment = 64;  // Good for SIMD

    // Check if requantization is needed (NPU-specific)
    auto requant_type = ggml_openvino_get_requant_type(tensor->type);
    if (requant_type.has_value()) {
        layout.is_requant = true;
        layout.requant_type = requant_type;

        // Special case: requant to F16 - just store F16 weights, no scales/biases
        if (requant_type.value() == ExtraQuantType::F16) {
            layout.weights_size = n_elements * sizeof(uint16_t);  // F16 = 2 bytes
            layout.total_size = layout.weights_size;
            layout.weights_offset = 0;
            // No scales/biases for F16
            return layout;
        }

        // Requant to different quantized format (e.g., Q4_0_128)
        switch (requant_type.value()) {
            case ExtraQuantType::Q4_0_128:
                layout.is_u4 = true;
                layout.weights_per_block = 128;
                break;
            case ExtraQuantType::Q8_0_32:
                layout.is_u4 = false;
                layout.weights_per_block = 32;
                break;
            default:
                // Unsupported requant type - fall through to normal extraction
                layout.is_requant = false;
                layout.requant_type = std::nullopt;
                break;
        }

        if (layout.is_requant) {
            // Calculate sizes for requantized format
            layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
            int64_t n_blocks = n_elements / layout.weights_per_block;
            layout.scales_size = n_blocks * sizeof(uint16_t);
            layout.biases_size = n_blocks * sizeof(uint16_t);

            layout.weights_offset = 0;
            layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
            layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
            layout.total_size = layout.biases_offset + layout.biases_size;
            layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
            return layout;
        }
    }

    // Normal extraction (no requant) - determine format based on tensor type
    switch (tensor->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q4_K:
            layout.is_u4 = true;
            layout.weights_per_block = 32;
            break;
        case GGML_TYPE_Q8_0:
            layout.is_u4 = false;
            layout.weights_per_block = 32;
            break;
        case GGML_TYPE_Q6_K:
            layout.is_u4 = false;
            layout.weights_per_block = 16;
            break;
        case GGML_TYPE_Q5_K:
            layout.is_u4 = false;
            layout.weights_per_block = 32;
            break;
        default:
            // Unsupported quantization type
            return layout;
    }

    // Calculate sizes
    // Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
    layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;

    // Scales and biases: F16 per block
    int64_t n_blocks = n_elements / layout.weights_per_block;
    layout.scales_size = n_blocks * sizeof(uint16_t);  // F16 = 2 bytes
    layout.biases_size = n_blocks * sizeof(uint16_t);  // F16 = 2 bytes

    // Layout in buffer: [weights | scales | biases] with alignment
    layout.weights_offset = 0;
    layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
    layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
    layout.total_size = layout.biases_offset + layout.biases_size;

    return layout;
}
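To make the layout arithmetic above concrete, here is a small self-contained sketch (not part of the commit) for a hypothetical 4096 x 4096 Q4_0 weight on a non-NPU device, i.e. the normal extraction path with 32-element blocks:

// Illustrative only: mirrors the arithmetic in ggml_openvino_get_extracted_layout()
// for an assumed 4096 x 4096 Q4_0 tensor without requantization.
#include <cstddef>

int main() {
    constexpr size_t n_elements    = 4096ull * 4096ull;  // 16,777,216 values
    constexpr size_t alignment     = 64;
    constexpr size_t weights_size  = n_elements / 2;     // U4 packing: 8,388,608 bytes
    constexpr size_t n_blocks      = n_elements / 32;    // Q4_0 blocks of 32: 524,288
    constexpr size_t scales_size   = n_blocks * 2;       // F16 scales: 1,048,576 bytes
    constexpr size_t biases_size   = n_blocks * 2;       // F16 biases: 1,048,576 bytes
    constexpr size_t scales_offset = (weights_size + alignment - 1) / alignment * alignment;  // 8,388,608
    constexpr size_t biases_offset = scales_offset + (scales_size + alignment - 1) / alignment * alignment;  // 9,437,184
    constexpr size_t total_size    = biases_offset + biases_size;  // 10,485,760 bytes (10 MiB)
    // The raw Q4_0 data itself is n_blocks * 18 = 9,437,184 bytes, so the extracted
    // layout needs roughly 11% more space than ggml_nbytes() for this tensor.
    static_assert(total_size == 10485760, "layout arithmetic");
    return 0;
}

With these numbers, ggml_backend_openvino_buffer_type_get_alloc_size() would reserve 10,485,760 bytes for the tensor instead of the raw 9,437,184 bytes.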
@@ -3,18 +3,429 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino/utils.h"
#include "ggml-quants.hpp"
#include "ggml.h"

#include <cstdint>
#include <cstring>
#include <memory>
#include <mutex>
#include <openvino/openvino.hpp>
#include <openvino/runtime/tensor.hpp>
#include <set>
#include <string>
#include <vector>

#define GGML_OPENVINO_MAX_STREAMS 8

// OpenVINO buffer alignment (same as CPU for compatibility)
#define GGML_OPENVINO_BUFFER_ALIGNMENT 64

// =====================================================
// OpenVINO Buffer Implementation using ov::Tensor
// =====================================================
//
// Design: This implementation uses a hybrid approach:
// 1. For weight tensors: Store a pre-built ov::op::v0::Constant in tensor->extra
//    - This avoids the memcpy during graph construction
//    - For quantized weights, the constant is already converted to OpenVINO format
// 2. For KV cache / compute tensors: Store an ov::Tensor in tensor->extra
//    - This can be directly passed to infer_request
//    - Future: can be changed to ov::RemoteTensor for GPU/NPU
//
// This design is similar to:
// - CUDA split buffer: tensor->extra stores device pointers
// - CPU repack buffer: tensor->extra stores tensor_traits with repacked data
// =====================================================

// Buffer context that manages per-tensor allocations (no contiguous buffer for weights)
struct ggml_backend_openvino_buffer_context {
    int device;
    std::string name;

    // For non-weight buffers (KV cache, compute), we still use contiguous allocation
    void * data;
    size_t size;
    bool is_weight_buffer;  // Set when buffer usage is set to WEIGHTS

    // Track all extras for cleanup
    std::vector<ggml_openvino_extra_base *> tensor_extras;

    ggml_backend_openvino_buffer_context(int device, size_t size) :
        device(device),
        name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
        data(nullptr),
        size(size),
        is_weight_buffer(false) {
        // Allocate aligned contiguous memory
        if (size > 0) {
#ifdef _WIN32
            data = _aligned_malloc(size, GGML_OPENVINO_BUFFER_ALIGNMENT);
#else
            data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size);
#endif
            if (data == nullptr) {
                GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size);
            }
        }
    }

    ~ggml_backend_openvino_buffer_context() {
        // Clean up all tensor extras
        for (auto * extra : tensor_extras) {
            delete extra;
        }
        tensor_extras.clear();

        // Free contiguous memory
        if (data != nullptr) {
#ifdef _WIN32
            _aligned_free(data);
#else
            free(data);
#endif
            data = nullptr;
        }
    }
};

// Buffer type context (per-device)
struct ggml_backend_openvino_buffer_type_context {
    int device;
    std::string name;
};

// Buffer interface functions
static void ggml_backend_openvino_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
    delete ctx;
}

static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer) {
    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
    return ctx->data;
}

static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
    // Views share the extra from view_src
    if (tensor->view_src != nullptr) {
        GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
        if (tensor->view_src->extra != nullptr) {
            tensor->extra = tensor->view_src->extra;
        }
        return GGML_STATUS_SUCCESS;
    }

    // For non-view tensors, tensor->extra will be set in set_tensor
    // when the actual weight data is loaded
    GGML_UNUSED(buffer);
    return GGML_STATUS_SUCCESS;
}

static void ggml_backend_openvino_buffer_memset_tensor(ggml_backend_buffer_t buffer,
                                                       ggml_tensor * tensor,
                                                       uint8_t value,
                                                       size_t offset,
                                                       size_t size) {
    GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
    memset((char *) tensor->data + offset, value, size);
    GGML_UNUSED(buffer);
}

static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                                    ggml_tensor * tensor,
                                                    const void * data,
                                                    size_t offset,
                                                    size_t size) {
    GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;

    // Check if this is a weight buffer (usage is set BEFORE set_tensor is called)
    bool is_weight_buffer = (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
    // Full tensor set: offset=0, full size, not a view
    bool is_full_tensor_set = (offset == 0 && size == ggml_nbytes(tensor) && tensor->view_src == nullptr);
    // 2D tensor (typical weight shape)
    bool is_2d = (tensor->ne[2] == 1 && tensor->ne[3] == 1);

    // Check if this is a quantized weight tensor that needs extraction/requantization
    ggml_openvino_extracted_layout layout = {};
    if (is_weight_buffer && is_full_tensor_set && is_2d && ggml_is_quantized(tensor->type)) {
        layout = ggml_openvino_get_extracted_layout(tensor);
    }

    if (layout.total_size > 0) {
        uint8_t * buf_base = (uint8_t *) tensor->data;

        // 2D shape for weights [rows, cols]
        ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};

        try {
            std::shared_ptr<ov::Node> constant;

            if (layout.is_requant && layout.requant_type.has_value()) {
                // Requantization path
                if (layout.requant_type.value() == ExtraQuantType::F16) {
                    // Requant to F16: create F16 tensor with external memory, requantize fills it
                    ov::Tensor weights(ov::element::f16, weight_shape, buf_base);
                    ov::Tensor dummy_scales, dummy_biases;  // Not used for F16
                    // requantize_to_buffers fills weights and returns a Constant wrapping it
                    constant = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales,
                                                     dummy_biases);

                    // Store in tensor->extra (use weight_extra since it's F16)
                    auto * extra = new ggml_openvino_weight_extra(constant);
                    ctx->tensor_extras.push_back(extra);
                    tensor->extra = extra;

                    GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
                } else {
                    // Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
                    ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
                    ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
                                             static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};

                    ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
                    ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
                    ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);

                    constant = requantize_to_buffers(tensor, data, layout.requant_type.value(),
                                                     layout.weights_per_block, weights, scales, biases);

                    // Store in tensor->extra
                    auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
                                                                            std::move(biases), constant);
                    ctx->tensor_extras.push_back(extra);
                    tensor->extra = extra;

                    GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
                                   layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
                                   layout.is_u4 ? 4 : 8, layout.weights_per_block);
                }
            } else {
                // Normal extraction path (no requant)
                ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
                int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
                ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
                                         static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};

                ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
                ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
                ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);

                constant = extract_quantized_weights(tensor, data, weights, scales, biases);

                // Store in tensor->extra
                auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
                                                                        std::move(biases), constant);
                ctx->tensor_extras.push_back(extra);
                tensor->extra = extra;

                GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
                               tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
            }

        } catch (const std::exception & e) {
            GGML_LOG_ERROR("%s: failed to process quantized data for %s: %s\n", __func__, tensor->name, e.what());
            // Fall back to storing raw data
            memcpy((char *) tensor->data + offset, data, size);
        }
    } else if (is_weight_buffer && is_full_tensor_set && is_2d &&
               (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) {
        // F16/F32/BF16 weight tensor - copy data and create shared-memory constant
        memcpy((char *) tensor->data + offset, data, size);

        try {
            // Get OpenVINO element type
            ov::element::Type element_type;
            switch (tensor->type) {
                case GGML_TYPE_F32:
                    element_type = ov::element::f32;
                    break;
                case GGML_TYPE_F16:
                    element_type = ov::element::f16;
                    break;
                case GGML_TYPE_BF16:
                    element_type = ov::element::bf16;
                    break;
                default:
                    return;  // Should not happen
            }

            // Create 2D shape (OpenVINO expects [rows, cols])
            ov::Shape shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};

            // Create ov::Tensor with external memory, then wrap with Constant
            ov::Tensor ov_tensor(element_type, shape, tensor->data);
            auto constant = std::make_shared<ov::op::v0::Constant>(ov_tensor);
            constant->set_friendly_name(tensor->name);

            // Store in tensor->extra
            ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant);
            ctx->tensor_extras.push_back(extra);
            tensor->extra = extra;

            GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name);

        } catch (const std::exception & e) {
            GGML_LOG_DEBUG("%s: failed to create shared-memory constant for %s: %s\n", __func__, tensor->name,
                           e.what());
        }
    } else {
        // Non-weight tensor (KV cache, activations, etc.) - just copy data
        memcpy((char *) tensor->data + offset, data, size);
    }
}

static void ggml_backend_openvino_buffer_get_tensor(ggml_backend_buffer_t buffer,
                                                    const ggml_tensor * tensor,
                                                    void * data,
                                                    size_t offset,
                                                    size_t size) {
    GGML_ASSERT(tensor != nullptr && tensor->data != nullptr);
    memcpy(data, (const char *) tensor->data + offset, size);
    GGML_UNUSED(buffer);
}

static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
                                                    const ggml_tensor * src,
                                                    ggml_tensor * dst) {
    GGML_ASSERT(src != nullptr && dst != nullptr);
    // Can copy from any host buffer (including other OpenVINO buffers)
    if (ggml_backend_buffer_is_host(src->buffer)) {
        memcpy(dst->data, src->data, ggml_nbytes(src));
        return true;
    }
    return false;
    GGML_UNUSED(buffer);
}

static void ggml_backend_openvino_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
    if (ctx->data != nullptr) {
        memset(ctx->data, value, ctx->size);
    }
}

static const ggml_backend_buffer_i ggml_backend_openvino_buffer_interface = {
    /* .free_buffer   = */ ggml_backend_openvino_buffer_free_buffer,
    /* .get_base      = */ ggml_backend_openvino_buffer_get_base,
    /* .init_tensor   = */ ggml_backend_openvino_buffer_init_tensor,
    /* .memset_tensor = */ ggml_backend_openvino_buffer_memset_tensor,
    /* .set_tensor    = */ ggml_backend_openvino_buffer_set_tensor,
    /* .get_tensor    = */ ggml_backend_openvino_buffer_get_tensor,
    /* .cpy_tensor    = */ ggml_backend_openvino_buffer_cpy_tensor,
    /* .clear         = */ ggml_backend_openvino_buffer_clear,
    /* .reset         = */ NULL,
};

// Buffer type interface functions
static const char * ggml_backend_openvino_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context;
    return ctx->name.c_str();
}

static ggml_backend_buffer_t ggml_backend_openvino_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                                                            size_t size) {
    ggml_backend_openvino_buffer_type_context * buft_ctx = (ggml_backend_openvino_buffer_type_context *) buft->context;

    // Create buffer context with contiguous memory allocation
    ggml_backend_openvino_buffer_context * ctx = new ggml_backend_openvino_buffer_context(buft_ctx->device, size);

    if (ctx->data == nullptr && size > 0) {
        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
        delete ctx;
        return nullptr;
    }

    return ggml_backend_buffer_init(buft, ggml_backend_openvino_buffer_interface, ctx, size);
}

static size_t ggml_backend_openvino_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return GGML_OPENVINO_BUFFER_ALIGNMENT;
}

static size_t ggml_backend_openvino_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    return SIZE_MAX;
}

static size_t ggml_backend_openvino_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
                                                               const ggml_tensor * tensor) {
    GGML_UNUSED(buft);

    // For quantized 2D tensors (weights), we need extra space for extracted data
    if (ggml_is_quantized(tensor->type) && tensor->ne[2] == 1 && tensor->ne[3] == 1) {
        ggml_openvino_extracted_layout layout = ggml_openvino_get_extracted_layout(tensor);
        if (layout.total_size > 0) {
            GGML_LOG_DEBUG(
                "%s: tensor %s needs %zu bytes (original %zu, extracted: weights=%zu scales=%zu biases=%zu)\n",
                __func__, tensor->name, layout.total_size, ggml_nbytes(tensor), layout.weights_size, layout.scales_size,
                layout.biases_size);
            return layout.total_size;
        }
    }

    return ggml_nbytes(tensor);
}

static bool ggml_backend_openvino_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);
    // Currently using host memory via ov::Tensor
    // This will be false when using GPU/NPU remote tensors
    return true;
}

static const ggml_backend_buffer_type_i ggml_backend_openvino_buffer_type_interface = {
    /* .get_name       = */ ggml_backend_openvino_buffer_type_get_name,
    /* .alloc_buffer   = */ ggml_backend_openvino_buffer_type_alloc_buffer,
    /* .get_alignment  = */ ggml_backend_openvino_buffer_type_get_alignment,
    /* .get_max_size   = */ ggml_backend_openvino_buffer_type_get_max_size,
    /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size,
    /* .is_host        = */ ggml_backend_openvino_buffer_type_is_host,
};

// Get buffer type for a specific device
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) {
    GGML_ASSERT(device >= 0 && device < ggml_backend_openvino_get_device_count());

    static std::mutex mutex;
    std::lock_guard<std::mutex> lock(mutex);

    static std::vector<ggml_backend_buffer_type> buffer_types;
    static std::vector<ggml_backend_openvino_buffer_type_context> buffer_type_contexts;

    if (buffer_types.empty()) {
        int device_count = ggml_backend_openvino_get_device_count();
        buffer_types.resize(device_count);
        buffer_type_contexts.resize(device_count);

        for (int i = 0; i < device_count; i++) {
            buffer_type_contexts[i].device = i;
            buffer_type_contexts[i].name = std::string(GGML_OPENVINO_NAME) + std::to_string(i);

            buffer_types[i] = ggml_backend_buffer_type{
                /* .iface   = */ ggml_backend_openvino_buffer_type_interface,
                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), i),
                /* .context = */ &buffer_type_contexts[i],
            };
        }
    }

    return &buffer_types[device];
}

// Check if a buffer is an OpenVINO buffer
static bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
    return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
}

// =====================================================
// OpenVINO Backend Context and Interface
// =====================================================

struct ggml_backend_openvino_context {
    int device;        // the device ID currently in use
    std::string name;  // context Name
@@ -111,13 +522,6 @@ GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_openvino_guid());
}

// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) {
    GGML_ASSERT(device >= 0);
    return ggml_backend_cpu_buffer_type();
    GGML_UNUSED(device);
}

struct ggml_backend_openvino_device_context {
    int device;
    std::string name;
@@ -350,7 +754,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
}

static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    return ggml_backend_buft_is_host(buft);
    // Support our own buffer type and any host buffer (for mmap'd files, etc.)
    return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name || ggml_backend_buft_is_host(buft);
    GGML_UNUSED(dev);
}

@@ -410,6 +815,10 @@ static int get_openvino_device_count() {
}

static ggml_openvino_device_info ggml_openvino_init() {
    // Initialize device config singleton from env var
    ggml_openvino_init_device_config();
    GGML_LOG_INFO("OpenVINO: using device %s\n", ggml_openvino_get_device_name().c_str());

    ggml_openvino_device_info info = {};
    info.device_count = get_openvino_device_count();
    return info;
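For context, a rough usage sketch of how a weight reaches the new buffer through the generic ggml-backend API (the tensor name and sizes are made up, and the OpenVINO backend is assumed to be built and registered). ggml_backend_tensor_set() is what ends up invoking ggml_backend_openvino_buffer_set_tensor() above:

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void load_one_weight_example(const void * host_data) {
    ggml_init_params params = {
        /* .mem_size   = */ ggml_tensor_overhead() * 8,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ true,  // tensor data lives in the backend buffer, not the ggml context
    };
    ggml_context * ctx = ggml_init(params);

    ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 4096, 4096);
    ggml_set_name(w, "blk.0.attn_q.weight");

    ggml_backend_buffer_type_t buft = ggml_backend_openvino_buffer_type(0);
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
    ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);  // set before the data is copied

    // Copies host_data into the host allocation and, for this F16 weight, stores a
    // shared-memory ov::Constant in w->extra (see set_tensor above).
    ggml_backend_tensor_set(w, host_data, 0, ggml_nbytes(w));

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
}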
@@ -418,11 +418,124 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor & weight,
    return std::make_shared<ov::op::v0::Convert>(w_zp_s, ov::element::f32);
}

std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) {
    std::vector<float> weights_f32(tensor->ne[0] * tensor->ne[1]);
    ggml_get_type_traits(tensor->type)->to_float(tensor->data, weights_f32.data(), ggml_nelements(tensor));
// Extract quantized weights from tensor and create weight subgraph
std::shared_ptr<ov::Node> extract_quantized_weights(const ggml_tensor * tensor,
                                                    const void * data,
                                                    ov::Tensor & weights,
                                                    ov::Tensor & scales,
                                                    ov::Tensor & biases) {
    // Create a temporary tensor for extraction functions that read from tensor->data
    ggml_tensor temp_tensor = *tensor;
    temp_tensor.data = const_cast<void *>(data);

    std::shared_ptr<ov::Node> weight_node;
    // Determine block size based on tensor type
    int64_t weights_per_block;
    bool is_u4;
    switch (tensor->type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q4_K:
            is_u4 = true;
            weights_per_block = 32;
            break;
        case GGML_TYPE_Q8_0:
        case GGML_TYPE_Q5_K:
            is_u4 = false;
            weights_per_block = 32;
            break;
        case GGML_TYPE_Q6_K:
            is_u4 = false;
            weights_per_block = 16;
            break;
        default:
            throw std::runtime_error("Unsupported quantized type for extraction: " +
                                     std::string(ggml_type_name(tensor->type)));
    }

    // Extract quantized data
    switch (tensor->type) {
        case GGML_TYPE_Q4_0:
            extract_q4_0_data(&temp_tensor, weights, scales, biases);
            break;
        case GGML_TYPE_Q4_1:
            extract_q4_1_data(&temp_tensor, weights, scales, biases);
            break;
        case GGML_TYPE_Q4_K:
            extract_q4_k_data(&temp_tensor, weights, scales, biases);
            break;
        case GGML_TYPE_Q8_0:
            extract_q8_0_data(&temp_tensor, weights, scales, biases);
            break;
        case GGML_TYPE_Q6_K:
            extract_q6_k_data(&temp_tensor, weights, scales, biases);
            break;
        case GGML_TYPE_Q5_K:
            extract_q5_k_data(&temp_tensor, weights, scales, biases);
            break;
        default:
            throw std::runtime_error("Unsupported quantized type: " + std::string(ggml_type_name(tensor->type)));
    }

    // Create the OpenVINO weight subgraph
    ov::Output<ov::Node> weight_node;
    if (is_u4) {
        weight_node = make_int4_weights(weights, scales, biases, weights_per_block);
    } else {
        weight_node = make_int8_weights(weights, scales, biases, weights_per_block);
    }

    auto result = weight_node.get_node_shared_ptr();
    result->set_friendly_name(tensor->name);
    return result;
}

// Requantize weights to target format, writing to provided buffers
std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
                                                const void * data,
                                                ExtraQuantType requant_type,
                                                int64_t block_size,
                                                ov::Tensor & weights,
                                                ov::Tensor & scales,
                                                ov::Tensor & biases) {
    int64_t n_elements = ggml_nelements(tensor);

    // First dequantize to F32
    std::vector<float> weights_f32(n_elements);
    ggml_get_type_traits(tensor->type)->to_float(data, weights_f32.data(), n_elements);

    // Handle F16 case - just convert and create constant
    if (requant_type == ExtraQuantType::F16) {
        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), n_elements);
        auto result = std::make_shared<ov::op::v0::Constant>(weights);
        result->set_friendly_name(tensor->name);
        return result;
    }

    // Requantize to target quantized format
    bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128);

    if (is_u4) {
        quantize_q4_0(weights_f32.data(), weights, scales, biases, n_elements, block_size);
    } else if (requant_type == ExtraQuantType::Q8_1_C) {
        quantize_q8_1(weights_f32.data(), weights, scales, biases, n_elements, block_size);
    } else {
        quantize_q8_0(weights_f32.data(), weights, scales, biases, n_elements, block_size);
    }

    // Create the OpenVINO weight subgraph
    ov::Output<ov::Node> weight_node;
    if (is_u4) {
        weight_node = make_int4_weights(weights, scales, biases, block_size);
    } else {
        weight_node = make_int8_weights(weights, scales, biases, block_size);
    }

    auto result = weight_node.get_node_shared_ptr();
    result->set_friendly_name(tensor->name);
    return result;
}

std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType requant_type) {
    ov::Shape node_shape = {(uint64_t) (tensor->ne[1]), (uint64_t) (tensor->ne[0])};

    // FIXME hardcoded workaround to fix the case where token_emb.weight is q4_0 (instead of q6_k)
@@ -432,42 +545,28 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType
        requant_type = ExtraQuantType::F16;
    }

    if (requant_type == ExtraQuantType::F16) {
        ov::Tensor weights(ov::element::f16, node_shape);
        ggml_get_type_traits(GGML_TYPE_F16)->from_float_ref(weights_f32.data(), weights.data(), ggml_nelements(tensor));
        std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
        weight_node->set_friendly_name(tensor->name);
        return weight_node;
    }

    // Determine block size
    int64_t block_size = node_shape[1];
    if (requant_type == ExtraQuantType::Q4_0_128) {
        block_size = 128;
    } else if (requant_type == ExtraQuantType::Q8_0_32) {
        block_size = 32;
    }
    auto scales_shape = ov::Shape{node_shape[0], node_shape[1] / block_size};

    ov::Tensor weights;
    ov::Tensor scales(ov::element::f16, scales_shape);
    ov::Tensor bias(ov::element::f16, scales_shape);

    if (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128) {
        weights = ov::Tensor(ov::element::u4, node_shape);
        quantize_q4_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
        weight_node = make_int4_weights(weights, scales, bias, block_size).get_node_shared_ptr();
    } else if (requant_type == ExtraQuantType::Q8_1_C) {
        weights = ov::Tensor(ov::element::u8, node_shape);
        quantize_q8_1(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
    } else if (requant_type == ExtraQuantType::Q8_0_C || requant_type == ExtraQuantType::Q8_0_32) {
        weights = ov::Tensor(ov::element::u8, node_shape);
        quantize_q8_0(weights_f32.data(), weights, scales, bias, weights.get_size(), block_size);
        weight_node = make_int8_weights(weights, scales, bias, block_size).get_node_shared_ptr();
    // Allocate tensors
    ov::Tensor weights, scales, biases;
    if (requant_type == ExtraQuantType::F16) {
        weights = ov::Tensor(ov::element::f16, node_shape);
    } else {
        bool is_u4 = (requant_type == ExtraQuantType::Q4_0_C || requant_type == ExtraQuantType::Q4_0_128);
        ov::element::Type weight_type = is_u4 ? ov::element::u4 : ov::element::u8;
        ov::Shape scales_shape = {node_shape[0], node_shape[1] / block_size};
        weights = ov::Tensor(weight_type, node_shape);
        scales = ov::Tensor(ov::element::f16, scales_shape);
        biases = ov::Tensor(ov::element::f16, scales_shape);
    }

    weight_node->set_friendly_name(tensor->name);
    return weight_node;
    return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases);
}

void quantize_q4_0(const float * x,
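A companion sketch to the earlier layout example (again illustrative, for an assumed 4096 x 4096 Q4_0 weight): on NPU, ggml_openvino_get_requant_type() maps Q4_0 to ExtraQuantType::Q4_0_128, so requantize_to_buffers() is handed buffers sized with 128-element blocks:

#include <algorithm>
#include <cstddef>

int main() {
    constexpr size_t n_elements   = 4096ull * 4096ull;
    constexpr size_t weights_size = n_elements / 2;    // U4 weights: 8,388,608 bytes
    constexpr size_t n_blocks     = n_elements / 128;  // 128-wide blocks: 131,072
    constexpr size_t scales_size  = n_blocks * 2;      // F16 scales: 262,144 bytes
    constexpr size_t biases_size  = n_blocks * 2;      // F16 biases: 262,144 bytes
    // offsets happen to be 64-byte aligned already, so no padding is added
    constexpr size_t extracted    = weights_size + scales_size + biases_size;  // 8,912,896 bytes
    constexpr size_t raw_q4_0     = (n_elements / 32) * 18;                    // ggml_nbytes(): 9,437,184 bytes
    // the requant branch clamps total_size to at least ggml_nbytes(tensor) (std::max in the layout helper)
    constexpr size_t total_size   = std::max(extracted, raw_q4_0);
    static_assert(total_size == 9437184, "requant layout arithmetic");
    return 0;
}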
@@ -1,10 +1,11 @@
#pragma once
#include "ggml-openvino-extra.h"  // For ExtraQuantType
#include "ggml.h"

#include <cstdint>
#include <openvino/op/constant.hpp>
#include <openvino/runtime/tensor.hpp>

#include "ggml.h"

void unpack_32_4(const uint8_t* data, uint8_t* dst);

void extract_q4_0_data(const ggml_tensor* tensor,
@@ -51,10 +52,32 @@ ov::Output<ov::Node> make_int4_weights(ov::Tensor& weight,
                                       ov::Tensor& biases,
                                       size_t group_size = GGML_QUANTIZATION_GROUP_SIZE);

enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };
// ExtraQuantType is defined in ggml-openvino-extra.h

std::shared_ptr<ov::Node> requantize(const ggml_tensor* tensor, ExtraQuantType requant_type);

// Extract quantized weights from tensor and create weight subgraph
// If weights/scales/biases are provided (non-empty), uses them as output buffers
// Otherwise allocates new ov::Tensors internally
// Returns the weight node (make_int4_weights or make_int8_weights result)
std::shared_ptr<ov::Node> extract_quantized_weights(
    const ggml_tensor * tensor,
    const void * data,  // Source data pointer (may differ from tensor->data)
    ov::Tensor & weights,
    ov::Tensor & scales,
    ov::Tensor & biases);

// Requantize weights from tensor to target format, writing to provided buffers
// For F16 target, only weights buffer is used (scales/biases ignored)
// Returns the weight node
std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
                                                const void * data,  // Source data pointer
                                                ExtraQuantType requant_type,
                                                int64_t block_size,
                                                ov::Tensor & weights,
                                                ov::Tensor & scales,
                                                ov::Tensor & biases);

void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
                   int64_t qk);
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
@@ -1,6 +1,7 @@
#include "utils.h"

#include "ggml-impl.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino/ggml-decoder.h"
#include "ggml.h"
#include "openvino/frontend.hpp"
@@ -39,23 +40,14 @@
static ov::Core core;

enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) {
    auto get_device = [&] {
        std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
        auto available_devices = core.get_available_devices();
        if (std::find(available_devices.begin(), available_devices.end(), device) == available_devices.end()) {
            GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device.c_str());
            device = "CPU";
        }
        return device;
    };

    if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
        std::string filename = "cgraph.txt";
        GgmlOvDecoder::dump_cgraph(cgraph, filename);
    }

    static const auto device = get_device();
    static const auto is_static = device == "NPU" ? true : false;
    // Use device from singleton (initialized during backend init)
    const auto & device = ggml_openvino_get_device_name();
    const auto is_static = ggml_openvino_is_npu();
    return is_static ? ov_graph_compute_static(cgraph) : ov_graph_compute_dynamic(cgraph, device);
}

|
|||
}
|
||||
|
||||
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device) {
|
||||
if (device == "NPU") {
|
||||
// Use singleton to check if NPU (device param kept for API compatibility)
|
||||
if (ggml_openvino_is_npu()) {
|
||||
return {
|
||||
{GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
|
||||
{GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
|
||||
|
|
@ -423,6 +416,7 @@ std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & dev
|
|||
};
|
||||
}
|
||||
return {};
|
||||
GGML_UNUSED(device);
|
||||
}
|
||||
|
||||
bool is_naive(ggml_cgraph * cgraph) {
|