Use shared_buffer for GPU NPU; Refactor
parent 22d9c17a6f
commit 72bba828df
@@ -1,4 +1,5 @@
find_package(OpenVINO REQUIRED)
find_package(OpenCL REQUIRED)
include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake")
@@ -10,7 +11,7 @@ ggml_add_backend_library(ggml-openvino
${GGML_HEADERS_OPENVINO}
)
target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb)
target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL)
if (GGML_OPENVINO)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
@@ -3,6 +3,7 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-openvino-extra.h"
#include "ggml-openvino.h"
#include "ggml-quants.hpp"
#include <ggml-impl.h>
@@ -471,9 +472,7 @@ const ggml_tensor * GgmlOvDecoder::get_tensor_from_name(const std::string & name
// return kv_param_res_names;
// }
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(
ggml_cgraph * cgraph,
std::map<ggml_type, ExtraQuantType> types_to_requantize) {
std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph) {
std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
static std::mutex weights_mutex;
auto * nodes = cgraph->nodes;
@@ -498,10 +497,7 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
}
}
if (should_create) {
auto requant_type = types_to_requantize.count(src->type) ?
std::optional<ExtraQuantType>(types_to_requantize.at(src->type)) :
std::nullopt;
auto weight_node = create_weight_node(src, requant_type);
auto weight_node = create_weight_node(src);
weight_node->set_friendly_name(src_name);
{
std::lock_guard<std::mutex> lock(weights_mutex);
@@ -520,11 +516,14 @@ std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_no
static std::unordered_map<const void *, std::shared_ptr<ov::Node>> s_quantized_weight_cache;
static std::mutex s_quantized_weight_cache_mutex;
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor,
std::optional<ExtraQuantType> requant_type) {
std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor) {
// Check if we have a pre-built constant from the OpenVINO backend buffer
// This is set during ggml_backend_openvino_buffer_set_tensor
if (tensor->extra != nullptr && !requant_type.has_value()) {
if (tensor->extra) {
if (!ggml_backend_buffer_is_openvino(tensor->buffer)) {
OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a cpu backend repacked quantized weights");
}
// Cast to our extra base type and check the type
auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
@@ -547,7 +546,7 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
// Fallback: Check static cache for quantized weights (keyed by data pointer)
// This handles cases where tensors weren't loaded through OpenVINO buffer
if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) {
if (ggml_is_quantized(tensor->type)) {
std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
auto it = s_quantized_weight_cache.find(tensor->data);
if (it != s_quantized_weight_cache.end()) {
@@ -565,64 +564,11 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor * tensor
ggml_type_name(tensor->type));
}
auto node_type = get_ov_type(tensor);
auto node_shape = get_shape(tensor);
auto ne_total = ggml_nelements(tensor);
OPENVINO_ASSERT(node_shape[0] == 1, "Got 4D weights, expect all weights to be 2D: ", tensor->name);
node_shape.erase(node_shape.begin());
OPENVINO_ASSERT(node_shape[0] == 1, "Got 3D weights, expect all weights to be 2D: ", tensor->name);
node_shape.erase(node_shape.begin());
// F16 and F32 case
if (node_type != ov::element::dynamic) {
ov::Tensor weights(node_type, node_shape);
memcpy(weights.data(), tensor->data, ne_total * node_type.size());
std::shared_ptr<ov::Node> weight_node = std::make_shared<ov::op::v0::Constant>(weights);
// Disabled because it triggers a bug in NPUW, no performance impact on CPU GPU
// if (node_type == ov::element::f16) {
// weight_node = std::make_shared<ov::op::v0::Convert>(weight_node, ov::element::f32);
// }
weight_node->set_friendly_name(tensor->name);
return weight_node;
}
// Quantized case - extra should be nullptr (not our type)
// Our ggml_openvino_weight_extra is only set for F16/F32 weights
if (tensor->extra != nullptr) {
// Check if it's our type - if so, something is wrong
auto * extra_base = static_cast<ggml_openvino_extra_base *>(tensor->extra);
if (extra_base->type == ggml_openvino_extra_base::Type::WEIGHT ||
extra_base->type == ggml_openvino_extra_base::Type::TENSOR) {
OPENVINO_ASSERT(false, "Quantized weight tensor has unexpected extra type: " + std::string(tensor->name));
}
// Otherwise it might be repacked quantized weights from another backend
OPENVINO_ASSERT(false, "Unsupported weight tensor: " + std::string(tensor->name) +
" Possibly this is a repacked quantized weights");
}
if (requant_type.has_value()) {
return requantize(tensor, requant_type.value());
}
// Extract quantized weights using the shared function
auto layout = ggml_openvino_get_extracted_layout(tensor);
if (layout.total_size == 0) {
OPENVINO_THROW("Unsupported quantized type for ", tensor->name, " type=", ggml_type_name(tensor->type));
}
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
ov::Tensor weights(weight_type, node_shape);
ov::Tensor scales(ov::element::f16, scale_shape);
ov::Tensor biases(ov::element::f16, scale_shape);
auto result = extract_quantized_weights(tensor, tensor->data, weights, scales, biases);
std::shared_ptr<ov::Node> result = process_weight_tensor(tensor, tensor->data, nullptr);
result->set_friendly_name(tensor->name);
// Cache the quantized weight node for future reuse
if (ggml_is_quantized(tensor->type) && !requant_type.has_value()) {
if (ggml_is_quantized(tensor->type)) {
std::lock_guard<std::mutex> lock(s_quantized_weight_cache_mutex);
s_quantized_weight_cache[tensor->data] = result;
GGML_LOG_DEBUG("%s: cached quantized constant for %s\n", __func__, tensor->name);
@@ -179,12 +179,9 @@ public:
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor,
std::optional<ExtraQuantType> requant_type = std::nullopt);
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor * tensor);
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(
ggml_cgraph * cgraph,
std::map<ggml_type, ExtraQuantType> types_to_requantize = {});
static std::map<std::string, std::shared_ptr<ov::Node>> create_weight_nodes(ggml_cgraph * cgraph);
const ggml_tensor * get_tensor_used_op(const ggml_tensor * tensor) const;
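A minimal sketch of the call flow after this simplification (it mirrors the utils.cpp changes later in this commit; cgraph and the decoder constructor arguments are assumed to exist in the caller):

    // Weight constants are built from the graph alone; NPU requantization is now
    // resolved inside create_weight_node() via ggml_openvino_get_requant_type().
    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
    auto ggml_decoder  = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params,
                                                         model_weights, is_static);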
@@ -0,0 +1,177 @@
#include "ggml-openvino-extra.h"
#include "ggml-impl.h"
ov::Core & ov_singleton_core() {
static ov::Core core;
return core;
}
// =====================================================
// Device Configuration Implementations
// =====================================================
void ggml_openvino_device_config::init() {
if (initialized) {
return;
}
device_name = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU";
auto available_devices = ov_singleton_core().get_available_devices();
if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) {
GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str());
device_name = "CPU";
}
is_npu = (device_name == "NPU");
initialized = true;
}
// Get the global device config singleton
ggml_openvino_device_config & ggml_openvino_get_device_config() {
static ggml_openvino_device_config config;
return config;
}
// Initialize device config (call during backend init)
void ggml_openvino_init_device_config() {
ggml_openvino_get_device_config().init();
}
// Get the device name
const std::string & ggml_openvino_get_device_name() {
return ggml_openvino_get_device_config().device_name;
}
// Check if running on NPU
bool ggml_openvino_is_npu() {
return ggml_openvino_get_device_config().is_npu;
}
// Get requantization type for a tensor type (returns nullopt if no requant needed)
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
if (!ggml_openvino_is_npu()) {
return std::nullopt;
}
// NPU requantization rules
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
return ExtraQuantType::Q4_0_128;
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q5_K:
return ExtraQuantType::F16;
default:
return std::nullopt;
}
}
// =====================================================
// Extracted Layout Calculation
// =====================================================
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
ggml_openvino_extracted_layout layout = {};
if (!ggml_is_quantized(tensor->type)) {
return layout;
}
// Only handle 2D weight tensors
if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
return layout;
}
int64_t n_elements = ggml_nelements(tensor);
const size_t alignment = 64; // Good for SIMD
// Check if requantization is needed (NPU-specific)
auto requant_type = ggml_openvino_get_requant_type(tensor->type);
if (requant_type.has_value()) {
layout.is_requant = true;
layout.requant_type = requant_type;
// Special case: requant to F16 - just store F16 weights, no scales/biases
if (requant_type.value() == ExtraQuantType::F16) {
layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes
layout.total_size = layout.weights_size;
layout.weights_offset = 0;
// No scales/biases for F16
return layout;
}
// Requant to different quantized format (e.g., Q4_0_128)
switch (requant_type.value()) {
case ExtraQuantType::Q4_0_128:
layout.is_u4 = true;
layout.weights_per_block = 128;
break;
case ExtraQuantType::Q8_0_32:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported requant type - fall through to normal extraction
layout.is_requant = false;
layout.requant_type = std::nullopt;
break;
}
if (layout.is_requant) {
// Calculate sizes for requantized format
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
layout.biases_size = n_blocks * sizeof(uint16_t);
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset =
layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
return layout;
}
}
// Normal extraction (no requant) - determine format based on tensor type
switch (tensor->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
layout.is_u4 = true;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q8_0:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q6_K:
layout.is_u4 = false;
layout.weights_per_block = 16;
break;
case GGML_TYPE_Q5_K:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported quantization type
return layout;
}
// Calculate sizes
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
// Scales and biases: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// Layout in buffer: [weights | scales | biases] with alignment
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
return layout;
}
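As a worked example of the offset arithmetic above, the following self-contained sketch computes the layout for a hypothetical 4096x4096 Q4_0 weight on the normal extraction path (U4 storage, 32 weights per block, 64-byte alignment); the tensor size is illustrative, not taken from a real model:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_elements = 4096LL * 4096;          // hypothetical 4096 x 4096 Q4_0 weight
        const size_t  alignment  = 64;                     // same alignment as the layout code
        const int64_t block      = 32;                     // Q4_0: 32 weights per block

        const size_t  weights_size = n_elements / 2;               // U4 packs two weights per byte -> 8 MiB
        const int64_t n_blocks     = n_elements / block;           // 524,288 blocks
        const size_t  scales_size  = n_blocks * sizeof(uint16_t);  // one F16 scale per block -> 1 MiB
        const size_t  biases_size  = n_blocks * sizeof(uint16_t);  // one F16 bias per block  -> 1 MiB

        // [weights | scales | biases], each region start rounded up to the alignment
        const size_t scales_offset = ((weights_size + alignment - 1) / alignment) * alignment;
        const size_t biases_offset = scales_offset + ((scales_size + alignment - 1) / alignment) * alignment;
        const size_t total_size    = biases_offset + biases_size;  // 10,485,760 bytes = 10 MiB

        std::printf("scales at %zu, biases at %zu, total %zu bytes\n", scales_offset, biases_offset, total_size);
        return 0;
    }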
@@ -1,16 +1,20 @@
#pragma once
#include "ggml.h"
#include "openvino/runtime/core.hpp"
#include <cstdlib>
#include <memory>
#include <optional>
#include <openvino/core/node.hpp>
#include <openvino/runtime/tensor.hpp>
#include <optional>
#include <string>
#include "ggml.h"
// ExtraQuantType enum - defines requantization target formats
enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };
ov::Core & ov_singleton_core();
// =====================================================
// Global Device Configuration (singleton)
// =====================================================
@@ -21,56 +25,23 @@ struct ggml_openvino_device_config {
bool is_npu = false;
bool initialized = false;
void init() {
if (initialized) return;
const char* env = std::getenv("GGML_OPENVINO_DEVICE");
if (env) {
device_name = env;
is_npu = (device_name == "NPU");
}
initialized = true;
}
void init();
};
// Get the global device config singleton
inline ggml_openvino_device_config& ggml_openvino_get_device_config() {
static ggml_openvino_device_config config;
return config;
}
ggml_openvino_device_config & ggml_openvino_get_device_config();
// Initialize device config (call during backend init)
inline void ggml_openvino_init_device_config() {
ggml_openvino_get_device_config().init();
}
void ggml_openvino_init_device_config();
// Get the device name
inline const std::string& ggml_openvino_get_device_name() {
return ggml_openvino_get_device_config().device_name;
}
const std::string & ggml_openvino_get_device_name();
// Check if running on NPU
inline bool ggml_openvino_is_npu() {
return ggml_openvino_get_device_config().is_npu;
}
bool ggml_openvino_is_npu();
// Get requantization type for a tensor type (returns nullopt if no requant needed)
inline std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
if (!ggml_openvino_is_npu()) {
return std::nullopt;
}
// NPU requantization rules
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
return ExtraQuantType::Q4_0_128;
case GGML_TYPE_Q6_K:
case GGML_TYPE_Q5_K:
return ExtraQuantType::F16;
default:
return std::nullopt;
}
}
std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type);
// =====================================================
// OpenVINO Tensor Extra Types
@@ -140,108 +111,4 @@ struct ggml_openvino_extracted_layout {
};
// Calculate the buffer layout for extracted quantized data
inline ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {
ggml_openvino_extracted_layout layout = {};
if (!ggml_is_quantized(tensor->type)) {
return layout;
}
// Only handle 2D weight tensors
if (tensor->ne[2] != 1 || tensor->ne[3] != 1) {
return layout;
}
int64_t n_elements = ggml_nelements(tensor);
const size_t alignment = 64; // Good for SIMD
// Check if requantization is needed (NPU-specific)
auto requant_type = ggml_openvino_get_requant_type(tensor->type);
if (requant_type.has_value()) {
layout.is_requant = true;
layout.requant_type = requant_type;
// Special case: requant to F16 - just store F16 weights, no scales/biases
if (requant_type.value() == ExtraQuantType::F16) {
layout.weights_size = n_elements * sizeof(uint16_t); // F16 = 2 bytes
layout.total_size = layout.weights_size;
layout.weights_offset = 0;
// No scales/biases for F16
return layout;
}
// Requant to different quantized format (e.g., Q4_0_128)
switch (requant_type.value()) {
case ExtraQuantType::Q4_0_128:
layout.is_u4 = true;
layout.weights_per_block = 128;
break;
case ExtraQuantType::Q8_0_32:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported requant type - fall through to normal extraction
layout.is_requant = false;
layout.requant_type = std::nullopt;
break;
}
if (layout.is_requant) {
// Calculate sizes for requantized format
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t);
layout.biases_size = n_blocks * sizeof(uint16_t);
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
layout.total_size = std::max(layout.total_size, ggml_nbytes(tensor));
return layout;
}
}
// Normal extraction (no requant) - determine format based on tensor type
switch (tensor->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_K:
layout.is_u4 = true;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q8_0:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
case GGML_TYPE_Q6_K:
layout.is_u4 = false;
layout.weights_per_block = 16;
break;
case GGML_TYPE_Q5_K:
layout.is_u4 = false;
layout.weights_per_block = 32;
break;
default:
// Unsupported quantization type
return layout;
}
// Calculate sizes
// Weights: U4 = n_elements/2 bytes, U8 = n_elements bytes
layout.weights_size = layout.is_u4 ? (n_elements / 2) : n_elements;
// Scales and biases: F16 per block
int64_t n_blocks = n_elements / layout.weights_per_block;
layout.scales_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
layout.biases_size = n_blocks * sizeof(uint16_t); // F16 = 2 bytes
// Layout in buffer: [weights | scales | biases] with alignment
layout.weights_offset = 0;
layout.scales_offset = ((layout.weights_size + alignment - 1) / alignment) * alignment;
layout.biases_offset = layout.scales_offset + ((layout.scales_size + alignment - 1) / alignment) * alignment;
layout.total_size = layout.biases_offset + layout.biases_size;
return layout;
}
ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);
@@ -12,7 +12,11 @@
#include <cstring>
#include <memory>
#include <mutex>
#include <openvino/core/type/element_type.hpp>
#include <openvino/openvino.hpp>
#include <openvino/runtime/allocator.hpp>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
#include <openvino/runtime/tensor.hpp>
#include <set>
#include <string>
@@ -48,7 +52,8 @@ struct ggml_backend_openvino_buffer_context {
// For non-weight buffers (KV cache, compute), we still use contiguous allocation
void * data;
size_t size;
bool is_weight_buffer; // Set when buffer usage is set to WEIGHTS
std::shared_ptr<ov::Tensor> ov_tensor;
// Track all extras for cleanup
std::vector<ggml_openvino_extra_base *> tensor_extras;
@@ -57,18 +62,42 @@ struct ggml_backend_openvino_buffer_context {
device(device),
name(std::string(GGML_OPENVINO_NAME) + std::to_string(device)),
data(nullptr),
size(size),
is_weight_buffer(false) {
// Allocate aligned contiguous memory
if (size > 0) {
size(size) {
if (size == 0) {
return;
}
const auto & device_name = ggml_openvino_get_device_name();
auto & core = ov_singleton_core();
if (device_name == "CPU") {
#ifdef _WIN32
data = _aligned_malloc(size, GGML_OPENVINO_BUFFER_ALIGNMENT);
data = _aligned_malloc(alloc_size, GGML_OPENVINO_BUFFER_ALIGNMENT);
#else
data = aligned_alloc(GGML_OPENVINO_BUFFER_ALIGNMENT, size);
#endif
if (data == nullptr) {
GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size);
}
ov_tensor = std::make_shared<ov::Tensor>(ov::element::u8, ov::Shape{size}, data);
} else if (device_name == "GPU") {
auto gpu_context = core.get_default_context("GPU").as<ov::intel_gpu::ocl::ClContext>();
auto usm_tensor = gpu_context.create_usm_host_tensor(ov::element::u8, ov::Shape{size});
data = usm_tensor.get();
ov_tensor = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
} else {
auto npu_context = core.get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
auto l0_tensor = npu_context.create_l0_host_tensor(ov::element::u8, ov::Shape{size});
data = l0_tensor.get();
ov_tensor = std::make_shared<ov::intel_npu::level_zero::ZeroBufferTensor>(std::move(l0_tensor));
}
if (data == nullptr) {
GGML_LOG_ERROR("%s: failed to allocate %zu bytes\n", __func__, size);
return;
}
if (reinterpret_cast<uintptr_t>(data) % GGML_OPENVINO_BUFFER_ALIGNMENT != 0) {
GGML_LOG_ERROR("%s: %s buffer is not aligned to %d bytes\n", __func__, device_name.c_str(),
GGML_OPENVINO_BUFFER_ALIGNMENT);
GGML_ABORT("fatal error");
}
}
@@ -78,15 +107,12 @@ struct ggml_backend_openvino_buffer_context {
delete extra;
}
tensor_extras.clear();
// Free contiguous memory
if (data != nullptr) {
if (data && ggml_openvino_get_device_name() == "CPU") {
#ifdef _WIN32
_aligned_free(data);
#else
free(data);
#endif
data = nullptr;
}
}
};
@@ -156,57 +182,26 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
}
if (layout.total_size > 0) {
// Quantized weight tensor with extraction/requantization
uint8_t * buf_base = (uint8_t *) tensor->data;
// 2D shape for weights [rows, cols]
ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
try {
std::shared_ptr<ov::Node> constant;
std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, buf_base);
constant->set_friendly_name(tensor->name);
if (layout.is_requant && layout.requant_type.has_value()) {
// Requantization path
if (layout.requant_type.value() == ExtraQuantType::F16) {
// Requant to F16: create F16 tensor with external memory, requantize fills it
ov::Tensor weights(ov::element::f16, weight_shape, buf_base);
ov::Tensor dummy_scales, dummy_biases; // Not used for F16
// requantize_to_buffers fills weights and returns a Constant wrapping it
constant = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales,
dummy_biases);
// Store in tensor->extra (use weight_extra since it's F16)
auto * extra = new ggml_openvino_weight_extra(constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
} else {
// Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
ov::Tensor weights(weight_type, weight_shape, buf_base + layout.weights_offset);
ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
constant = requantize_to_buffers(tensor, data, layout.requant_type.value(),
layout.weights_per_block, weights, scales, biases);
// Store in tensor->extra
auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
std::move(biases), constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
layout.is_u4 ? 4 : 8, layout.weights_per_block);
}
// Store in tensor->extra
if (layout.is_requant && layout.requant_type.has_value() &&
layout.requant_type.value() == ExtraQuantType::F16) {
// F16 requant case - use weight_extra
auto * extra = new ggml_openvino_weight_extra(constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
} else {
// Normal extraction path (no requant)
// Quantized case - use quantized_weight_extra
// Create tensors with external memory (already filled by process_weight_tensor)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
ov::Shape weight_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
ov::Shape scale_shape = {static_cast<size_t>(tensor->ne[1]),
static_cast<size_t>(tensor->ne[0] / layout.weights_per_block)};
@@ -214,16 +209,20 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
ov::Tensor scales(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
ov::Tensor biases(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
constant = extract_quantized_weights(tensor, data, weights, scales, biases);
// Store in tensor->extra
auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
std::move(biases), constant);
ctx->tensor_extras.push_back(extra);
tensor->extra = extra;
GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
if (layout.is_requant) {
GGML_LOG_DEBUG("%s: requantized %s to %s (u%d, block_size=%ld)\n", __func__, tensor->name,
layout.requant_type.value() == ExtraQuantType::Q4_0_128 ? "Q4_0_128" : "Q8_0_32",
layout.is_u4 ? 4 : 8, layout.weights_per_block);
} else {
int64_t n_blocks = ggml_nelements(tensor) / layout.weights_per_block;
GGML_LOG_DEBUG("%s: extracted quantized constant for %s (u%d, %zu weights, %ld blocks)\n", __func__,
tensor->name, layout.is_u4 ? 4 : 8, layout.weights_size, n_blocks);
}
}
} catch (const std::exception & e) {
@@ -233,32 +232,9 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer
}
} else if (is_weight_buffer && is_full_tensor_set && is_2d &&
(tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16)) {
// F16/F32/BF16 weight tensor - copy data and create shared-memory constant
memcpy((char *) tensor->data + offset, data, size);
// F16/F32/BF16 weight tensor
try {
// Get OpenVINO element type
ov::element::Type element_type;
switch (tensor->type) {
case GGML_TYPE_F32:
element_type = ov::element::f32;
break;
case GGML_TYPE_F16:
element_type = ov::element::f16;
break;
case GGML_TYPE_BF16:
element_type = ov::element::bf16;
break;
default:
return; // Should not happen
}
// Create 2D shape (OpenVINO expects [rows, cols])
ov::Shape shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
// Create ov::Tensor with external memory, then wrap with Constant
ov::Tensor ov_tensor(element_type, shape, tensor->data);
auto constant = std::make_shared<ov::op::v0::Constant>(ov_tensor);
std::shared_ptr<ov::Node> constant = process_weight_tensor(tensor, data, tensor->data);
constant->set_friendly_name(tensor->name);
// Store in tensor->extra
@@ -418,7 +394,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(in
}
// Check if a buffer is an OpenVINO buffer
static bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
}
@@ -569,6 +569,112 @@ std::shared_ptr<ov::Node> requantize(const ggml_tensor * tensor, ExtraQuantType
return requantize_to_buffers(tensor, tensor->data, requant_type, block_size, weights, scales, biases);
}
std::shared_ptr<ov::Node> process_weight_tensor(const ggml_tensor * tensor, const void * data, void * output_base_ptr) {
GGML_ASSERT(tensor != nullptr);
GGML_ASSERT(data != nullptr);
// Get 2D shape for weights [rows, cols]
ov::Shape node_shape = {static_cast<size_t>(tensor->ne[1]), static_cast<size_t>(tensor->ne[0])};
// Handle F16/F32/BF16 weights
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
ov::element::Type element_type;
switch (tensor->type) {
case GGML_TYPE_F32:
element_type = ov::element::f32;
break;
case GGML_TYPE_F16:
element_type = ov::element::f16;
break;
case GGML_TYPE_BF16:
element_type = ov::element::bf16;
break;
default:
OPENVINO_THROW("Unexpected tensor type in F16/F32/BF16 path");
}
if (output_base_ptr) {
// Using external buffer - copy data and create shared-memory constant
size_t tensor_bytes = ggml_nbytes(tensor);
memcpy(output_base_ptr, data, tensor_bytes);
ov::Tensor ov_tensor(element_type, node_shape, output_base_ptr);
return std::make_shared<ov::op::v0::Constant>(ov_tensor);
} else {
// Allocate internal buffer
ov::Tensor weights(element_type, node_shape);
memcpy(weights.data(), data, ggml_nelements(tensor) * element_type.size());
return std::make_shared<ov::op::v0::Constant>(weights);
}
}
// Handle quantized weights
if (!ggml_is_quantized(tensor->type)) {
OPENVINO_THROW("Unsupported weight tensor type: ", ggml_type_name(tensor->type));
}
auto layout = ggml_openvino_get_extracted_layout(tensor);
if (layout.total_size == 0) {
OPENVINO_THROW("Unsupported quantized type: ", ggml_type_name(tensor->type));
}
std::shared_ptr<ov::Node> result;
if (layout.is_requant && layout.requant_type.has_value()) {
// Requantization path
if (layout.requant_type.value() == ExtraQuantType::F16) {
// Requant to F16
ov::Tensor weights;
if (output_base_ptr) {
weights = ov::Tensor(ov::element::f16, node_shape,
static_cast<uint8_t *>(output_base_ptr) + layout.weights_offset);
} else {
weights = ov::Tensor(ov::element::f16, node_shape);
}
ov::Tensor dummy_scales, dummy_biases; // Not used for F16
result = requantize_to_buffers(tensor, data, ExtraQuantType::F16, 0, weights, dummy_scales, dummy_biases);
} else {
// Requant to quantized format (Q4_0_128, Q8_0_32, etc.)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
ov::Tensor weights, scales, biases;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
} else {
weights = ov::Tensor(weight_type, node_shape);
scales = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, scale_shape);
}
result = requantize_to_buffers(tensor, data, layout.requant_type.value(), layout.weights_per_block, weights,
scales, biases);
}
} else {
// Normal extraction path (no requant)
ov::element::Type weight_type = layout.is_u4 ? ov::element::u4 : ov::element::u8;
ov::Shape scale_shape = {node_shape[0], node_shape[1] / layout.weights_per_block};
ov::Tensor weights, scales, biases;
if (output_base_ptr) {
uint8_t * buf_base = static_cast<uint8_t *>(output_base_ptr);
weights = ov::Tensor(weight_type, node_shape, buf_base + layout.weights_offset);
scales = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.scales_offset);
biases = ov::Tensor(ov::element::f16, scale_shape, buf_base + layout.biases_offset);
} else {
weights = ov::Tensor(weight_type, node_shape);
scales = ov::Tensor(ov::element::f16, scale_shape);
biases = ov::Tensor(ov::element::f16, scale_shape);
}
result = extract_quantized_weights(tensor, data, weights, scales, biases);
}
return result;
}
void quantize_q4_0(const float * x,
ov::Tensor & weights_arr,
ov::Tensor & scales_arr,
@@ -78,6 +78,16 @@ std::shared_ptr<ov::Node> requantize_to_buffers(const ggml_tensor * tensor,
ov::Tensor & scales,
ov::Tensor & biases);
// Process weight tensor and create an OpenVINO constant node
// Handles F16/F32/BF16 and quantized weights, with optional requantization
// If output_base_ptr is nullptr, allocates internal buffers (for decoder use)
// If output_base_ptr is provided, uses pre-allocated buffers at specified offsets (for backend buffer use)
// Returns the weight constant node
std::shared_ptr<ov::Node> process_weight_tensor(
const ggml_tensor * tensor,
const void * data, // Source data pointer (may differ from tensor->data)
void * output_base_ptr = nullptr); // Base pointer for output buffers (or nullptr for internal allocation)
void quantize_q4_0(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
int64_t qk);
void quantize_q8_1(const float* x, ov::Tensor& weights_arr, ov::Tensor& scales_arr, ov::Tensor& biases_arr, int64_t k,
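A sketch of the two calling modes of process_weight_tensor, following the call sites elsewhere in this commit (error handling and surrounding setup omitted):

    // Decoder path: no destination buffer, so process_weight_tensor() allocates
    // internal ov::Tensor storage for the extracted weights/scales/biases.
    std::shared_ptr<ov::Node> node = process_weight_tensor(tensor, tensor->data, nullptr);

    // Backend buffer path (ggml_backend_openvino_buffer_set_tensor): the extracted or
    // requantized data is written straight into the OpenVINO-owned buffer behind
    // tensor->data, at the offsets given by ggml_openvino_get_extracted_layout().
    std::shared_ptr<ov::Node> shared_node = process_weight_tensor(tensor, data, tensor->data);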
@@ -107,7 +107,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin
infer_request_cache.erase(key);
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static);
decoder_end_time = ggml_time_us();
@@ -255,7 +255,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
infer_request_cache_prefill.erase(key);
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device));
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);
auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
is_static, true, prefill_chunk_size);
@@ -404,21 +404,6 @@ ov::AnyMap get_ov_compile_config(const std::string & device) {
return config;
}
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device) {
// Use singleton to check if NPU (device param kept for API compatibility)
if (ggml_openvino_is_npu()) {
return {
{GGML_TYPE_Q4_0, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q4_1, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q4_K, ExtraQuantType::Q4_0_128},
{GGML_TYPE_Q6_K, ExtraQuantType::F16 },
{GGML_TYPE_Q5_K, ExtraQuantType::F16 },
};
}
return {};
GGML_UNUSED(device);
}
bool is_naive(ggml_cgraph * cgraph) {
constexpr int naive_graph_size_threshold = 20;
return cgraph->n_nodes < naive_graph_size_threshold;
@@ -73,8 +73,6 @@ graph_key compute_graph_key(struct ggml_cgraph * cgraph);
ov::AnyMap get_ov_compile_config(const std::string & device);
std::map<ggml_type, ExtraQuantType> get_types_to_requant(const std::string & device);
ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
const std::string & param_name);