Add ov_backend_host_buffer; Use cached remote context
parent 72bba828df
commit 3fdcb6ab72
@@ -18,9 +18,17 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);

 GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);

+GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer);
+
+GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft);
+
+GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);
+
 // device buffer
 GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);

+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device);
+
 GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
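A quick usage sketch of the new public API (hedged; it assumes the OpenVINO backend is registered and at least one device is present):

    // Sketch: query the new host buffer type and the matching predicates.
    int n_dev = ggml_backend_openvino_get_device_count();
    if (n_dev > 0) {
        ggml_backend_buffer_type_t dev_buft  = ggml_backend_openvino_buffer_type(0);
        ggml_backend_buffer_type_t host_buft = ggml_backend_openvino_host_buffer_type(0);
        GGML_ASSERT(ggml_backend_buft_is_openvino(dev_buft));
        GGML_ASSERT(ggml_backend_buft_is_openvino_host(host_buft));
    }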
@@ -593,11 +593,19 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename)
              << std::setw(20) << "op"
              << std::setw(20) << "name"
              << std::setw(3) << " "
-             << std::setw(50) << "stride"
+             << std::setw(62) << "stride"
+             << std::setw(20) << "buffer_type"
              << "\n";
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

+        // Get buffer type name
+        const char * buf_name = "none";
+        ggml_backend_buffer_t buf = node->view_src ? node->view_src->buffer : node->buffer;
+        if (buf) {
+            buf_name = ggml_backend_buffer_name(buf);
+        }
+
         file << " - " << std::setw(3) << i << ": [ "
              << std::setw(5) << node->ne[0] << ", "
              << std::setw(5) << node->ne[1] << ", "

@@ -610,10 +618,18 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename)
              << std::setw(5) << node->nb[1] << ", "
              << std::setw(5) << node->nb[2] << ", "
              << std::setw(5) << node->nb[3] << "] "
+             << std::right << std::setw(15) << buf_name << std::right
              << "\n";

         for (int i = 0; i < GGML_MAX_SRC; i++) {
             if (auto* src = node->src[i]) {
+                // Get buffer type name for source
+                const char * src_buf_name = "none";
+                ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer;
+                if (src_buf) {
+                    src_buf_name = ggml_backend_buffer_name(src_buf);
+                }
+
                 file << std::setw(10) << " [ "
                      << std::setw(5) << src->ne[0] << ", "
                      << std::setw(5) << src->ne[1] << ", "

@@ -627,6 +643,7 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename)
                      << std::setw(5) << src->nb[1] << ", "
                      << std::setw(5) << src->nb[2] << ", "
                      << std::setw(5) << src->nb[3] << "] "
+                     << std::right << std::setw(15) << src_buf_name << std::right
                      << "\n";
             }
         }

@@ -636,11 +653,19 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filename)
     for (int i = 0; i < cgraph->n_leafs; i++) {
         ggml_tensor * node = cgraph->leafs[i];

+        // Get buffer type name for leaf
+        const char * leaf_buf_name = "none";
+        ggml_backend_buffer_t leaf_buf = node->view_src ? node->view_src->buffer : node->buffer;
+        if (leaf_buf) {
+            leaf_buf_name = ggml_backend_buffer_name(leaf_buf);
+        }
+
         file << " - " << std::setw(3) << i << ": [ "
              << std::setw(5) << node->ne[0] << ", "
              << std::setw(5) << node->ne[1] << "] "
              << std::setw(8) << ggml_op_name(node->op) << " "
-             << std::setw(16) << ggml_get_name(node) << "\n";
+             << std::setw(16) << ggml_get_name(node)
+             << std::setw(20) << leaf_buf_name << "\n";
     }
     // clang-format on
     file << "========================================\n";
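The new "buffer_type" column is driven by the same lookup for nodes, sources, and leaves; a condensed sketch of that logic as a helper (hypothetical name, the diff inlines it three times):

    static const char * effective_buffer_name(const ggml_tensor * t) {
        // a view tensor reports the buffer of the tensor it aliases
        ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
        return buf ? ggml_backend_buffer_name(buf) : "none";
    }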
@@ -2,6 +2,9 @@

 #include "ggml-impl.h"

+#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
+#include <openvino/runtime/intel_npu/level_zero/level_zero.hpp>
+
 ov::Core & ov_singleton_core() {
     static ov::Core core;
     return core;
@@ -22,6 +25,31 @@ void ggml_openvino_device_config::init() {
         device_name = "CPU";
     }
     is_npu = (device_name == "NPU");
+
+    auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
+    if (device_name == "NPU") {
+        compile_config = {
+            {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"   },
+            {"NPU_USE_NPUW",                      "YES"   },
+            {"NPUW_DEVICES",                      "NPU"   },
+            {"NPUW_FOLD",                         "YES"   },
+            {"NPUW_WEIGHTS_BANK",                 "shared"},
+            {"NPUW_FUNCALL_FOR_ALL",              "YES"   },
+            {"NPUW_FUNCALL_ASYNC",                "YES"   },
+            {"NPUW_DQ",                           "YES"   },
+            {"NPUW_DQ_FULL",                      "NO"    },
+        };
+        if (cache_dir) {
+            compile_config["NPUW_CACHE_DIR"] = cache_dir;
+        }
+    } else if (cache_dir) {
+        ov_singleton_core().set_property(ov::cache_dir(cache_dir));
+    }
+
+    if (device_name != "CPU") {
+        remote_context = ov_singleton_core().get_default_context(device_name);
+    }
+
     initialized = true;
 }
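The effect of the change above, in brief: the compile config and the remote context are computed once at device-config init instead of on every graph compile. A minimal sketch of the consumer side (using the getters declared later in this commit):

    auto ctx = ggml_openvino_get_remote_context();                // std::nullopt on CPU
    const ov::AnyMap & cfg = ggml_openvino_get_compile_config();  // NPUW flags on NPU

All later compile sites branch on whether the optional holds a context.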
@@ -46,6 +74,16 @@ bool ggml_openvino_is_npu() {
     return ggml_openvino_get_device_config().is_npu;
 }

+// Get the remote context for the current device (returns empty optional for CPU)
+std::optional<ov::RemoteContext> ggml_openvino_get_remote_context() {
+    return ggml_openvino_get_device_config().remote_context;
+}
+
+// Get the compile config for the current device
+const ov::AnyMap & ggml_openvino_get_compile_config() {
+    return ggml_openvino_get_device_config().compile_config;
+}
+
 // Get requantization type for a tensor type (returns nullopt if no requant needed)
 std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) {
     if (!ggml_openvino_is_npu()) {
@@ -175,3 +213,50 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor) {

     return layout;
 }
+
+ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor) {
+    ov::Shape shape;
+    for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) {
+        shape.push_back(static_cast<size_t>(tensor->ne[i]));
+    }
+
+    ov::element::Type element_type;
+    switch (tensor->type) {
+        case GGML_TYPE_F32:
+            element_type = ov::element::f32;
+            break;
+        case GGML_TYPE_F16:
+            element_type = ov::element::f16;
+            break;
+        case GGML_TYPE_BF16:
+            element_type = ov::element::bf16;
+            break;
+        case GGML_TYPE_I32:
+            element_type = ov::element::i32;
+            break;
+        case GGML_TYPE_I64:
+            element_type = ov::element::i64;
+            break;
+        default:
+            GGML_LOG_ERROR("%s: unsupported tensor type for ov::Tensor: %s\n", __func__, ggml_type_name(tensor->type));
+            return nullptr;
+    }
+
+    const auto & device_name = ggml_openvino_get_device_name();
+    auto remote_context = ggml_openvino_get_remote_context();
+
+    std::shared_ptr<ov::Tensor> ov_tensor;
+    if (device_name == "CPU") {
+        ov_tensor = std::make_shared<ov::Tensor>(element_type, shape, tensor->data);
+    } else if (device_name == "GPU") {
+        auto gpu_context = remote_context->as<ov::intel_gpu::ocl::ClContext>();
+        auto usm_tensor = gpu_context.create_tensor(element_type, shape, tensor->data);
+        ov_tensor = std::make_shared<ov::intel_gpu::ocl::USMTensor>(std::move(usm_tensor));
+    } else {
+        auto npu_context = remote_context->as<ov::intel_npu::level_zero::ZeroContext>();
+        auto l0_tensor = npu_context.create_tensor(element_type, shape, tensor->data);
+        ov_tensor = std::make_shared<ov::intel_npu::level_zero::ZeroBufferTensor>(std::move(l0_tensor));
+    }
+
+    return new ggml_openvino_tensor_extra(ov_tensor);
+}
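A hedged usage sketch of the new factory: `t` stands for a hypothetical F32 ggml tensor whose `data` already points at valid memory. Note the shape is reversed, since ggml's ne[0] is the fastest-moving dimension and OpenVINO's last axis is:

    ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(t);
    if (extra != nullptr) {
        // the wrapped ov::Tensor aliases t->data; no copy is made on any device
        ov::Shape shape = extra->tensor->get_shape();  // { ne[3], ne[2], ne[1], ne[0] }
        t->extra = extra;  // ownership is tracked by the buffer context's map
    }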
@@ -6,6 +6,7 @@
 #include <cstdlib>
 #include <memory>
 #include <openvino/core/node.hpp>
+#include <openvino/runtime/remote_context.hpp>
 #include <openvino/runtime/tensor.hpp>
 #include <optional>
 #include <string>

@@ -15,6 +16,12 @@ enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 };

 ov::Core & ov_singleton_core();

+// Get the remote context for the current device (returns empty optional for CPU)
+std::optional<ov::RemoteContext> ggml_openvino_get_remote_context();
+
+// Get the compile config for the current device
+const ov::AnyMap & ggml_openvino_get_compile_config();
+
 // =====================================================
 // Global Device Configuration (singleton)
 // =====================================================

@@ -24,6 +31,8 @@ struct ggml_openvino_device_config {
     std::string device_name = "CPU";
     bool is_npu = false;
     bool initialized = false;
+    std::optional<ov::RemoteContext> remote_context;
+    ov::AnyMap compile_config;

     void init();
 };

@@ -112,3 +121,5 @@ struct ggml_openvino_extracted_layout {

 // Calculate the buffer layout for extracted quantized data
 ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor);
+
+ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor);
@@ -56,7 +56,7 @@ struct ggml_backend_openvino_buffer_context {
     std::shared_ptr<ov::Tensor> ov_tensor;

     // Track all extras for cleanup
-    std::vector<ggml_openvino_extra_base *> tensor_extras;
+    std::map<ggml_tensor *, ggml_openvino_extra_base *> tensor_extras;

     ggml_backend_openvino_buffer_context(int device, size_t size) :
         device(device),

@@ -103,8 +103,8 @@ struct ggml_backend_openvino_buffer_context {

     ~ggml_backend_openvino_buffer_context() {
         // Clean up all tensor extras
-        for (auto * extra : tensor_extras) {
-            delete extra;
+        for (auto & pair : tensor_extras) {
+            delete pair.second;
         }
         tensor_extras.clear();
         if (data && ggml_openvino_get_device_name() == "CPU") {
@@ -144,9 +144,20 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
         return GGML_STATUS_SUCCESS;
     }

-    // For non-view tensors, tensor->extra will be set in set_tensor
-    // when the actual weight data is loaded
-    GGML_UNUSED(buffer);
+    ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context;
+
+    if (tensor->data != nullptr) {
+        ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor);
+        if (extra != nullptr) {
+            auto it = ctx->tensor_extras.find(tensor);
+            if (it != ctx->tensor_extras.end()) {
+                delete it->second;
+            }
+            ctx->tensor_extras[tensor] = extra;
+            tensor->extra = extra;
+        }
+    }

     return GGML_STATUS_SUCCESS;
 }
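init_tensor, the weight paths of set_tensor, and the non-weight path further down all share the same replace-or-insert step against the new map. A small helper capturing the pattern (hypothetical; the diff inlines it):

    static void ggml_openvino_register_extra(ggml_backend_openvino_buffer_context * ctx,
                                             ggml_tensor * tensor,
                                             ggml_openvino_extra_base * extra) {
        auto it = ctx->tensor_extras.find(tensor);
        if (it != ctx->tensor_extras.end()) {
            delete it->second;  // drop a stale extra from an earlier init/set
        }
        ctx->tensor_extras[tensor] = extra;
        tensor->extra = extra;
    }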
@@ -194,7 +205,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
             layout.requant_type.value() == ExtraQuantType::F16) {
             // F16 requant case - use weight_extra
             auto * extra = new ggml_openvino_weight_extra(constant);
-            ctx->tensor_extras.push_back(extra);
+            ctx->tensor_extras[tensor] = extra;
             tensor->extra = extra;
             GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name);
         } else {

@@ -211,7 +222,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {

             auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales),
                                                                     std::move(biases), constant);
-            ctx->tensor_extras.push_back(extra);
+            ctx->tensor_extras[tensor] = extra;
             tensor->extra = extra;

             if (layout.is_requant) {

@@ -239,7 +250,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {

         // Store in tensor->extra
         ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant);
-        ctx->tensor_extras.push_back(extra);
+        ctx->tensor_extras[tensor] = extra;
         tensor->extra = extra;

         GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name);
@@ -251,6 +262,19 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     } else {
         // Non-weight tensor (KV cache, activations, etc.) - just copy data
        memcpy((char *) tensor->data + offset, data, size);
+
+        ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor);
+        if (extra == nullptr) {
+            GGML_LOG_ERROR("%s: failed to create tensor extra for %s\n", __func__, tensor->name);
+            return;
+        }
+
+        auto it = ctx->tensor_extras.find(tensor);
+        if (it != ctx->tensor_extras.end()) {
+            delete it->second;
+        }
+        ctx->tensor_extras[tensor] = extra;
+        tensor->extra = extra;
     }
 }
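Why non-weight tensors now also get an extra: the cached ov::Tensor can later be bound to an infer request without a host copy (see convert_ggml_input_to_ov near the end of this commit). A sketch of that consumer, with a hypothetical input name and an `infer_request` from the surrounding code:

    auto * extra = static_cast<ggml_openvino_tensor_extra *>(tensor->extra);
    infer_request.set_tensor("cache_k_l0", *extra->tensor);  // zero-copy bind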
@@ -393,11 +417,67 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device) {
     return &buffer_types[device];
 }

-// Check if a buffer is an OpenVINO buffer
+// =====================================================
+// OpenVINO Host Buffer Implementation
+// =====================================================
+
+static const char * ggml_backend_openvino_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context;
+    static std::string name;
+    name = ctx->name + "_HOST";
+    return name.c_str();
+}
+
+static const ggml_backend_buffer_type_i ggml_backend_openvino_host_buffer_type_interface = {
+    /* .get_name       = */ ggml_backend_openvino_host_buffer_type_get_name,
+    /* .alloc_buffer   = */ ggml_backend_openvino_buffer_type_alloc_buffer,
+    /* .get_alignment  = */ ggml_backend_openvino_buffer_type_get_alignment,
+    /* .get_max_size   = */ ggml_backend_openvino_buffer_type_get_max_size,
+    /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size,
+    /* .is_host        = */ ggml_backend_openvino_buffer_type_is_host,
+};
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device) {
+    GGML_ASSERT(device >= 0 && device < ggml_backend_openvino_get_device_count());
+
+    static std::mutex mutex;
+    std::lock_guard<std::mutex> lock(mutex);
+
+    static std::vector<ggml_backend_buffer_type> buffer_types;
+    static std::vector<ggml_backend_openvino_buffer_type_context> buffer_type_contexts;
+
+    if (buffer_types.empty()) {
+        int device_count = ggml_backend_openvino_get_device_count();
+        buffer_types.resize(device_count);
+        buffer_type_contexts.resize(device_count);
+
+        for (int i = 0; i < device_count; i++) {
+            buffer_type_contexts[i].device = i;
+            buffer_type_contexts[i].name = std::string(GGML_OPENVINO_NAME) + std::to_string(i);
+
+            buffer_types[i] = ggml_backend_buffer_type{
+                /* .iface   = */ ggml_backend_openvino_host_buffer_type_interface,
+                /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), i),
+                /* .context = */ &buffer_type_contexts[i],
+            };
+        }
+    }
+
+    return &buffer_types[device];
+}
+
 bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) {
     return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer;
 }

+bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name;
+}
+
+bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name == ggml_backend_openvino_host_buffer_type_get_name;
+}
+
 // =====================================================
 // OpenVINO Backend Context and Interface
 // =====================================================
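The host type reuses the device type's allocation callbacks and differs only in its reported name (and therefore in what ggml_backend_buft_is_openvino_host matches on). A quick check sketch:

    ggml_backend_buffer_type_t host = ggml_backend_openvino_host_buffer_type(0);
    // expected to be the device buffer type's name with a "_HOST" suffix
    printf("%s\n", ggml_backend_buft_name(host));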
@@ -552,6 +632,11 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(ggml_backend_dev_t dev) {
     return ggml_backend_openvino_buffer_type(ctx->device);
 }

+static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context;
+    return ggml_backend_openvino_host_buffer_type(ctx->device);
+}
+
 static bool is_op_unsupported_case(const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_SOFT_MAX: {
@@ -731,7 +816,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {

 static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
     // Support our own buffer type and any host buffer (for mmap'd files, etc.)
-    return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name || ggml_backend_buft_is_host(buft);
+    return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft);
+    // return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_openvino_host(buft);
     GGML_UNUSED(dev);
 }
@@ -743,7 +829,8 @@ static const struct ggml_backend_device_i ggml_backend_openvino_device_interface = {
     /* .get_props            = */ ggml_backend_openvino_device_get_props,
     /* .init_backend         = */ ggml_backend_openvino_device_init,
     /* .get_buffer_type      = */ ggml_backend_openvino_device_get_buffer_type,
-    /* .get_host_buffer_type = */ NULL,
+    // /* .get_host_buffer_type = */ NULL,
+    /* .get_host_buffer_type = */ ggml_backend_openvino_device_get_host_buffer_type,
     /* .buffer_from_host_ptr = */ NULL,
     /* .supports_op          = */ ggml_backend_openvino_device_supports_op,
     /* .supports_buft        = */ ggml_backend_openvino_device_supports_buft,
@@ -37,11 +37,9 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"

-static ov::Core core;
-
 enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) {
     if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-        std::string filename = "cgraph.txt";
+        std::string filename = "cgraph_ov.txt";
         GgmlOvDecoder::dump_cgraph(cgraph, filename);
     }
@@ -52,8 +50,9 @@ enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) {
 }

 enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device) {
+    auto & core = ov_singleton_core();
+    const auto & config = ggml_openvino_get_compile_config();
     static auto is_static = false;
-    static auto config = get_ov_compile_config(device);

     // if (is_naive(cgraph)) {
     //     return naive_compute(cgraph, core, device, config);
@@ -124,7 +123,13 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device) {
         ov::serialize(model, timestamped_filename);
     }

-    auto compiled_model = core.compile_model(model, device, config);
+    ov::CompiledModel compiled_model;
+    auto remote_context = ggml_openvino_get_remote_context();
+    if (remote_context.has_value()) {
+        compiled_model = core.compile_model(model, remote_context.value(), config);
+    } else {
+        compiled_model = core.compile_model(model, device, config);
+    }
     compile_end_time = ggml_time_us();
     infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
     infer_request_cache[key] = infer_request;
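The same select-a-context pattern recurs in the static path and in naive_compute below; condensed (a sketch, with `core`, `model`, `device`, and `config` as in the surrounding code):

    ov::CompiledModel cm;
    if (auto rc = ggml_openvino_get_remote_context()) {
        cm = core.compile_model(model, rc.value(), config);  // reuse cached GPU/NPU context
    } else {
        cm = core.compile_model(model, device, config);      // CPU: no remote context
    }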
@@ -173,18 +178,20 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device) {

     if (getenv("GGML_OPENVINO_PROFILING")) {
         GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
-        GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
+        GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         if (!cache_hit) {
-            GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
-            GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
+            GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
+            GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
         }
-        GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
+        GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
     }

     return GGML_STATUS_SUCCESS;
 }

 enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
+    auto & core = ov_singleton_core();
+
     auto get_prefill_chunk_size = [] {
         const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE");
         if (chunk_size_str && atoi(chunk_size_str) > 0) {
@@ -196,7 +203,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
     static std::string device = "NPU";
     static auto is_static = true;
     static auto prefill_chunk_size = get_prefill_chunk_size();
-    static auto config = get_ov_compile_config(device);
+    const auto & config = ggml_openvino_get_compile_config();

     if (is_naive(cgraph)) {
         return naive_compute(cgraph, core, device, config);
@@ -281,8 +288,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {
         ov::serialize(model_decode, timestamped_filename);
     }

-    auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device));
-    auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device));
+    ov::CompiledModel compiled_model_prefill;
+    ov::CompiledModel compiled_model_decode;
+    auto remote_context = ggml_openvino_get_remote_context();
+    if (remote_context.has_value()) {
+        compiled_model_prefill = core.compile_model(model_prefill, remote_context.value(), config);
+        compiled_model_decode = core.compile_model(model_decode, remote_context.value(), config);
+    } else {
+        compiled_model_prefill = core.compile_model(model_prefill, device, config);
+        compiled_model_decode = core.compile_model(model_decode, device, config);
+    }

     infer_request_cache_prefill[key] =
         std::make_shared<ov::InferRequest>(compiled_model_prefill.create_infer_request());
@@ -369,41 +384,17 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) {

     if (getenv("GGML_OPENVINO_PROFILING")) {
         GGML_LOG_INFO("\nGGML OpenVINO Backend: \n");
-        GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
+        GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000);
         if (!cache_hit) {
-            GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
-            GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
+            GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000);
+            GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
         }
-        GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
+        GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000);
     }

     return GGML_STATUS_SUCCESS;
 }

-ov::AnyMap get_ov_compile_config(const std::string & device) {
-    ov::AnyMap config;
-    auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
-    if (device == "NPU") {
-        config = {
-            {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"   },
-            {"NPU_USE_NPUW",                      "YES"   },
-            {"NPUW_DEVICES",                      "NPU"   },
-            {"NPUW_FOLD",                         "YES"   },
-            {"NPUW_WEIGHTS_BANK",                 "shared"},
-            {"NPUW_FUNCALL_FOR_ALL",              "YES"   },
-            {"NPUW_FUNCALL_ASYNC",                "YES"   },
-            {"NPUW_DQ",                           "YES"   },
-            {"NPUW_DQ_FULL",                      "NO"    },
-        };
-        if (cache_dir) {
-            config["NPUW_CACHE_DIR"] = cache_dir;
-        }
-    } else if (cache_dir) {
-        core.set_property(ov::cache_dir(cache_dir));
-    }
-    return config;
-}
-
 bool is_naive(ggml_cgraph * cgraph) {
     constexpr int naive_graph_size_threshold = 20;
     return cgraph->n_nodes < naive_graph_size_threshold;
@@ -428,7 +419,14 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,
     if (getenv("GGML_OPENVINO_DUMP_IR")) {
         ov::serialize(model, "IR_naive.xml");
     }
-    auto infer_request = core.compile_model(model, device, config).create_infer_request();
+
+    ov::InferRequest infer_request;
+    auto remote_context = ggml_openvino_get_remote_context();
+    if (remote_context.has_value()) {
+        infer_request = core.compile_model(model, remote_context.value(), config).create_infer_request();
+    } else {
+        infer_request = core.compile_model(model, device, config).create_infer_request();
+    }
+
     auto ov_params = model->get_parameters();
     for (size_t i = 0; i < ov_params.size(); i++) {
@@ -451,6 +449,18 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph,

 namespace {
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & name) {
     const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
+
+    if (ggml_tensor->extra != nullptr) {
+        // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str());
+        auto * extra_base = static_cast<ggml_openvino_extra_base *>(ggml_tensor->extra);
+        if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) {
+            throw std::runtime_error("ggml tensor extra is not of type TENSOR for input: " + name);
+        }
+        auto * tensor_extra = static_cast<ggml_openvino_tensor_extra *>(extra_base);
+        return *tensor_extra->tensor;
+    }
+
+    // GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str());
     auto * input_data = ggml_tensor->data;
     ov::Shape input_shape;
     if (ggml_tensor->op == GGML_OP_VIEW) {
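For orientation, the downcasts above assume roughly this hierarchy, pieced together from names appearing in this diff; exact fields and the non-TENSOR variants are assumptions:

    struct ggml_openvino_extra_base {
        enum class Type { TENSOR /*, weight variants */ } type;
        virtual ~ggml_openvino_extra_base() = default;
    };

    struct ggml_openvino_tensor_extra : ggml_openvino_extra_base {
        std::shared_ptr<ov::Tensor> tensor;  // wraps or aliases the ggml data
    };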
@@ -71,8 +71,6 @@ bool get_is_prefill(const ggml_tensor * inp_pos);

 graph_key compute_graph_key(struct ggml_cgraph * cgraph);

-ov::AnyMap get_ov_compile_config(const std::string & device);
-
 ov::Tensor get_ov_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
 ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
                                              const std::string & param_name);