diff --git a/ggml/include/ggml-openvino.h b/ggml/include/ggml-openvino.h index b690a16378..392e26c48e 100644 --- a/ggml/include/ggml-openvino.h +++ b/ggml/include/ggml-openvino.h @@ -18,9 +18,17 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device); GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer); + +GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft); + +GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft); + // device buffer GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device); + GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void); diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2d6437f069..13ef00dcb6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -593,11 +593,19 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena << std::setw(20) << "op" << std::setw(20) << "name" << std::setw(3) << " " - << std::setw(50) << "stride" + << std::setw(62) << "stride" + << std::setw(20) << "buffer_type" << "\n"; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; + // Get buffer type name + const char * buf_name = "none"; + ggml_backend_buffer_t buf = node->view_src ? node->view_src->buffer : node->buffer; + if (buf) { + buf_name = ggml_backend_buffer_name(buf); + } + file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " << std::setw(5) << node->ne[1] << ", " @@ -610,10 +618,18 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena << std::setw(5) << node->nb[1] << ", " << std::setw(5) << node->nb[2] << ", " << std::setw(5) << node->nb[3] << "] " + << std::right << std::setw(15) << buf_name << std::right << "\n"; for (int i = 0; i < GGML_MAX_SRC; i++) { if (auto* src = node->src[i]) { + // Get buffer type name for source + const char * src_buf_name = "none"; + ggml_backend_buffer_t src_buf = src->view_src ? src->view_src->buffer : src->buffer; + if (src_buf) { + src_buf_name = ggml_backend_buffer_name(src_buf); + } + file << std::setw(10) << " [ " << std::setw(5) << src->ne[0] << ", " << std::setw(5) << src->ne[1] << ", " @@ -627,6 +643,7 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena << std::setw(5) << src->nb[1] << ", " << std::setw(5) << src->nb[2] << ", " << std::setw(5) << src->nb[3] << "] " + << std::right << std::setw(15) << src_buf_name << std::right << "\n"; } } @@ -636,11 +653,19 @@ void GgmlOvDecoder::dump_cgraph(const ggml_cgraph * cgraph, std::string & filena for (int i = 0; i < cgraph->n_leafs; i++) { ggml_tensor * node = cgraph->leafs[i]; + // Get buffer type name for leaf + const char * leaf_buf_name = "none"; + ggml_backend_buffer_t leaf_buf = node->view_src ? 
node->view_src->buffer : node->buffer; + if (leaf_buf) { + leaf_buf_name = ggml_backend_buffer_name(leaf_buf); + } + file << " - " << std::setw(3) << i << ": [ " << std::setw(5) << node->ne[0] << ", " << std::setw(5) << node->ne[1] << "] " << std::setw(8) << ggml_op_name(node->op) << " " - << std::setw(16) << ggml_get_name(node) << "\n"; + << std::setw(16) << ggml_get_name(node) + << std::setw(20) << leaf_buf_name << "\n"; } // clang-format on file << "========================================\n"; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 75b27c8fa8..085ae1ece4 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -2,6 +2,9 @@ #include "ggml-impl.h" +#include +#include + ov::Core & ov_singleton_core() { static ov::Core core; return core; @@ -22,6 +25,31 @@ void ggml_openvino_device_config::init() { device_name = "CPU"; } is_npu = (device_name == "NPU"); + + auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); + if (device_name == "NPU") { + compile_config = { + {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, + {"NPU_USE_NPUW", "YES" }, + {"NPUW_DEVICES", "NPU" }, + {"NPUW_FOLD", "YES" }, + {"NPUW_WEIGHTS_BANK", "shared"}, + {"NPUW_FUNCALL_FOR_ALL", "YES" }, + {"NPUW_FUNCALL_ASYNC", "YES" }, + {"NPUW_DQ", "YES" }, + {"NPUW_DQ_FULL", "NO" }, + }; + if (cache_dir) { + compile_config["NPUW_CACHE_DIR"] = cache_dir; + } + } else if (cache_dir) { + ov_singleton_core().set_property(ov::cache_dir(cache_dir)); + } + + if (device_name != "CPU") { + remote_context = ov_singleton_core().get_default_context(device_name); + } + initialized = true; } @@ -46,6 +74,16 @@ bool ggml_openvino_is_npu() { return ggml_openvino_get_device_config().is_npu; } +// Get the remote context for the current device (returns empty optional for CPU) +std::optional<ov::RemoteContext> ggml_openvino_get_remote_context() { + return ggml_openvino_get_device_config().remote_context; +} + +// Get the compile config for the current device +const ov::AnyMap & ggml_openvino_get_compile_config() { + return ggml_openvino_get_device_config().compile_config; +} + // Get requantization type for a tensor type (returns nullopt if no requant needed) std::optional<ExtraQuantType> ggml_openvino_get_requant_type(ggml_type type) { if (!ggml_openvino_is_npu()) { @@ -175,3 +213,50 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten return layout; } + +ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor) { + ov::Shape shape; + for (int i = GGML_MAX_DIMS - 1; i >= 0; --i) { + shape.push_back(static_cast<size_t>(tensor->ne[i])); + } + + ov::element::Type element_type; + switch (tensor->type) { + case GGML_TYPE_F32: + element_type = ov::element::f32; + break; + case GGML_TYPE_F16: + element_type = ov::element::f16; + break; + case GGML_TYPE_BF16: + element_type = ov::element::bf16; + break; + case GGML_TYPE_I32: + element_type = ov::element::i32; + break; + case GGML_TYPE_I64: + element_type = ov::element::i64; + break; + default: + GGML_LOG_ERROR("%s: unsupported tensor type for ov::Tensor: %s\n", __func__, ggml_type_name(tensor->type)); + return nullptr; + } + + const auto & device_name = ggml_openvino_get_device_name(); + auto remote_context = ggml_openvino_get_remote_context(); + + std::shared_ptr<ov::Tensor> ov_tensor; + if (device_name == "CPU") { + ov_tensor = std::make_shared<ov::Tensor>(element_type, shape, tensor->data); + } else if (device_name == "GPU") { + auto gpu_context = remote_context->as(); + auto
usm_tensor = gpu_context.create_tensor(element_type, shape, tensor->data); + ov_tensor = std::make_shared(std::move(usm_tensor)); + } else { + auto npu_context = remote_context->as(); + auto l0_tensor = npu_context.create_tensor(element_type, shape, tensor->data); + ov_tensor = std::make_shared(std::move(l0_tensor)); + } + + return new ggml_openvino_tensor_extra(ov_tensor); +} diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index 7e0138388f..fdd8312dff 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,12 @@ enum class ExtraQuantType { F16, Q4_0_C, Q8_1_C, Q4_0_128, Q8_0_C, Q8_0_32 }; ov::Core & ov_singleton_core(); +// Get the remote context for the current device (returns empty optional for CPU) +std::optional ggml_openvino_get_remote_context(); + +// Get the compile config for the current device +const ov::AnyMap & ggml_openvino_get_compile_config(); + // ===================================================== // Global Device Configuration (singleton) // ===================================================== @@ -24,6 +31,8 @@ struct ggml_openvino_device_config { std::string device_name = "CPU"; bool is_npu = false; bool initialized = false; + std::optional remote_context; + ov::AnyMap compile_config; void init(); }; @@ -112,3 +121,5 @@ struct ggml_openvino_extracted_layout { // Calculate the buffer layout for extracted quantized data ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_tensor * tensor); + +ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index e20ae71e40..c5c25fb6c1 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -56,7 +56,7 @@ struct ggml_backend_openvino_buffer_context { std::shared_ptr ov_tensor; // Track all extras for cleanup - std::vector tensor_extras; + std::map tensor_extras; ggml_backend_openvino_buffer_context(int device, size_t size) : device(device), @@ -103,8 +103,8 @@ struct ggml_backend_openvino_buffer_context { ~ggml_backend_openvino_buffer_context() { // Clean up all tensor extras - for (auto * extra : tensor_extras) { - delete extra; + for (auto & pair : tensor_extras) { + delete pair.second; } tensor_extras.clear(); if (data && ggml_openvino_get_device_name() == "CPU") { @@ -144,9 +144,20 @@ static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_bu return GGML_STATUS_SUCCESS; } - // For non-view tensors, tensor->extra will be set in set_tensor - // when the actual weight data is loaded - GGML_UNUSED(buffer); + ggml_backend_openvino_buffer_context * ctx = (ggml_backend_openvino_buffer_context *) buffer->context; + + if (tensor->data != nullptr) { + ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor); + if (extra != nullptr) { + auto it = ctx->tensor_extras.find(tensor); + if (it != ctx->tensor_extras.end()) { + delete it->second; + } + ctx->tensor_extras[tensor] = extra; + tensor->extra = extra; + } + } + return GGML_STATUS_SUCCESS; } @@ -194,7 +205,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer layout.requant_type.value() == ExtraQuantType::F16) { // F16 requant case - use weight_extra auto * extra = new ggml_openvino_weight_extra(constant); - 
ctx->tensor_extras.push_back(extra); + ctx->tensor_extras[tensor] = extra; tensor->extra = extra; GGML_LOG_DEBUG("%s: requantized %s to F16\n", __func__, tensor->name); } else { @@ -211,7 +222,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer auto * extra = new ggml_openvino_quantized_weight_extra(std::move(weights), std::move(scales), std::move(biases), constant); - ctx->tensor_extras.push_back(extra); + ctx->tensor_extras[tensor] = extra; tensor->extra = extra; if (layout.is_requant) { @@ -239,7 +250,7 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer // Store in tensor->extra ggml_openvino_weight_extra * extra = new ggml_openvino_weight_extra(constant); - ctx->tensor_extras.push_back(extra); + ctx->tensor_extras[tensor] = extra; tensor->extra = extra; GGML_LOG_DEBUG("%s: created shared-memory constant for %s\n", __func__, tensor->name); @@ -251,6 +262,19 @@ static void ggml_backend_openvino_buffer_set_tensor(ggml_backend_buffer_t buffer } else { // Non-weight tensor (KV cache, activations, etc.) - just copy data memcpy((char *) tensor->data + offset, data, size); + + ggml_openvino_tensor_extra * extra = ggml_openvino_create_tensor_extra(tensor); + if (extra == nullptr) { + GGML_LOG_ERROR("%s: failed to create tensor extra for %s\n", __func__, tensor->name); + return; + } + + auto it = ctx->tensor_extras.find(tensor); + if (it != ctx->tensor_extras.end()) { + delete it->second; + } + ctx->tensor_extras[tensor] = extra; + tensor->extra = extra; } } @@ -393,11 +417,67 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(in return &buffer_types[device]; } -// Check if a buffer is an OpenVINO buffer +// ===================================================== +// OpenVINO Host Buffer Implementation +// ===================================================== + +static const char * ggml_backend_openvino_host_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + ggml_backend_openvino_buffer_type_context * ctx = (ggml_backend_openvino_buffer_type_context *) buft->context; + static std::string name; + name = ctx->name + "_HOST"; + return name.c_str(); +} + +static const ggml_backend_buffer_type_i ggml_backend_openvino_host_buffer_type_interface = { + /* .get_name = */ ggml_backend_openvino_host_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_openvino_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_openvino_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_openvino_buffer_type_get_max_size, + /* .get_alloc_size = */ ggml_backend_openvino_buffer_type_get_alloc_size, + /* .is_host = */ ggml_backend_openvino_buffer_type_is_host, +}; + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device) { + GGML_ASSERT(device >= 0 && device < ggml_backend_openvino_get_device_count()); + + static std::mutex mutex; + std::lock_guard lock(mutex); + + static std::vector buffer_types; + static std::vector buffer_type_contexts; + + if (buffer_types.empty()) { + int device_count = ggml_backend_openvino_get_device_count(); + buffer_types.resize(device_count); + buffer_type_contexts.resize(device_count); + + for (int i = 0; i < device_count; i++) { + buffer_type_contexts[i].device = i; + buffer_type_contexts[i].name = std::string(GGML_OPENVINO_NAME) + std::to_string(i); + + buffer_types[i] = ggml_backend_buffer_type{ + /* .iface = */ ggml_backend_openvino_host_buffer_type_interface, + /* .device = */ 
ggml_backend_reg_dev_get(ggml_backend_openvino_reg(), i), + /* .context = */ &buffer_type_contexts[i], + }; + } + } + + return &buffer_types[device]; +} + bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer) { return buffer->iface.free_buffer == ggml_backend_openvino_buffer_free_buffer; } +bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name; +} + +bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_openvino_host_buffer_type_get_name; +} + // ===================================================== // OpenVINO Backend Context and Interface // ===================================================== @@ -552,6 +632,11 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_buffer_type(g return ggml_backend_openvino_buffer_type(ctx->device); } +static ggml_backend_buffer_type_t ggml_backend_openvino_device_get_host_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_openvino_device_context * ctx = (ggml_backend_openvino_device_context *) dev->context; + return ggml_backend_openvino_host_buffer_type(ctx->device); +} + static bool is_op_unsupported_case(const ggml_tensor * op) { switch (op->op) { case GGML_OP_SOFT_MAX: { @@ -731,7 +816,8 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { // Support our own buffer type and any host buffer (for mmap'd files, etc.) - return buft->iface.get_name == ggml_backend_openvino_buffer_type_get_name || ggml_backend_buft_is_host(buft); + return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_host(buft); + // return ggml_backend_buft_is_openvino(buft) || ggml_backend_buft_is_openvino_host(buft); GGML_UNUSED(dev); } @@ -743,7 +829,8 @@ static const struct ggml_backend_device_i ggml_backend_openvino_device_interface /* .get_props = */ ggml_backend_openvino_device_get_props, /* .init_backend = */ ggml_backend_openvino_device_init, /* .get_buffer_type = */ ggml_backend_openvino_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, + // /* .get_host_buffer_type = */ NULL, + /* .get_host_buffer_type = */ ggml_backend_openvino_device_get_host_buffer_type, /* .buffer_from_host_ptr = */ NULL, /* .supports_op = */ ggml_backend_openvino_device_supports_op, /* .supports_buft = */ ggml_backend_openvino_device_supports_buft, diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 6d56af9318..89cf51f880 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -37,11 +37,9 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" -static ov::Core core; - enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { - std::string filename = "cgraph.txt"; + std::string filename = "cgraph_ov.txt"; GgmlOvDecoder::dump_cgraph(cgraph, filename); } @@ -52,8 +50,9 @@ enum ggml_status ov_graph_compute(ggml_cgraph * cgraph) { } enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::string & device) { + auto & core = ov_singleton_core(); + const auto & config = ggml_openvino_get_compile_config(); static auto is_static = false; - static auto config = get_ov_compile_config(device); // if (is_naive(cgraph)) { // return naive_compute(cgraph, core, device, config); @@ -124,7 +123,13 @@ enum 
ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin ov::serialize(model, timestamped_filename); } - auto compiled_model = core.compile_model(model, device, config); + ov::CompiledModel compiled_model; + auto remote_context = ggml_openvino_get_remote_context(); + if (remote_context.has_value()) { + compiled_model = core.compile_model(model, remote_context.value(), config); + } else { + compiled_model = core.compile_model(model, device, config); + } compile_end_time = ggml_time_us(); infer_request = std::make_shared(compiled_model.create_infer_request()); infer_request_cache[key] = infer_request; @@ -173,18 +178,20 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, const std::strin if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("\nGGML OpenVINO Backend: \n"); - GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000); if (!cache_hit) { - GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); - GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); + GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); } - GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); + GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); } return GGML_STATUS_SUCCESS; } enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { + auto & core = ov_singleton_core(); + auto get_prefill_chunk_size = [] { const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE"); if (chunk_size_str && atoi(chunk_size_str) > 0) { @@ -196,7 +203,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { static std::string device = "NPU"; static auto is_static = true; static auto prefill_chunk_size = get_prefill_chunk_size(); - static auto config = get_ov_compile_config(device); + const auto & config = ggml_openvino_get_compile_config(); if (is_naive(cgraph)) { return naive_compute(cgraph, core, device, config); @@ -281,8 +288,16 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { ov::serialize(model_decode, timestamped_filename); } - auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device)); - auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device)); + ov::CompiledModel compiled_model_prefill; + ov::CompiledModel compiled_model_decode; + auto remote_context = ggml_openvino_get_remote_context(); + if (remote_context.has_value()) { + compiled_model_prefill = core.compile_model(model_prefill, remote_context.value(), config); + compiled_model_decode = core.compile_model(model_decode, remote_context.value(), config); + } else { + compiled_model_prefill = core.compile_model(model_prefill, device, config); + compiled_model_decode = core.compile_model(model_decode, device, config); + } infer_request_cache_prefill[key] = std::make_shared(compiled_model_prefill.create_infer_request()); @@ -369,41 +384,17 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph) { if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("\nGGML OpenVINO Backend: \n"); - GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", 
(decoder_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000); if (!cache_hit) { - GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); - GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); + GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); } - GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); + GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); } return GGML_STATUS_SUCCESS; } -ov::AnyMap get_ov_compile_config(const std::string & device) { - ov::AnyMap config; - auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); - if (device == "NPU") { - config = { - {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, - {"NPU_USE_NPUW", "YES" }, - {"NPUW_DEVICES", "NPU" }, - {"NPUW_FOLD", "YES" }, - {"NPUW_WEIGHTS_BANK", "shared"}, - {"NPUW_FUNCALL_FOR_ALL", "YES" }, - {"NPUW_FUNCALL_ASYNC", "YES" }, - {"NPUW_DQ", "YES" }, - {"NPUW_DQ_FULL", "NO" }, - }; - if (cache_dir) { - config["NPUW_CACHE_DIR"] = cache_dir; - } - } else if (cache_dir) { - core.set_property(ov::cache_dir(cache_dir)); - } - return config; -} - bool is_naive(ggml_cgraph * cgraph) { constexpr int naive_graph_size_threshold = 20; return cgraph->n_nodes < naive_graph_size_threshold; @@ -428,7 +419,14 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, if (getenv("GGML_OPENVINO_DUMP_IR")) { ov::serialize(model, "IR_naive.xml"); } - auto infer_request = core.compile_model(model, device, config).create_infer_request(); + + ov::InferRequest infer_request; + auto remote_context = ggml_openvino_get_remote_context(); + if (remote_context.has_value()) { + infer_request = core.compile_model(model, remote_context.value(), config).create_infer_request(); + } else { + infer_request = core.compile_model(model, device, config).create_infer_request(); + } auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { @@ -451,6 +449,18 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, namespace { ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name) { const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); + + if (ggml_tensor->extra != nullptr) { + // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str()); + auto * extra_base = static_cast(ggml_tensor->extra); + if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) { + throw std::runtime_error("ggml tensor extra is not of type TENSOR for input: " + name); + } + auto * tensor_extra = static_cast(extra_base); + return *tensor_extra->tensor; + } + + // GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str()); auto * input_data = ggml_tensor->data; ov::Shape input_shape; if (ggml_tensor->op == GGML_OP_VIEW) { diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 81fb2c2035..44ca2db00f 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -71,8 +71,6 @@ bool get_is_prefill(const ggml_tensor * inp_pos); graph_key compute_graph_key(struct ggml_cgraph * cgraph); -ov::AnyMap get_ov_compile_config(const std::string & device); - ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const 
std::string & param_name); ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string & param_name);
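
Note on the dims-to-shape mapping introduced in ggml_openvino_create_tensor_extra above: GGML orders dimensions with ne[0] as the innermost (fastest-varying) axis, while ov::Shape lists dimensions outermost-first, which is why the patch walks ne[] from GGML_MAX_DIMS - 1 down to 0. A minimal standalone sketch of just that mapping follows; the helper name and the example sizes are illustrative, not part of the patch.

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for ov::Shape: dimension sizes, outermost dimension first.
using shape_t = std::vector<size_t>;

// Mirrors the reverse-order loop in ggml_openvino_create_tensor_extra:
// ne[3] (outermost) is pushed first, ne[0] (innermost) last.
static shape_t ggml_ne_to_ov_shape(const int64_t (&ne)[4]) {
    shape_t shape;
    for (int i = 4 - 1; i >= 0; --i) {
        shape.push_back(static_cast<size_t>(ne[i]));
    }
    return shape;
}

int main() {
    // Hypothetical activation tensor with GGML dims ne = {4096, 32, 1, 1} ...
    const int64_t ne[4] = {4096, 32, 1, 1};
    const shape_t shape = ggml_ne_to_ov_shape(ne);
    // ... which becomes the OpenVINO-style shape {1, 1, 32, 4096}.
    for (size_t d : shape) {
        printf("%zu ", d);
    }
    printf("\n");
    return 0;
}

On CPU the resulting ov::Tensor simply wraps tensor->data, while on GPU and NPU the patch goes through the device's remote context so the existing host allocation can be exposed as a device-visible tensor rather than copied; convert_ggml_input_to_ov then picks up that pre-built tensor from tensor->extra instead of re-wrapping the raw data pointer on every inference.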