diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index efb8ff12bc..5221a1ff8b 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -53,8 +53,7 @@ static ggml_backend_buffer_type_t ggml_backend_openvino_get_default_buffer_type( static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph *cgraph) { - int end_node = cgraph->n_nodes - 1; - openvino_frontend_compute(backend, cgraph, 0, end_node); + openvino_frontend_compute(backend, cgraph); ov::Core core; diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h index e287f31e23..c0641e2662 100644 --- a/ggml/src/ggml-openvino/decoder.h +++ b/ggml/src/ggml-openvino/decoder.h @@ -1,19 +1,14 @@ #pragma once +#include + #include "openvino/core/node.hpp" #include "openvino/frontend/decoder.hpp" -#include "openvino/op/parameter.hpp" namespace ov { namespace frontend { namespace ggml { -// 定义 tensor_info 结构体 -struct tensor_info { - - std::vector shape; - std::vector stride; -}; // TODO: Directly include from openvino class GgmlDecoder : public DecoderBase { public: @@ -36,10 +31,6 @@ public: virtual std::vector get_input_names() const = 0; - virtual std::string& get_op_node_name(const std::string& name, const int index = -1) = 0; - - // virtual const struct tensor_info get_node_op_info(const std::string& name) const = 0; - virtual PartialShape get_output_shape(const std::string& name) const = 0; virtual std::vector get_output_stride(const std::string& name) const = 0; @@ -64,14 +55,11 @@ public: virtual void visit_subgraph(std::function)> node_visitor) const = 0; - // virtual const std::vector& outputs() const = 0; - - // virtual size_t output(size_t index) const = 0; - virtual bool check_if_continuous() const = 0; - virtual const std::vector>& get_params() const = 0; - + virtual const std::unordered_map>& get_model_inputs() const = 0; + virtual const std::unordered_map>& get_model_weights() const = 0; + virtual const std::vector& get_model_output_names() const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 2dbde9ea5a..05947ff579 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1,11 +1,62 @@ #include "ggml-decoder.h" -#include -#include -#include -#include -#include -void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml-backend-impl.h" +#include "ggml-backend.h" + +GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph) + : m_cgraph(cgraph), + m_node(node), + m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { + if (m_node) { + set_input_output(m_node); + } else { + // std::map> address_map; + // for (int node_n = start_index; node_n <= end_index; node_n++) { + // auto node = cgraph->nodes[node_n]; + // if (node->data) { + // auto it = address_map.find(node->data); + // if (it == address_map.end()) { + // address_map[node->data] = std::vector(); + // } + // address_map[node->data].push_back(node->name); + // } + // } + // for (const auto& pair : address_map) { + // std::cout << "Address: " << pair.first << " -> "; + // for (const auto& name : pair.second) { + // std::cout << name << " ;"; + // } + // std::cout << std::endl; + // } + + for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { + auto* cur_node = m_cgraph->nodes[node_n]; + m_nodes.push_back(cur_node); + // Init model input and output + set_input_output(cur_node); + } + if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + dump_cgraph(m_cgraph); + } + } +} + +// Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph; +// 2. constructing a decoder for a node. +void GgmlOvDecoder::set_input_output(ggml_tensor* node) { std::string node_name; if (node->op == GGML_OP_CPY) { // CPY updates the input tensor in place. For later ov op that uses the @@ -17,51 +68,130 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapname); } - std::string src0_name = std::string(node->src[0]->name); - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - if (node->op == GGML_OP_CPY && node->view_src) { - m_output_names.push_back(node->view_src->name); - } else { - m_output_names.push_back(node_name); + m_output_names.push_back(node_name); + m_outputs[node_name] = node; + + for (int i = 0; i < GGML_MAX_SRC; i++) { + auto* src = node->src[i]; + if (src == nullptr) { + continue; + } + std::string src_name = std::string(src->name); + m_input_names.push_back(src_name); + m_inputs[src_name] = src; + m_op_node_name.emplace_back(src_name, ggml_op_name(node->op)); + + // If called for the whole graph, create constant nodes for weights and param nodes for inputs + if (!m_node && !src->view_src) { + ggml_backend_buffer* buffer = src->buffer; + + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT"); + auto& weights_map = weight_as_input ? m_model_inputs : m_model_weights; + if (weights_map.find(src_name) != weights_map.end()) { + continue; + } + + std::shared_ptr weight_node = + weight_as_input + ? std::make_shared(get_ov_type(src), ov::Shape{get_shape(src)}) + : create_weight_node(src); + weight_node->set_friendly_name(src_name); + weights_map[src_name] = weight_node; + + } else if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) { + // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches + if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { + assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0); + } + if (m_model_inputs.find(src_name) != m_model_inputs.end()) { + continue; + } + auto param_node = std::make_shared(get_ov_type(src), ov::Shape{get_shape(src)}); + param_node->set_friendly_name(src_name); + m_model_inputs[src_name] = param_node; + } + } } - if (node->src[1]) { - std::string src1_name = std::string(node->src[1]->name); - inputs[src1_name] = node->src[1]; - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - } - if (node->src[2]) { - std::string src2_name = std::string(node->src[2]->name); - inputs[src2_name] = node->src[2]; - m_input_names.push_back(src2_name); - m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); + if (!m_node) { + // Workaround: the final tensor "result_output" does not have GGML_TENSOR_FLAG_OUTPUT flag set in cgraph + if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || node->flags & GGML_TENSOR_FLAG_OUTPUT || + std::string(node->name).find("result") == 0) { + auto name = node->view_src ? std::string(node->view_src->name) : std::string(node->name); + if (node->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) { + assert(name.find("cache_k") == 0 || name.find("cache_v") == 0); + } + auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name); + if (it == m_model_output_names.end()) { + m_model_output_names.push_back(name); + } + } } - switch (node->op) { - case GGML_OP_CONT: { - // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE - m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src); - break; - } - case GGML_OP_CPY: { - m_continuous = ggml_is_contiguous(node); - break; - } - case GGML_OP_MUL_MAT: { - m_continuous = node->src[0]->view_src == nullptr; - break; - } - default: - break; + if (m_node) { + switch (node->op) { + case GGML_OP_CONT: { + // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE + m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src); + break; + } + case GGML_OP_CPY: { + m_continuous = ggml_is_contiguous(node); + break; + } + case GGML_OP_MUL_MAT: { + m_continuous = node->src[0]->view_src == nullptr; + break; + } + default: + break; + } } } -void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { - std::ofstream file("01_nodes.txt"); +std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) { + std::shared_ptr weight_node; + auto node_type = get_ov_type(tensor); + auto node_shape = get_shape(tensor); + auto ne_total = ggml_nelements(tensor); + switch (tensor->type) { + case GGML_TYPE_I32: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data(ptr, ptr + ne_total); + weight_node = std::make_shared(node_type, node_shape, data); + break; + } + case GGML_TYPE_I64: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data(ptr, ptr + ne_total); + weight_node = std::make_shared(node_type, node_shape, data); + break; + } + case GGML_TYPE_F32: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data(ptr, ptr + ne_total); + weight_node = std::make_shared(node_type, node_shape, data); + break; + } + case GGML_TYPE_F16: { + const auto* ptr = reinterpret_cast(tensor->data); + std::vector data_f16; + data_f16.reserve(ne_total); + for (int i = 0; i < ne_total; ++i) { + data_f16.push_back(ov::float16::from_bits(ptr[i])); + } + weight_node = std::make_shared(node_type, node_shape, data_f16); + break; + } + default: + throw std::invalid_argument("Unsupported tensor type"); + } + return weight_node; +} + +void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) { + std::ofstream file("cgraph.txt"); if (!file.is_open()) { std::cerr << "Failed to open file" << std::endl; return; @@ -160,88 +290,53 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { file.close(); } - -GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) - :m_cgraph(cgraph), - m_node(node), - m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { - m_inputs.clear(); - m_outputs.clear(); - m_input_names.clear(); - m_output_names.clear(); - m_params.clear(); - m_op_node_name.clear(); - m_decoders.clear(); - - if (m_node) { - set_input_output(m_node, m_inputs, m_outputs); - } else { - // for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { - for (int node_n = start_index; node_n <= end_index; node_n++) { - auto cur_node = m_cgraph->nodes[node_n]; - m_nodes.push_back(cur_node); - // Init model input and output - set_input_output(cur_node, m_inputs, m_outputs); - } - if (getenv("GGML_OPENVINO_DEBUG")) { - ggml_graph_op_print(m_cgraph); - } +std::vector GgmlOvDecoder::get_shape(const ggml_tensor* tensor) { + std::vector shape; + for (int i = GGML_MAX_DIMS - 2; i >= 0; --i) { + shape.push_back(static_cast(tensor->ne[i])); } + return shape; +} + +std::vector GgmlOvDecoder::get_stride(const ggml_tensor* tensor) { + std::vector stride; + for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { + stride.push_back(static_cast(tensor->nb[i])); + } + return stride; +} + +ov::element::Type GgmlOvDecoder::get_ov_type(const ggml_tensor* tensor) { + ov::element::Type type = ov::element::dynamic; + switch (tensor->type) { + case GGML_TYPE_F32: + type = ov::element::f32; + break; + case GGML_TYPE_F16: + type = ov::element::f16; + break; + case GGML_TYPE_I64: + type = ov::element::i64; + break; + case GGML_TYPE_I32: + type = ov::element::i32; + break; + default: + break; + } + return type; } ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { - ov::PartialShape input_shape; - // Use input_node->ne - ggml_tensor * node = m_inputs.at(name); - std::vector shape; - - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - if (node->ne[i] == 0) { - return input_shape; - } - shape.push_back(static_cast(node->ne[i])); - } - input_shape = ov::PartialShape(shape); - return input_shape; + return ov::PartialShape(get_shape(m_inputs.at(name))); } std::vector GgmlOvDecoder::get_input_stride(const std::string& name) const { - std::vector stride; - ggml_tensor * node = m_inputs.at(name); - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - stride.push_back(static_cast(node->nb[i])); - } - return stride; -} - -std::vector GgmlOvDecoder::get_output_stride(const std::string& name) const { - std::vector stride; - ggml_tensor * node = m_outputs.at(name); - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - stride.push_back(static_cast(node->nb[i])); - } - return stride; + return get_stride(m_inputs.at(name)); } ov::element::Type GgmlOvDecoder::get_input_type(const std::string& name) const { - ov::element::Type type = ov::element::dynamic; - switch (m_inputs.at(name)->type) { - case GGML_TYPE_F32: - type = ov::element::f32; - break; - case GGML_TYPE_F16: - type = ov::element::f16; - break; - case GGML_TYPE_I64: - type = ov::element::i64; - break; - case GGML_TYPE_I32: - type = ov::element::i32; - break; - default: - break; - } - return type; + return get_ov_type(m_inputs.at(name)); } size_t GgmlOvDecoder::get_input_size() const { @@ -257,69 +352,16 @@ std::vector GgmlOvDecoder::get_input_names() const { return m_input_names; } -std::string& GgmlOvDecoder::get_op_node_name(const std::string& key_name, const int index) { - if (index == -1) { - for (size_t i = 0; i < m_op_node_name.size(); ++i) { - if (m_op_node_name[i].first == key_name) { - return m_op_node_name[i].second; - } - } - } else { - return m_op_node_name[index].second; - } - - static std::string empty_string = ""; - return empty_string; // empty string -} - -const std::vector>& GgmlOvDecoder::get_params() const { - return m_params; +std::vector GgmlOvDecoder::get_output_stride(const std::string& name) const { + return get_stride(m_outputs.at(name)); } ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const { - ov::PartialShape output_shape; - ggml_tensor * node = m_outputs.at(name); - std::vector shape; - - for (int i = GGML_MAX_DIMS - 2; i >= 0 ; --i) { - if (node->ne[i] == 0 ) { - // empty if any dimension has no elements - return output_shape; - } - shape.push_back(static_cast(node->ne[i])); - } - output_shape = ov::PartialShape(shape); - return output_shape; + return ov::PartialShape(get_shape(m_outputs.at(name))); } ov::element::Type GgmlOvDecoder::get_output_type(const std::string& name) const { - // TODO: Change to Output - ov::element::Type type = ov::element::dynamic; - switch (m_outputs.at(name)->type) { - case GGML_TYPE_F32: - type = ov::element::f32; - break; - case GGML_TYPE_F16: - type = ov::element::f16; - break; - case GGML_TYPE_I64: - type = ov::element::i64; - break; - case GGML_TYPE_I32: - type = ov::element::i32; - break; - default: - break; - } - return type; -} - -int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const{ - return m_inputs.at(name)->op_params; -} - -int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const{ - return m_outputs.at(name)->op_params; + return get_ov_type(m_outputs.at(name)); } std::string& GgmlOvDecoder::get_output_name(size_t index) const { @@ -335,10 +377,17 @@ const std::string& GgmlOvDecoder::get_op_name() const { return m_op_name; } +int32_t* GgmlOvDecoder::get_input_op_params(const std::string& name) const { + return m_inputs.at(name)->op_params; +} + +int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const { + return m_outputs.at(name)->op_params; +} + void GgmlOvDecoder::visit_subgraph(std::function)> node_visitor) const { for (const auto& node : m_nodes) { auto decoder = std::make_shared(node, m_cgraph); - // m_decoders.push_back(decoder); node_visitor(decoder); } } diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index eac045d158..2182ad624d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,14 +1,17 @@ #pragma once +#include +#include +#include + #include "decoder.h" #include "ggml.h" -#include "openvino/op/parameter.hpp" class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { public: using ov::frontend::ggml::GgmlDecoder::GgmlDecoder; - GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); + GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph); virtual ov::Any get_attribute(const std::string& name) const override { return nullptr; @@ -73,12 +76,23 @@ public: return m_continuous; } - std::string& get_op_node_name(const std::string& key_name, const int index) override; - - virtual const std::vector>& get_params() const override; + virtual const std::unordered_map>& get_model_inputs() const override { + return m_model_inputs; + } + virtual const std::unordered_map>& get_model_weights() const override { + return m_model_weights; + } + virtual const std::vector& get_model_output_names() const override { + return m_model_output_names; + } private: - void set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs); + void set_input_output(ggml_tensor* node); + static void dump_cgraph(const struct ggml_cgraph* cgraph); + static std::vector get_shape(const ggml_tensor* tensor); + static std::vector get_stride(const ggml_tensor* tensor); + static ov::element::Type get_ov_type(const ggml_tensor* tensor); + static std::shared_ptr create_weight_node(ggml_tensor* tensor); struct ggml_cgraph * m_cgraph; std::map m_inputs; @@ -86,12 +100,12 @@ private: std::map m_outputs; std::vector m_output_names; ggml_tensor* m_node; - std::vector m_nodes; - std::vector> m_decoders; + std::vector m_nodes; std::string m_op_name; mutable std::string m_name; bool m_continuous; - std::vector> m_params; std::vector> m_op_node_name; + std::unordered_map> m_model_inputs; + std::unordered_map> m_model_weights; + std::vector m_model_output_names; }; - diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index c32ad65842..7937d5793a 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,49 +1,22 @@ #include "utils.h" -#include "ggml-backend-impl.h" -#include "ggml-impl.h" -#include "ggml.h" + +#include +#include #include -#include #include +#include #include #include -using ov::frontend::ggml::GgmlDecoder; +#include "ggml-impl.h" +#include "ggml.h" -std::shared_ptr get_ggml_decoder(struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) { - return std::make_shared(nullptr, cgraph, start_index, end_index); -} - -std::vector> get_ggml_graph_input_tensors(std::shared_ptr ggml_decoder) { - std::vector> input_tensors; - auto input_names = ggml_decoder->get_input_names(); - size_t op_iter = 0; - for (size_t inp = 0; inp < input_names.size(); ++inp) { - auto name = input_names[inp]; - std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++); - // auto node_op_name = ggml_decoder->get_node_op_name(name); - auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); - #endif - ov::Tensor input_tensor; - ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - - std::vector input_stride = ggml_decoder->get_input_stride(name); - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - - // input_tensors[name] = input_tensor; - input_tensors.emplace_back(name, input_tensor); - } - // std::cout << "input_names.size(): " << input_names.size() << std::endl; - return input_tensors; +std::shared_ptr get_ggml_decoder(struct ggml_cgraph* cgraph) { + return std::make_shared(nullptr, cgraph); } ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) { - auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Subgraph input %s: %g\n", name.c_str(), *(double*)(input_data)); - #endif + auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data; ov::Tensor input_tensor; ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); std::vector input_stride = ggml_decoder->get_input_stride(name); @@ -53,19 +26,16 @@ ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decod std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { std::map output_tensors; - auto output_names = ggml_decoder->get_output_names(); + auto output_names = ggml_decoder->get_model_output_names(); for (size_t inp = 0; inp < output_names.size(); ++inp) { auto name = output_names[inp]; - auto output_data = ggml_decoder->get_output_ggml_tensor(name)->data; - #ifdef GGML_OPENVINO_DEBUG - printf("Output %d: %g\n", inp, *(double*)(output_data)); - #endif + const auto* tensor = ggml_decoder->get_output_ggml_tensor(name); + auto* output_data = tensor->view_src ? tensor->view_src->data : tensor->data; output_tensors[name] = output_data; } return output_tensors; } - static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { ov::frontend::FrontEnd::Ptr front_end = nullptr; auto fem = ov::frontend::FrontEndManager(); @@ -78,10 +48,9 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { return front_end; } -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, - struct ggml_cgraph *cgraph, - const int32_t start_index, - const int32_t end_index) { +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) { + auto start_time = ggml_time_us(); + static ov::Core core; // auto devices = core.get_available_devices(); @@ -89,65 +58,102 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); return GGML_STATUS_FAILED; - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("GGML FrontEnd is initialized \n"); - #endif } - auto ggml_decoder = get_ggml_decoder(cgraph, start_index, end_index); + auto ggml_decoder = get_ggml_decoder(cgraph); std::shared_ptr graph_decoder = ggml_decoder; - // Load GraphIterator -> InputModel ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder); if (!input_model) { GGML_LOG_ERROR("Input Model is not loaded \n"); return GGML_STATUS_FAILED; - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Input Model loaded \n"); - #endif } - // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); + auto conversion_end_time = ggml_time_us(); - if (getenv("OPENVINO_DUMP_GRAPH")) { - char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), - "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long)ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); } if (!model) { GGML_LOG_ERROR("Model is not converted \n"); - } else { - #ifdef GGML_OPENVINO_DEBUG - GGML_LOG_INFO("Model converted \n"); - #endif } - ov::CompiledModel compiled_model = core.compile_model(model); + ov::CompiledModel compiled_model = + core.compile_model(model, "CPU", ov::device::properties("CPU", ov::cache_dir("/tmp/ov_cache"))); + auto compile_end_time = ggml_time_us(); + ov::InferRequest infer_request = compiled_model.create_infer_request(); + auto infer_request_start_time = ggml_time_us(); auto input_names = ggml_decoder->get_input_names(); - auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder); auto ov_params = model->get_parameters(); for (size_t i = 0; i < ov_params.size(); i++) { auto param_name = ov_params[i]->get_friendly_name(); - infer_request.set_input_tensor(i, get_ggml_graph_input_tensor(ggml_decoder, param_name)); + auto input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name); + + if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + std::cout << "Input name: " << param_name << ", Input shape: " << input_tensor.get_shape() + << ", Address: " << input_tensor.data() << std::endl; + switch (input_tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(float*)(input_tensor.data()) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(uint16_t*)(input_tensor.data())) << std::endl; + break; + case ov::element::i32: + std::cout << *(int32_t*)(input_tensor.data()) << std::endl; + break; + case ov::element::i64: + std::cout << *(int64_t*)(input_tensor.data()) << std::endl; + break; + default: + break; + } + } + infer_request.set_input_tensor(i, input_tensor); } + auto input_end_time = ggml_time_us(); infer_request.infer(); + auto infer_end_time = ggml_time_us(); - auto output_names = ggml_decoder->get_output_names(); + auto output_names = ggml_decoder->get_model_output_names(); auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { auto output_tensor = infer_request.get_output_tensor(i); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); - #ifdef GGML_OPENVINO_DEBUG - printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); - #endif + + if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + std::cout << "Output name: " << output_names[i] << ", Output shape: " << output_tensor.get_shape() + << ", Address: " << output_tensors[output_names[i]] << std::endl; + switch (output_tensor.get_element_type()) { + case ov::element::f32: + std::cout << *(float*)(output_tensors[output_names[i]]) << std::endl; + break; + case ov::element::f16: + std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensors[output_names[i]])) << std::endl; + break; + default: + break; + } + } + } + auto end_time = ggml_time_us(); + + if (getenv("GGML_OPENVINO_PROFILING")) { + GGML_LOG_INFO("GGML OpenVINO Backend: \n"); + GGML_LOG_INFO(" - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + GGML_LOG_INFO(" - Graph InferRequest created Time: %ld ms \n", + (infer_request_start_time - compile_end_time) / 1000); + GGML_LOG_INFO(" - Graph Input Time: %ld ms \n", (input_end_time - infer_request_start_time) / 1000); + GGML_LOG_INFO(" - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000); + GGML_LOG_INFO(" - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000); } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 0f5617ab4b..b4174c9f21 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,4 @@ #include "ggml-decoder.h" #include "ggml-backend-impl.h" -enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index=0, const int32_t end_index=0); +enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);