diff --git a/ggml/src/ggml-openvino/decoder.h b/ggml/src/ggml-openvino/decoder.h
index b0775d43aa..790ed2e88d 100644
--- a/ggml/src/ggml-openvino/decoder.h
+++ b/ggml/src/ggml-openvino/decoder.h
@@ -58,6 +58,7 @@ public:
     virtual bool check_if_continuous() const = 0;
 
     virtual const std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>>& get_model_inputs() const = 0;
+    virtual const std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>>& get_model_extra_inputs() const = 0;
     virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
     virtual const std::vector<std::string>& get_model_output_names() const = 0;
 };
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 44b46f2c63..372f880b1d 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -10,9 +10,11 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -35,6 +37,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
         printed = true;
     }
 
+    set_max_token_len();
     for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
         auto* cur_node = m_cgraph->nodes[node_n];
         m_nodes.push_back(cur_node);
@@ -42,6 +45,8 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
     }
 
     m_model_weights = model_weights;
+    add_extra_inputs();
+
     if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
         dump_cgraph(m_cgraph);
     }
@@ -102,7 +107,16 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
             if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
                 continue;
             }
-            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), ov::Shape{get_shape(src)});
+            ov::PartialShape input_shape;
+            if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
+                input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
+            } else if (std::string(src->name).find("KQ_mask") == 0) {
+                input_shape =
+                    ov::PartialShape{1, ov::Dimension(1, m_max_token_len), ov::Dimension(1, m_max_token_len)};
+            } else {
+                input_shape = ov::Shape{get_shape(src)};
+            }
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), input_shape);
             param_node->set_friendly_name(src_name);
             m_model_inputs[src_name] = param_node;
         }
@@ -146,6 +160,57 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
     }
 }
 
+void GgmlOvDecoder::set_max_token_len() {
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        auto* node = m_cgraph->nodes[i];
+        if (std::string(node->name) == "v-0") {
+            auto* cache_v = node->src[0];
+            m_max_token_len = cache_v->ne[0] / node->ne[1] / node->ne[2];
+            break;
+        }
+    }
+}
+
+void GgmlOvDecoder::add_extra_inputs() {
+    int64_t past_token_len;
+    int64_t attention_size;
+
+    for (const auto& node : m_nodes) {
+        if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
+            assert(std::string(node->view_src->name).find("cache_k") == 0);
+            int64_t head_size = node->src[0]->ne[0];
+            int64_t num_heads = node->src[0]->ne[1];
+            past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads);
+
+            std::string name = "past_token_len";
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{});
+            param_node->set_friendly_name(name);
+            m_model_extra_inputs[name] = param_node;
+
+            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{});
+            *tensor->data<int64_t>() = past_token_len;
+            m_model_extra_input_values[name] = tensor;
+            break;
+        }
+    }
+    for (const auto& node : m_nodes) {
+        if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
+            int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
+            attention_size = (total_token_len + 31) / 32 * 32;
+
+            std::string name = "attention_size";
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
+            param_node->set_friendly_name(name);
+            m_model_extra_inputs[name] = param_node;
+
+            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
+            *tensor->data<int64_t>() = attention_size;
+            m_model_extra_input_values[name] = tensor;
+            break;
+        }
+    }
+}
+
 std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor) {
     std::shared_ptr<ov::Node> weight_node;
     auto node_type = get_ov_type(tensor);
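Note on add_extra_inputs() above: past_token_len divides the byte offset of the cache_k view by the element stride and the per-token slice (head_size * num_heads) to recover how many tokens are already cached, and attention_size rounds the running total up to the next multiple of 32, so the mask shape only changes once every 32 tokens. A standalone sketch of that arithmetic with made-up numbers (not part of the patch):

    // Illustrative values only; the real ones come from the ggml tensors.
    #include <cstdint>
    #include <cstdio>

    int main() {
        int64_t offset_bytes = 819200; // node->src[1]->op_params[0]
        int64_t elem_bytes   = 2;      // node->src[1]->nb[0], f16 cache
        int64_t head_size    = 128;    // node->src[0]->ne[0]
        int64_t num_heads    = 32;     // node->src[0]->ne[1]
        // Tokens already in the KV cache: byte offset / bytes-per-token.
        int64_t past_token_len = offset_bytes / elem_bytes / head_size / num_heads;  // 100

        int64_t n_new = 7;             // node->src[1]->ne[0], tokens this step
        // Round the total up to a multiple of 32.
        int64_t attention_size = (n_new + past_token_len + 31) / 32 * 32;            // 128

        printf("past=%lld attention=%lld\n", (long long)past_token_len, (long long)attention_size);
        return 0;
    }

One caveat: past_token_len stays uninitialized if no contiguous GGML_OP_CPY into cache_k is found, so the patch assumes every decode graph contains one.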
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index c4f7612d76..22ff9d85f7 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include 
 #include 
 #include 
 #include 
@@ -79,6 +80,12 @@ public:
     virtual const std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>>& get_model_inputs() const override {
         return m_model_inputs;
     }
+    virtual const std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>>& get_model_extra_inputs() const override {
+        return m_model_extra_inputs;
+    }
+    virtual const std::map<std::string, std::shared_ptr<ov::Tensor>>& get_model_extra_input_values() const {
+        return m_model_extra_input_values;
+    }
     virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const override {
         return m_model_weights;
     }
@@ -88,12 +95,16 @@ public:
 private:
     void set_input_output(ggml_tensor* node, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
+    void add_extra_inputs();
     static void dump_cgraph(const struct ggml_cgraph* cgraph);
     static std::vector<size_t> get_shape(const ggml_tensor* tensor);
     static std::vector<size_t> get_stride(const ggml_tensor* tensor);
     static ov::element::Type get_ov_type(const ggml_tensor* tensor);
     static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
+    void set_max_token_len();
+    int64_t m_max_token_len;
+
     struct ggml_cgraph * m_cgraph;
     std::map<std::string, ggml_tensor*> m_inputs;
     std::vector<std::string> m_input_names;
@@ -106,6 +117,8 @@ private:
     bool m_continuous;
     std::vector<std::pair<std::string, std::string>> m_op_node_name;
     std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>> m_model_inputs;
+    std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>> m_model_extra_inputs;
+    std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
    std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
     std::vector<std::string> m_model_output_names;
 };
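Note: m_max_token_len declared above bounds the dynamic shapes built in set_input_output(); OpenVINO treats ov::Dimension(1, m_max_token_len) as a bounded dynamic dimension, so the model compiles once and then accepts any token count up to the bound. A minimal sketch of the idea (max_len stands in for m_max_token_len; values illustrative):

    #include <cstdint>
    #include <iostream>
    #include <memory>
    #include <openvino/core/partial_shape.hpp>
    #include <openvino/op/parameter.hpp>

    int main() {
        const int64_t max_len = 2048;  // hypothetical bound
        // The last axis may hold 1..max_len elements without recompiling.
        auto tokens = std::make_shared<ov::op::v0::Parameter>(
            ov::element::i32, ov::PartialShape{1, 1, ov::Dimension(1, max_len)});
        std::cout << tokens->get_output_partial_shape(0) << std::endl;  // [1,1,1..2048]
        return 0;
    }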
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 32fa7cf481..6166161c41 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -3,10 +3,14 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include "ggml-impl.h"
 #include "ggml.h"
@@ -63,61 +67,65 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         return GGML_STATUS_FAILED;
     }
 
+    using CachedItem = std::pair<std::shared_ptr<ov::Model>, ov::CompiledModel>;
+    static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache;
+
+    std::shared_ptr<ov::Model> model;
+    ov::CompiledModel compiled_model;
+    int64_t conversion_end_time;
+    int64_t compile_end_time;
+
     auto ggml_decoder = get_ggml_decoder(cgraph);
-    std::shared_ptr<ov::frontend::ggml::GgmlDecoder> graph_decoder = ggml_decoder;
-    ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder);
-    if (!input_model) {
-        GGML_LOG_ERROR("Input Model is not loaded \n");
-        return GGML_STATUS_FAILED;
+    auto it = compiled_cache.find(cgraph);
+    if (it != compiled_cache.end()) {
+        model = it->second.first;
+        conversion_end_time = ggml_time_us();
+
+        compiled_model = it->second.second;
+        compile_end_time = ggml_time_us();
+    } else {
+        std::shared_ptr<ov::frontend::ggml::GgmlDecoder> graph_decoder = ggml_decoder;
+        ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder);
+        if (!input_model) {
+            GGML_LOG_ERROR("Input Model is not loaded \n");
+            return GGML_STATUS_FAILED;
+        }
+
+        model = front_end->convert(input_model);
+        conversion_end_time = ggml_time_us();
+
+        if (getenv("GGML_OPENVINO_DUMP_IR")) {
+            char timestamped_filename[64];
+            auto timestamp = (long long)ggml_time_us();
+            snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
+            ov::serialize(model, timestamped_filename);
+        }
+
+        if (!model) {
+            GGML_LOG_ERROR("Model is not converted \n");
+        }
+        compiled_model = core.compile_model(model, "CPU");
+        compile_end_time = ggml_time_us();
+
+        compiled_cache[cgraph] = std::make_pair(model, compiled_model);
     }
 
-    std::shared_ptr<ov::Model> model = front_end->convert(input_model);
-    auto conversion_end_time = ggml_time_us();
-
-    if (getenv("GGML_OPENVINO_DUMP_IR")) {
-        char timestamped_filename[64];
-        auto timestamp = (long long)ggml_time_us();
-        snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
-        ov::serialize(model, timestamped_filename);
-    }
-
-    if (!model) {
-        GGML_LOG_ERROR("Model is not converted \n");
-    }
-
-    ov::CompiledModel compiled_model = core.compile_model(model, "CPU");
-    auto compile_end_time = ggml_time_us();
-
     ov::InferRequest infer_request = compiled_model.create_infer_request();
-    auto infer_request_start_time = ggml_time_us();
-
     auto input_names = ggml_decoder->get_input_names();
     auto ov_params = model->get_parameters();
     for (size_t i = 0; i < ov_params.size(); i++) {
         auto param_name = ov_params[i]->get_friendly_name();
-        auto input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
-
-        if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
-            std::cout << "Input name: " << param_name << ", Input shape: " << input_tensor.get_shape()
-                      << ", Address: " << input_tensor.data() << std::endl;
-            switch (input_tensor.get_element_type()) {
-            case ov::element::f32:
-                std::cout << *(float*)(input_tensor.data()) << std::endl;
-                break;
-            case ov::element::f16:
-                std::cout << ov::float16::from_bits(*(uint16_t*)(input_tensor.data())) << std::endl;
-                break;
-            case ov::element::i32:
-                std::cout << *(int32_t*)(input_tensor.data()) << std::endl;
-                break;
-            case ov::element::i64:
-                std::cout << *(int64_t*)(input_tensor.data()) << std::endl;
-                break;
-            default:
-                break;
-            }
+        ov::Tensor input_tensor;
+        if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
+            input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
+        } else {
+            input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
         }
         infer_request.set_input_tensor(i, input_tensor);
+
+        if (getenv("GGML_OPENVINO_DEBUG_INPUT")) {
+            print_input_tensor_info(param_name, input_tensor);
+        }
     }
     auto input_end_time = ggml_time_us();
@@ -131,20 +139,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size());
 
         if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            std::cout << "Output name: " << output_names[i] << ", Output shape: " << output_tensor.get_shape()
-                      << ", Address: " << output_tensors[output_names[i]] << std::endl;
-            switch (output_tensor.get_element_type()) {
-            case ov::element::f32:
-                std::cout << *(float*)(output_tensor.data()) << std::endl;
-                std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl;
-                break;
-            case ov::element::f16:
-                std::cout << ov::float16::from_bits(*(uint16_t*)(output_tensor.data())) << std::endl;
-                std::cout << checksum(output_tensor.data(), output_tensor.get_byte_size()) << std::endl;
-                break;
-            default:
-                break;
-            }
+            print_output_tensor_info(output_names[i], output_tensor, output_tensors);
         }
     }
     auto end_time = ggml_time_us();
@@ -153,9 +148,7 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         GGML_LOG_INFO("GGML OpenVINO Backend: \n");
         GGML_LOG_INFO("  - Graph conversion Time: %ld ms \n", (conversion_end_time - start_time) / 1000);
         GGML_LOG_INFO("  - Graph compile Time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000);
-        GGML_LOG_INFO("  - Graph InferRequest created Time: %ld ms \n",
-                      (infer_request_start_time - compile_end_time) / 1000);
-        GGML_LOG_INFO("  - Graph Input Time: %ld ms \n", (input_end_time - infer_request_start_time) / 1000);
+        GGML_LOG_INFO("  - Graph Input Time: %ld ms \n", (input_end_time - compile_end_time) / 1000);
         GGML_LOG_INFO("  - Graph Inference Time: %ld ms \n", (infer_end_time - input_end_time) / 1000);
         GGML_LOG_INFO("  - Graph Output Time: %ld ms \n", (end_time - infer_end_time) / 1000);
     }
@@ -172,3 +165,43 @@ size_t checksum(const void* data, size_t size) {
     }
     return sum;
 }
+
+void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor) {
+    std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data()
+              << std::endl;
+    switch (tensor.get_element_type()) {
+    case ov::element::f32:
+        std::cout << *(float*)(tensor.data()) << std::endl;
+        break;
+    case ov::element::f16:
+        std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl;
+        break;
+    case ov::element::i32:
+        std::cout << *(int32_t*)(tensor.data()) << std::endl;
+        break;
+    case ov::element::i64:
+        std::cout << *(int64_t*)(tensor.data()) << std::endl;
+        break;
+    default:
+        break;
+    }
+}
+
+void print_output_tensor_info(const std::string& name,
+                              const ov::Tensor& tensor,
+                              std::map<std::string, void*>& output_dst) {
+    std::cout << "Output name: " << name << ", Output shape: " << tensor.get_shape()
+              << ", Address: " << output_dst[name] << std::endl;
+    switch (tensor.get_element_type()) {
+    case ov::element::f32:
+        std::cout << *(float*)(tensor.data()) << std::endl;
+        std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
+        break;
+    case ov::element::f16:
+        std::cout << ov::float16::from_bits(*(uint16_t*)(tensor.data())) << std::endl;
+        std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
+        break;
+    default:
+        break;
+    }
+}
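Note: the static compiled_cache above memoizes conversion and compilation per cgraph, so repeated calls with the same graph skip straight to input binding and inference. The core pattern, reduced to a generic sketch (placeholder types, not the patch's code):

    #include <unordered_map>

    // Build the value on first lookup, reuse it afterwards.
    template <typename K, typename V, typename Builder>
    V& get_or_build(std::unordered_map<K, V>& cache, const K& key, Builder build) {
        auto it = cache.find(key);
        if (it == cache.end()) {
            it = cache.emplace(key, build()).first;  // miss: convert + compile once
        }
        return it->second;
    }

A caveat worth noting: keying on the raw ggml_cgraph pointer assumes the same address always denotes the same graph; if an allocator ever reuses that address for a different graph, a stale compiled model would be served.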
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 4458e71f54..96b07008ec 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -4,3 +4,9 @@
 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
 
 size_t checksum(const void* data, size_t size);
+
+void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
+
+void print_output_tensor_info(const std::string& name,
+                              const ov::Tensor& tensor,
+                              std::map<std::string, void*>& output_dst);
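Note: the extra inputs are bound by position like every other parameter; all the infer request sees is a small host tensor. A hedged sketch of the scalar case handled above (bind_scalar is illustrative, not part of the patch):

    #include <cstdint>
    #include <openvino/openvino.hpp>

    // Bind a rank-0 i64 tensor at input slot idx, as the input loop in
    // openvino_frontend_compute() does for past_token_len.
    void bind_scalar(ov::InferRequest& req, size_t idx, int64_t value) {
        ov::Tensor t(ov::element::i64, ov::Shape{});  // scalar host tensor
        *t.data<int64_t>() = value;
        req.set_input_tensor(idx, t);
    }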