From 651b2c06cb593d62d4a4a925a492be27063a1cc0 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 14 Apr 2025 18:04:03 +0800 Subject: [PATCH] * Use find_package in CMake to configure OpenVINO * Remove OPENVINO_OP_DEBUG * Simplify set_input_output in decoder * Fix CPY in set_input_output * Use params from converted ov model in setting input --- ggml/src/ggml-openvino.cpp | 28 ++- ggml/src/ggml-openvino/ggml-decoder.cpp | 274 +++++------------------- ggml/src/ggml-openvino/utils.cpp | 55 +++-- 3 files changed, 114 insertions(+), 243 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index b9f1b89722..762ed786a9 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include "ggml-openvino.h" #include "ggml-openvino/utils.h" +#include "ggml.h" #include #include @@ -1367,7 +1368,7 @@ static const std::set& openvino_ops = []() -> const std::set& openvino_ops = []() -> const std::setop); - if (it == op_mapping.end()) { - return false; + static const std::map> op_mapping_unary = { + {GGML_UNARY_OP_SILU, {"Sigmoid", "Multiply"}}, + }; + + std::vector mapped_ops; + if (op->op == GGML_OP_UNARY) { + auto it = op_mapping_unary.find(ggml_get_unary_op(op)); + if (it == op_mapping_unary.end()) { + return false; + } + mapped_ops = it->second; + } else { + auto it = op_mapping.find(op->op); + if (it == op_mapping.end()) { + return false; + } + mapped_ops = it->second; } - for (const std::string& op_name : it->second) { + for (const std::string& op_name : mapped_ops) { if (openvino_ops.count(op_name) == 0) { return false; } } return true; -#endif } static bool ggml_backend_openvino_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 3b396c05f7..d7895c3d7f 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -6,222 +6,66 @@ #include void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map& inputs, std::map& outputs) { + std::string node_name; + if (node->op == GGML_OP_CPY) { + // CPY updates the input tensor in place. For later ov op that uses the + // input tensor of CPY, we need to make sure they get the updated tensor + // by putting the src tensor name in the tensor_map in + // /src/frontends/ggml/src/translate_session.cpp + node_name = std::string(node->view_src->name); + } else { + node_name = std::string(node->name); + } + std::string src0_name = std::string(node->src[0]->name); - std::string node_name = std::string(node->name); + inputs[src0_name] = node->src[0]; + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); + if (node->op == GGML_OP_CPY && node->view_src) { + m_output_names.push_back(node->view_src->name); + } else { + m_output_names.push_back(node_name); + } + + if (node->src[1]) { + std::string src1_name = std::string(node->src[1]->name); + inputs[src1_name] = node->src[1]; + m_input_names.push_back(src1_name); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); + } + if (node->src[2]) { + std::string src2_name = std::string(node->src[2]->name); + inputs[src2_name] = node->src[2]; + m_input_names.push_back(src2_name); + m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); + } switch (node->op) { - // Unary OPs - case GGML_OP_UNARY: - case GGML_OP_RESHAPE: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - case GGML_OP_RMS_NORM: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; + case GGML_OP_CONT: { + if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node) && + (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { + m_continuous = true; + } else { + m_continuous = false; } - case GGML_OP_CONT: - { - if (ggml_is_contiguous(node->src[0]) - && ggml_is_contiguous(node) - && (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - - m_continuous = true; - break; - } - - if (node->src[0]->type == node->type && node->src[0]->ne[0] == node->ne[0] && - node->src[0]->nb[0] == ggml_type_size(node->src[0]->type) && - node->nb[0] == ggml_type_size(node->src[0]->type)) { - - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - const size_t element_size = ggml_type_size(node->src[0]->type); - size_t valid_elems = static_cast(node->src[0]->ne[0]); // 3072 - size_t num_rows = static_cast(node->src[0]->ne[1]); // 7 - size_t dim2 = static_cast(node->src[0]->ne[2]); // 1 - size_t phys_stride = static_cast(node->src[0]->nb[1]) / element_size; // 9216 - // size_t total_phys = (num_rows - 1) * phys_stride + valid_elems; // 6*9216 + 3072 = 58368 - size_t total_phys = num_rows * phys_stride; // 7 * 9216 = 64512 - ov::Shape input_shape = { dim2, num_rows, phys_stride }; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - - m_continuous = false; - break; - } - - if (ggml_is_contiguous(node)) { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - - m_continuous = false; - break; - } + break; + } + case GGML_OP_CPY: { + m_continuous = ggml_is_contiguous(node); + break; + } + case GGML_OP_MUL_MAT: { + if (!ggml_is_contiguous(node->src[1]) || + node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { + m_continuous = false; + } else { + m_continuous = true; } - case GGML_OP_CPY: - { - if (ggml_is_contiguous(node)) { - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - src1_name = std::string(node->src[1]->view_src->name); - inputs[src1_name] = node->src[1]; - node_name = std::string(node->view_src->name); - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - m_continuous = true; - - ov::Shape input1_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input1_param = std::make_shared(ov::element::f32, input1_shape); - m_params.push_back(input1_param); - ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), - static_cast(node->src[1]->ne[1]), - static_cast(node->src[1]->view_src->ne[0])}; - auto input2_param = std::make_shared(ov::element::f16, input2_shape); - m_params.push_back(input2_param); - break; - } else { - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - src1_name = std::string(node->src[1]->view_src->name); - inputs[src1_name] = node->src[1]; - node_name = std::string(node->view_src->name); - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - - ov::Shape input0_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input0_param = std::make_shared(ov::element::f32, input0_shape); - m_params.push_back(input0_param); - ov::Shape input1_shape = { 1, 1, static_cast(node->src[1]->nb[2] / node->src[1]->nb[0])}; - auto input1_param = std::make_shared(ov::element::f16, input1_shape); - m_params.push_back(input1_param); - - m_continuous = false; - - break; - } - } - // For view, input is node itself - case GGML_OP_VIEW: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; - } - // SCALE - case GGML_OP_SCALE: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; - } - case GGML_OP_MUL_MAT: - { - if (!ggml_is_contiguous(node->src[1]) || node->src[1]->ne[0] * node->src[1]->nb[0] != node->src[1]->nb[1]) { - m_continuous = false; - } else { - m_continuous = true; - } - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - inputs[src1_name] = node->src[1]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - break; - } - // OPs with 2 inputs - case GGML_OP_ADD: - case GGML_OP_DIV: - case GGML_OP_MUL: - case GGML_OP_SUB: - case GGML_OP_GET_ROWS: - case GGML_OP_SOFT_MAX: - { - inputs[src0_name] = node->src[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - if (node->src[1]) { - std::string src1_name = std::string(node->src[1]->name); - inputs[src1_name] = node->src[1]; - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - m_input_names.push_back(src1_name); - } - break; - } - // OPs with 3 inputs: - case GGML_OP_ROPE: - { - std::string src1_name = std::string(node->src[1]->name); - inputs[src0_name] = node->src[0]; - inputs[src1_name] = node->src[1]; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_input_names.push_back(src1_name); - m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); - outputs[node_name] = node; - m_output_names.push_back(node_name); - if (node->src[2]) { - std::string src2_name = std::string(node->src[2]->name); - inputs[src2_name] = node->src[2]; - m_input_names.push_back(src2_name); - m_op_node_name.emplace_back(src2_name, ggml_op_name(node->op)); - } - break; - } - default: - break; + break; + } + default: + break; } } @@ -334,7 +178,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr m_op_node_name.clear(); m_decoders.clear(); - // If first init if (m_node) { set_input_output(m_node, m_inputs, m_outputs); } else { @@ -353,7 +196,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr ov::PartialShape GgmlOvDecoder::get_input_shape(const std::string& name) const { ov::PartialShape input_shape; - // Use input_node->ne + // Use input_node->ne ggml_tensor * node = m_inputs.at(name); std::vector shape; @@ -440,7 +283,6 @@ const std::vector>& GgmlOvDecoder::get_pa ov::PartialShape GgmlOvDecoder::get_output_shape(const std::string& name) const { ov::PartialShape output_shape; - // Use input_node->ne ggml_tensor * node = m_outputs.at(name); std::vector shape; @@ -552,10 +394,10 @@ const std::string& GgmlOvDecoder::get_op_type() const { auto unary_it = unaryOpTypeMap.find(ggml_get_unary_op(m_node)); if (unary_it != unaryOpTypeMap.end()) { return unary_it->second; - } + } } return it->second; - } + } static const std::string unknown_op = "UNKNOWN_OP"; return unknown_op; } diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 736c7f690b..f4d9c7705a 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,9 +1,11 @@ #include "utils.h" -#include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "ggml-impl.h" +#include +#include +#include #include #include -#include using ov::frontend::ggml::GgmlDecoder; @@ -20,27 +22,14 @@ std::vector> get_ggml_graph_input_tensors(std std::string op_node_name = ggml_decoder->get_op_node_name(name, op_iter++); // auto node_op_name = ggml_decoder->get_node_op_name(name); auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; - auto check_if_contiguous = ggml_is_contiguous(ggml_decoder->get_input_ggml_tensor(name)); #ifdef GGML_OPENVINO_DEBUG printf("Subgraph input %d: %g\n", inp, *(double*)(input_data)); #endif ov::Tensor input_tensor; ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); std::vector input_stride = ggml_decoder->get_input_stride(name); - if (op_node_name == "CONT" && input_shape[0] == 1 // Except for the kqv_merge node - && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1]) - ) { - const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); - const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); - size_t phys_stride = static_cast(input_stride[1]) / element_size; - ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - } else { - input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); - } + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); // input_tensors[name] = input_tensor; input_tensors.emplace_back(name, input_tensor); @@ -49,6 +38,18 @@ std::vector> get_ggml_graph_input_tensors(std return input_tensors; } +ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr ggml_decoder, std::string& name) { + auto input_data = ggml_decoder->get_input_ggml_tensor(name)->data; + #ifdef GGML_OPENVINO_DEBUG + printf("Subgraph input %s: %g\n", name.c_str(), *(double*)(input_data)); + #endif + ov::Tensor input_tensor; + ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape(); + std::vector input_stride = ggml_decoder->get_input_stride(name); + input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); + return input_tensor; +} + std::map get_ggml_graph_output_dst(std::shared_ptr ggml_decoder) { std::map output_tensors; auto output_names = ggml_decoder->get_output_names(); @@ -79,7 +80,7 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index, bool flag) { static ov::Core core; // auto devices = core.get_available_devices(); - // Get GGML Frontend + // Get GGML Frontend static auto front_end = get_ggml_frontend(); if (!front_end) { GGML_LOG_ERROR("GGML FrontEnd is not initialized \n"); @@ -102,9 +103,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c #endif } - // Convert InputModel -> ov::Model + // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); + if (getenv("OPENVINO_DUMP_GRAPH")) { + char timestamped_filename[64]; + auto timestamp = (long long)ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), + "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } + if (!model) { GGML_LOG_ERROR("Model is not converted \n"); } else { @@ -122,10 +131,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c auto input_names = ggml_decoder->get_input_names(); auto input_tensors = get_ggml_graph_input_tensors(ggml_decoder, flag); - // Set input tensor - for (size_t i = 0; i < input_names.size(); i++) { - infer_request.set_input_tensor(i, input_tensors.at(i).second); + auto ov_params = model->get_parameters(); + for (size_t i = 0; i < ov_params.size(); i++) { + auto param_name = ov_params[i]->get_friendly_name(); + infer_request.set_input_tensor(i, get_ggml_graph_input_tensor(ggml_decoder, param_name)); } + // for (size_t i = 0; i < input_names.size(); i++) { + // infer_request.set_input_tensor(i, input_tensors.at(i).second); + // } infer_request.infer();