From 8ae700ae11345a1d8aa0c600ca639c4c8839da13 Mon Sep 17 00:00:00 2001 From: zhanmyz Date: Wed, 26 Mar 2025 16:31:52 +0800 Subject: [PATCH] Process Prompt and predict first token is OK --- ggml/src/ggml-openvino.cpp | 146 ++++++++++++----- ggml/src/ggml-openvino/ggml-decoder.cpp | 88 ++++++---- ggml/src/ggml-openvino/utils.cpp | 208 ++++++++++++++++++++---- 3 files changed, 340 insertions(+), 102 deletions(-) diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp index 2c83edaeb5..a508aeea40 100644 --- a/ggml/src/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino.cpp @@ -652,6 +652,7 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) { } void ggml_backend_openvino_view(ggml_tensor *dst) { + GGML_UNUSED(dst); } @@ -985,8 +986,11 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe std::vector reshape_indices; std::vector view_indices; std::vector view_indices_prompt; + std::vector view_split; std::vector cpy_indices; + std::vector cpy_split_16; + std::vector cpy_split_19; std::vector transpose_indices; std::vector permute_indices; @@ -1000,12 +1004,23 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe reshape_indices.push_back(i); // } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) { + // if (cgraph->nodes[i]->src[0]->ne[0] == 98304) + // continue; view_indices.push_back(i); - if (cgraph->nodes[i]->ne[0] == 96) { + if (cgraph->nodes[i]->ne[0] == 32) { view_indices_prompt.push_back(i); } + if (i == 18) { + view_split.push_back(i); + } } else if (cgraph->nodes[i]->op == GGML_OP_CPY) { cpy_indices.push_back(i); + if (i == 16) { + cpy_split_16.push_back(i); + } + if (i == 19) { + cpy_split_19.push_back(i); + } } else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) { transpose_indices.push_back(i); } else if (cgraph->nodes[i]->op == GGML_OP_PERMUTE) { @@ -1023,10 +1038,18 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe bool prompt_process_flag = true; if (cgraph->nodes[0]->ne[1] == 1) { prompt_process_flag = false; - // int end_node = cgraph->n_nodes - 1; - // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { + ggml_backend_openvino_add_forward(cgraph->nodes[i]); + } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + ggml_backend_openvino_transpose(cgraph->nodes[i]); + } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + ggml_backend_openvino_cpy(cgraph->nodes[i]); + } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + ggml_backend_openvino_permute(cgraph->nodes[i]); + // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { ggml_backend_openvino_view(cgraph->nodes[i]); } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); @@ -1036,6 +1059,11 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe // Process a range of nodes with openvino_frontend_compute int start_index = i; while (i < cgraph->n_nodes + && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() + && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() + && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() @@ -1047,41 +1075,85 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe } } } - } else { // int end_node = cgraph->n_nodes - 1; // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); - for (int i = 0; i < cgraph->n_nodes; i++) { - if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { - ggml_backend_openvino_add_forward(cgraph->nodes[i]); - } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { - ggml_backend_openvino_permute(cgraph->nodes[i]); - // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { - // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); - // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { - // ggml_backend_openvino_view(cgraph->nodes[i]); - // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { - // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); - // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { - // ggml_backend_openvino_reshape(cgraph->nodes[i]); - } else { - // Process a range of nodes with openvino_frontend_compute - int start_index = i; - while (i < cgraph->n_nodes - && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() - && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() - // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() - // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() - // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) - // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() - // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() - ) { - i++; - } - if (start_index < i) { - openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); - } - } - } + // for (int i = 0; i < cgraph->n_nodes; i++) { + // // if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // ggml_backend_openvino_view(cgraph->nodes[i]); + // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // } else { + // // Process a range of nodes with openvino_frontend_compute + // int start_index = i; + // while (i < cgraph->n_nodes + // // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) + // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + // ) { + // i++; + // } + // if (start_index < i) { + // openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + // } + // } + // } + } else { + int end_node = cgraph->n_nodes - 1; + openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag); + // for (int i = 0; i < cgraph->n_nodes; i++) { + // if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) { + // ggml_backend_openvino_add_forward(cgraph->nodes[i]); + // // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) { + // // ggml_backend_openvino_transpose(cgraph->nodes[i]); + // // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) { + // // ggml_backend_openvino_permute(cgraph->nodes[i]); + // // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) { + // // ggml_backend_openvino_mul_mat(cgraph->nodes[i]); + // // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) { + // // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]); + // // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) { + // // ggml_backend_openvino_reshape(cgraph->nodes[i]); + // // }else if (std::find(view_split.begin(), view_split.end(), i) != view_split.end()) { + // // ggml_backend_openvino_view(cgraph->nodes[i]); + // // }else if (std::find(cpy_split_16.begin(), cpy_split_16.end(), i) != cpy_split_16.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // // }else if (std::find(cpy_split_19.begin(), cpy_split_19.end(), i) != cpy_split_19.end()) { + // // ggml_backend_openvino_cpy(cgraph->nodes[i]); + // } else { + // // Process a range of nodes with openvino_frontend_compute + // int start_index = i; + // while (i < cgraph->n_nodes + // && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end() + // // && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end() + // // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end() + // // && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end() + // // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end() + // // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end()) + // // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end() + // // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end() + // // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end() + // // && std::find(view_split.begin(), view_split.end(), i) == view_split.end() + // // && std::find(cpy_split_16.begin(), cpy_split_16.end(), i) == cpy_split_16.end() + // // && std::find(cpy_split_19.begin(), cpy_split_19.end(), i) == cpy_split_19.end() + // ) { + // i++; + // } + // if (start_index < i) { + // openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag); + // } + // } + // } } return GGML_STATUS_SUCCESS; diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index d91338127a..4ec1be7b4d 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -90,47 +90,49 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; - outputs[node_name] = node; - m_input_names.push_back(src0_name); - m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); - m_output_names.push_back(node_name); - m_continuous = true; - - ov::Shape input_shape = { static_cast(node->src[0]->ne[2]), - static_cast(node->src[0]->ne[1]), - static_cast(node->src[0]->ne[0])}; - auto input_param = std::make_shared(ov::element::f32, input_shape); - m_params.push_back(input_param); - break; - } else { std::string src1_name = std::string(node->src[1]->name); inputs[src0_name] = node->src[0]; + // inputs[src1_name] = node->src[1]; + // outputs[node_name] = node; + src1_name = std::string(node->src[1]->view_src->name); inputs[src1_name] = node->src[1]; + node_name = std::string(node->view_src->name); outputs[node_name] = node; m_input_names.push_back(src0_name); m_input_names.push_back(src1_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); m_output_names.push_back(node_name); + m_continuous = true; - // int src0_elem_size = ggml_type_size(node->src[0]->type); - // int src1_elem_size = ggml_type_size(node->src[1]->type); - - // int src0_logical_rows = node->src[0]->ne[1]; - // int src1_logical_rows = node->src[1]->ne[1]; - - // int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size; - // int src0_phys_rows = src0_logical_rows; - - // int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size; - // int src1_phys_rows = src1_logical_rows; - // ov::Shape src0_phys_shape = {1, static_cast(src0_phys_rows), static_cast(src0_phys_cols) }; - // ov::Shape src1_phys_shape = {1, static_cast(src1_phys_rows), static_cast(src1_phys_cols) }; - // auto input0_param = std::make_shared(ov::element::f32, src0_phys_shape); - // auto input1_param = std::make_shared(ov::element::f16, src1_phys_shape); - // m_params.push_back(input0_param); - // m_params.push_back(input1_param); + ov::Shape input1_shape = { static_cast(node->src[0]->ne[2]), + static_cast(node->src[0]->ne[1]), + static_cast(node->src[0]->ne[0])}; + auto input1_param = std::make_shared(ov::element::f32, input1_shape); + m_params.push_back(input1_param); + // ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), + // static_cast(node->src[1]->ne[1]), + // static_cast(node->src[1]->ne[0])}; + ov::Shape input2_shape = { static_cast(node->src[1]->ne[2]), + static_cast(node->src[1]->ne[1]), + static_cast(node->src[1]->view_src->ne[0])}; + auto input2_param = std::make_shared(ov::element::f16, input2_shape); + m_params.push_back(input2_param); + break; + } else { + std::string src1_name = std::string(node->src[1]->name); + inputs[src0_name] = node->src[0]; + // inputs[src1_name] = node->src[1]; + // outputs[node_name] = node; + src1_name = std::string(node->src[1]->view_src->name); + inputs[src1_name] = node->src[1]; + node_name = std::string(node->view_src->name); + outputs[node_name] = node; + m_input_names.push_back(src0_name); + m_input_names.push_back(src1_name); + m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); + m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op)); + m_output_names.push_back(node_name); ov::Shape input0_shape = { static_cast(node->src[0]->ne[2]), static_cast(node->src[0]->ne[1]), @@ -150,6 +152,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[0]; + // if (node->ne[0] == 21504 || node->ne[0] == 7 + // || node->ne[0] == 3072 && node->src[0]->ne[0] == 98304 + // || node->ne[0] == 1 && node->src[0]->ne[0] == 98304) { + // // if (node->ne[0] == 21504 || node->ne[0] == 7) { + // node_name = std::string(node->view_src->name); + // outputs[node_name] = node; + // } else { + // outputs[node_name] = node; + // } outputs[node_name] = node; m_input_names.push_back(src0_name); m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op)); @@ -193,6 +204,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::mapsrc[1]->name); inputs[src0_name] = node->src[0]; + // if (node->ne[0] == 32 &&node->src[0]->type == GGML_TYPE_I32) { + // static_cast(inputs[src0_name]->data)[0] = 1; + // } else if (node->ne[0] == 32 && node->src[0]->type == GGML_TYPE_F16) { + // static_cast(inputs[src0_name]->data)[0] = static_cast(1); + // } inputs[src1_name] = node->src[1]; outputs[node_name] = node; m_input_names.push_back(src0_name); @@ -346,13 +362,17 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) { } GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index) - :m_cgraph(cgraph), - m_node(node), - m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { + :m_cgraph(cgraph), + m_node(node), + m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") { m_inputs.clear(); m_outputs.clear(); m_input_names.clear(); m_output_names.clear(); + m_params.clear(); + m_op_node_name.clear(); + m_decoders.clear(); + // If first init if (m_node) { set_input_output(m_node, m_inputs, m_outputs); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 3909afbe2d..53fecd3b23 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -3,6 +3,7 @@ #include "ggml-backend-impl.h" #include #include +#include using ov::frontend::ggml::GgmlDecoder; @@ -32,32 +33,70 @@ std::vector> get_ggml_graph_input_tensors(std ov::element::Type input_type = ggml_decoder->get_input_type(name); size_t element_size = input_type.size(); std::vector input_stride = ggml_decoder->get_input_stride(name); - if (op_node_name == "CONT" && input_shape[0] == 1 && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1])) { + if (op_node_name == "CONT" && input_shape[0] == 1 // Except for the kqv_merge node + && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1]) + ) { const size_t num_rows = static_cast(ggml_decoder->get_input_shape(name).to_shape()[1]); const size_t dim2 = static_cast(ggml_decoder->get_input_shape(name).to_shape()[0]); size_t phys_stride = static_cast(input_stride[1]) / element_size; ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 } + // if (!flag) { + // std::cout << "CONT input shape: " << input_shape << std::endl; + // } input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data); - } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous - std::vector input_stride = ggml_decoder->get_input_stride(name); - ov::element::Type input_type = ggml_decoder->get_input_type(name); - size_t element_size = input_type.size(); - // ov::Shape phys_shape; - static int iter = 0; - if (iter++ % 2 == 0) { - // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; - input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); - } else { - ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; - input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); - } + // } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous + // } else if (op_node_name == "CPY") { + // std::vector input_stride = ggml_decoder->get_input_stride(name); + // ov::element::Type input_type = ggml_decoder->get_input_type(name); + // size_t element_size = input_type.size(); + // // ov::Shape phys_shape; + // static int iter = 0; + // if (iter++ % 2 == 0) { + // // phys_shape = {1, input_shape[1], input_stride[2] / element_size}; + // input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data); + // } else { + // ov::Shape flat_shape = {1, 1, input_stride[0] / element_size}; + // input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data); + // } } else { input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data); + // if(!flag) { + // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " + // << "Input Name: " << std::setw(20) << name + // << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name) + // << "OP: " << std::setw(10) << op_node_name + // << "CONT: " << check_if_contiguous + // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor.data() << " " + // << std::setw(15) << ((float*)input_tensor.data())[0] + // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + // } + // if (op_node_name == "MUL_MAT") { + // std::cout << std::left << "[ " << std::setw(2) << inp << " ]: " + // << "Input MUL_MAT name: " << std::setw(20) << name + // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor.data() << " " + // << std::setw(15) << ((float*)input_tensor.data())[0] + // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + // } } // input_tensors[name] = input_tensor; input_tensors.emplace_back(name, input_tensor); } + // std::cout << "input_names.size(): " << input_names.size() << std::endl; return input_tensors; } @@ -117,7 +156,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // Convert InputModel -> ov::Model std::shared_ptr model = front_end->convert(input_model); // ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml"); - + + // auto cloned_model = model->clone(); + // std::string model_dir = "/home/user/zhan/merge_git_commits/llama.cpp-ov"; + // auto path_base = model_dir + "/" + cloned_model->get_name(); + // // ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); + // ov::serialize(cloned_model, path_base + ".xml", path_base + ".bin"); + if (!model) { GGML_LOG_ERROR("Model is not converted \n"); } else { @@ -126,9 +171,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c #endif } + // model = core.read_model("/home/user/zhan/merge_git_commits/llama.cpp-ov/replaceWithInputLayer_000_model.xml"); // Loading a model to the device + // std::cout << "Compile ..." << std::endl; ov::CompiledModel compiled_model = core.compile_model(model); // ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml"); + // std::ofstream output_file("/home/user/zhan/merge_git_commits/llama.cpp-ov/000_compile_model.xml"); + // compiled_model.export_model(output_file); + // output_file.close(); // Create infer request ov::InferRequest infer_request = compiled_model.create_infer_request(); @@ -151,34 +201,130 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c // std::cout << std::endl; } + // std::cout << "Infer ..." << std::endl; infer_request.infer(); // Set dst data for outputs auto output_names = ggml_decoder->get_output_names(); auto output_tensors = get_ggml_graph_output_dst(ggml_decoder); for (size_t i = 0; i < output_names.size(); i++) { - // std::string op_name = ggml_decoder->get_node_op_name(output_names[i]); auto output_tensor = infer_request.get_output_tensor(i); - // output_tensor.get_shape(); std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size()); - auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]); - // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " - // << "output_names: " << std::setw(20) << output_names[i] - // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] - // << ", address: " - // << std::setw(15) << tensor->data << " " - // << std::setw(15) << ((float*)output_tensor.data())[0] - // << std::setw(15) << ((float*)output_tensor.data())[1] - // << ", ne[0]: " - // << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0]] << std::right - // << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0] + 1] << std::right - // << std::right - // << std::endl; + // if(!flag) { + // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]); + // std::cout << std::left << "[ " << std::setw(2) << i << " ]: " + // << "output_names: " << std::setw(20) << output_names[i] + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << static_cast(((float*)output_tensor.data())[0]) + // << std::setw(15) << static_cast(((float*)output_tensor.data())[1]) + // << ", ne[0]: " + // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0]]) << std::right + // << std::setw(15) << static_cast(((float*)output_tensor.data())[tensor->ne[0] + 1]) << std::right + // << std::right + // << std::endl; + // if (i == 19) { + // auto output_tensor_18 = infer_request.get_output_tensor(18); + // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[18]); + // std::cout << std::left << " " << std::setw(2) << 18 << " : " + // << "output_names: " << std::setw(20) << output_names[18] + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[0]) + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[1]) + // << ", ne[0]: " + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0]]) << std::right + // << std::setw(15) << static_cast(((float*)output_tensor_18.data())[tensor->ne[0] + 1]) << std::right + // << std::right + // << std::endl; + // } + // if(i == 23) { + // auto output_tensor_15 = infer_request.get_output_tensor(15); + // auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[15]); + // std::cout << std::left << " " << std::setw(2) << 15 << " : " + // << "output_names: " << std::setw(20) << output_names[15] + // << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2] + // << ", address: " + // << std::setw(15) << tensor->data << " " + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[0]) + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[1]) + // << ", ne[0]: " + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0]]) << std::right + // << std::setw(15) << static_cast(((float*)output_tensor_15.data())[tensor->ne[0] + 1]) << std::right + // << std::right + // << std::endl; + // auto cache_k_l0_20 = ggml_decoder->get_input_names()[20]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor = input_tensors.at(20).second; + // std::cout << std::left << " " << std::setw(2) << 20 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_20 + // << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor.data() << " " + // << std::setw(15) << ((float*)input_tensor.data())[0] + // << std::setw(15) << ((float*)input_tensor.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + + // auto cache_k_l0_27 = ggml_decoder->get_input_names()[27]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor_27 = input_tensors.at(27).second; + // std::cout << std::left << " " << std::setw(2) << 27 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_27 + // << ", shape: " << std::setw(4) << input_tensor_27.get_shape()[0] << " " << std::setw(4) << input_tensor_27.get_shape()[1] << " " << input_tensor_27.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor_27.data() << " " + // << std::setw(15) << ((float*)input_tensor_27.data())[0] + // << std::setw(15) << ((float*)input_tensor_27.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + + // auto cache_k_l0_29 = ggml_decoder->get_input_names()[29]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor_29 = input_tensors.at(29).second; + // std::cout << std::left << " " << std::setw(2) << 29 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_29 + // << ", shape: " << std::setw(4) << input_tensor_29.get_shape()[0] << " " << std::setw(4) << input_tensor_29.get_shape()[1] << " " << input_tensor_29.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor_29.data() << " " + // << std::setw(15) << ((float*)input_tensor_29.data())[0] + // << std::setw(15) << ((float*)input_tensor_29.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + + // auto cache_k_l0_30 = ggml_decoder->get_input_names()[30]; + // // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data; + // auto input_tensor_30 = input_tensors.at(30).second; + // std::cout << std::left << " " << std::setw(2) << 30 << " : " + // << "Input Name: " << std::setw(20) << cache_k_l0_30 + // << ", shape: " << std::setw(4) << input_tensor_30.get_shape()[0] << " " << std::setw(4) << input_tensor_30.get_shape()[1] << " " << input_tensor_30.get_shape()[2] + // << ", address: " + // << std::setw(15) << input_tensor_30.data() << " " + // << std::setw(15) << ((float*)input_tensor_30.data())[0] + // << std::setw(15) << ((float*)input_tensor_30.data())[1] + // << ", ne[0]: " + // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0]] << std::right + // << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] + 1] << std::right + // << std::right + // << std::endl; + // } + // } #ifdef GGML_OPENVINO_DEBUG printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data())); #endif } - + return GGML_STATUS_SUCCESS; GGML_UNUSED(backend); }