Process Prompt and predict first token is OK
This commit is contained in:
parent
8020138406
commit
8ae700ae11
|
|
@ -652,6 +652,7 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) {
|
|||
}
|
||||
|
||||
void ggml_backend_openvino_view(ggml_tensor *dst) {
|
||||
|
||||
GGML_UNUSED(dst);
|
||||
}
|
||||
|
||||
|
|
@ -985,8 +986,11 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
|
|||
std::vector<int> reshape_indices;
|
||||
std::vector<int> view_indices;
|
||||
std::vector<int> view_indices_prompt;
|
||||
std::vector<int> view_split;
|
||||
|
||||
std::vector<int> cpy_indices;
|
||||
std::vector<int> cpy_split_16;
|
||||
std::vector<int> cpy_split_19;
|
||||
std::vector<int> transpose_indices;
|
||||
std::vector<int> permute_indices;
|
||||
|
||||
|
|
@ -1000,12 +1004,23 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
|
|||
reshape_indices.push_back(i);
|
||||
// } else if (cgraph->nodes[i]->op == GGML_OP_VIEW) {
|
||||
} else if (cgraph->nodes[i]->op == GGML_OP_VIEW) {
|
||||
// if (cgraph->nodes[i]->src[0]->ne[0] == 98304)
|
||||
// continue;
|
||||
view_indices.push_back(i);
|
||||
if (cgraph->nodes[i]->ne[0] == 96) {
|
||||
if (cgraph->nodes[i]->ne[0] == 32) {
|
||||
view_indices_prompt.push_back(i);
|
||||
}
|
||||
if (i == 18) {
|
||||
view_split.push_back(i);
|
||||
}
|
||||
} else if (cgraph->nodes[i]->op == GGML_OP_CPY) {
|
||||
cpy_indices.push_back(i);
|
||||
if (i == 16) {
|
||||
cpy_split_16.push_back(i);
|
||||
}
|
||||
if (i == 19) {
|
||||
cpy_split_19.push_back(i);
|
||||
}
|
||||
} else if (cgraph->nodes[i]->op == GGML_OP_TRANSPOSE) {
|
||||
transpose_indices.push_back(i);
|
||||
} else if (cgraph->nodes[i]->op == GGML_OP_PERMUTE) {
|
||||
|
|
@ -1023,10 +1038,18 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
|
|||
bool prompt_process_flag = true;
|
||||
if (cgraph->nodes[0]->ne[1] == 1) {
|
||||
prompt_process_flag = false;
|
||||
// int end_node = cgraph->n_nodes - 1;
|
||||
// openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag);
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
|
||||
if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) {
|
||||
ggml_backend_openvino_add_forward(cgraph->nodes[i]);
|
||||
} else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
|
||||
ggml_backend_openvino_transpose(cgraph->nodes[i]);
|
||||
} else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
|
||||
ggml_backend_openvino_cpy(cgraph->nodes[i]);
|
||||
} else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
|
||||
ggml_backend_openvino_permute(cgraph->nodes[i]);
|
||||
// } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
|
||||
// ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
|
||||
} else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
|
||||
ggml_backend_openvino_view(cgraph->nodes[i]);
|
||||
} else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
|
||||
ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
|
||||
|
|
@ -1036,6 +1059,11 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
|
|||
// Process a range of nodes with openvino_frontend_compute
|
||||
int start_index = i;
|
||||
while (i < cgraph->n_nodes
|
||||
&& std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end()
|
||||
&& std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end()
|
||||
&& std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
|
||||
&& std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end()
|
||||
// && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
|
||||
&& std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
|
||||
&& std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
|
||||
&& std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
|
||||
|
|
@ -1047,41 +1075,85 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backe
|
|||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// int end_node = cgraph->n_nodes - 1;
|
||||
// openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag);
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) {
|
||||
ggml_backend_openvino_add_forward(cgraph->nodes[i]);
|
||||
} else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
|
||||
ggml_backend_openvino_permute(cgraph->nodes[i]);
|
||||
// } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
|
||||
// ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
|
||||
// } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) {
|
||||
// ggml_backend_openvino_view(cgraph->nodes[i]);
|
||||
// } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
|
||||
// ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
|
||||
// } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
|
||||
// ggml_backend_openvino_reshape(cgraph->nodes[i]);
|
||||
} else {
|
||||
// Process a range of nodes with openvino_frontend_compute
|
||||
int start_index = i;
|
||||
while (i < cgraph->n_nodes
|
||||
&& std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end()
|
||||
&& std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end()
|
||||
// && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
|
||||
// && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
|
||||
// && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end())
|
||||
// && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
|
||||
// && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
|
||||
) {
|
||||
i++;
|
||||
}
|
||||
if (start_index < i) {
|
||||
openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
|
||||
}
|
||||
}
|
||||
}
|
||||
// for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
// // if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) {
|
||||
// // ggml_backend_openvino_view(cgraph->nodes[i]);
|
||||
// if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
|
||||
// ggml_backend_openvino_view(cgraph->nodes[i]);
|
||||
// } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
|
||||
// ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
|
||||
// } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
|
||||
// ggml_backend_openvino_reshape(cgraph->nodes[i]);
|
||||
// } else {
|
||||
// // Process a range of nodes with openvino_frontend_compute
|
||||
// int start_index = i;
|
||||
// while (i < cgraph->n_nodes
|
||||
// // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end())
|
||||
// && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
|
||||
// && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
|
||||
// && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
|
||||
// ) {
|
||||
// i++;
|
||||
// }
|
||||
// if (start_index < i) {
|
||||
// openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
} else {
|
||||
int end_node = cgraph->n_nodes - 1;
|
||||
openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag);
|
||||
// for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
// if (std::find(add_indices.begin(), add_indices.end(), i) != add_indices.end()) {
|
||||
// ggml_backend_openvino_add_forward(cgraph->nodes[i]);
|
||||
// // } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
|
||||
// // ggml_backend_openvino_transpose(cgraph->nodes[i]);
|
||||
// // } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
|
||||
// // ggml_backend_openvino_cpy(cgraph->nodes[i]);
|
||||
// // } else if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
|
||||
// // ggml_backend_openvino_permute(cgraph->nodes[i]);
|
||||
// // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
|
||||
// // ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
|
||||
// // } else if (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) != view_indices_prompt.end()) {
|
||||
// // ggml_backend_openvino_view(cgraph->nodes[i]);
|
||||
// // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
|
||||
// // ggml_backend_openvino_view(cgraph->nodes[i]);
|
||||
// // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
|
||||
// // ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
|
||||
// // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
|
||||
// // ggml_backend_openvino_reshape(cgraph->nodes[i]);
|
||||
// // }else if (std::find(view_split.begin(), view_split.end(), i) != view_split.end()) {
|
||||
// // ggml_backend_openvino_view(cgraph->nodes[i]);
|
||||
// // }else if (std::find(cpy_split_16.begin(), cpy_split_16.end(), i) != cpy_split_16.end()) {
|
||||
// // ggml_backend_openvino_cpy(cgraph->nodes[i]);
|
||||
// // }else if (std::find(cpy_split_19.begin(), cpy_split_19.end(), i) != cpy_split_19.end()) {
|
||||
// // ggml_backend_openvino_cpy(cgraph->nodes[i]);
|
||||
// } else {
|
||||
// // Process a range of nodes with openvino_frontend_compute
|
||||
// int start_index = i;
|
||||
// while (i < cgraph->n_nodes
|
||||
// && std::find(add_indices.begin(), add_indices.end(), i) == add_indices.end()
|
||||
// // && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end()
|
||||
// // && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
|
||||
// // && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end()
|
||||
// // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
|
||||
// // && (std::find(view_indices_prompt.begin(), view_indices_prompt.end(), i) == view_indices_prompt.end())
|
||||
// // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
|
||||
// // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
|
||||
// // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
|
||||
// // && std::find(view_split.begin(), view_split.end(), i) == view_split.end()
|
||||
// // && std::find(cpy_split_16.begin(), cpy_split_16.end(), i) == cpy_split_16.end()
|
||||
// // && std::find(cpy_split_19.begin(), cpy_split_19.end(), i) == cpy_split_19.end()
|
||||
// ) {
|
||||
// i++;
|
||||
// }
|
||||
// if (start_index < i) {
|
||||
// openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
|
|
|
|||
|
|
@ -90,47 +90,49 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, gg
|
|||
case GGML_OP_CPY:
|
||||
{
|
||||
if (ggml_is_contiguous(node)) {
|
||||
inputs[src0_name] = node->src[0];
|
||||
outputs[node_name] = node;
|
||||
m_input_names.push_back(src0_name);
|
||||
m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
|
||||
m_output_names.push_back(node_name);
|
||||
m_continuous = true;
|
||||
|
||||
ov::Shape input_shape = { static_cast<size_t>(node->src[0]->ne[2]),
|
||||
static_cast<size_t>(node->src[0]->ne[1]),
|
||||
static_cast<size_t>(node->src[0]->ne[0])};
|
||||
auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
|
||||
m_params.push_back(input_param);
|
||||
break;
|
||||
} else {
|
||||
std::string src1_name = std::string(node->src[1]->name);
|
||||
inputs[src0_name] = node->src[0];
|
||||
// inputs[src1_name] = node->src[1];
|
||||
// outputs[node_name] = node;
|
||||
src1_name = std::string(node->src[1]->view_src->name);
|
||||
inputs[src1_name] = node->src[1];
|
||||
node_name = std::string(node->view_src->name);
|
||||
outputs[node_name] = node;
|
||||
m_input_names.push_back(src0_name);
|
||||
m_input_names.push_back(src1_name);
|
||||
m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
|
||||
m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op));
|
||||
m_output_names.push_back(node_name);
|
||||
m_continuous = true;
|
||||
|
||||
// int src0_elem_size = ggml_type_size(node->src[0]->type);
|
||||
// int src1_elem_size = ggml_type_size(node->src[1]->type);
|
||||
|
||||
// int src0_logical_rows = node->src[0]->ne[1];
|
||||
// int src1_logical_rows = node->src[1]->ne[1];
|
||||
|
||||
// int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size;
|
||||
// int src0_phys_rows = src0_logical_rows;
|
||||
|
||||
// int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size;
|
||||
// int src1_phys_rows = src1_logical_rows;
|
||||
// ov::Shape src0_phys_shape = {1, static_cast<size_t>(src0_phys_rows), static_cast<size_t>(src0_phys_cols) };
|
||||
// ov::Shape src1_phys_shape = {1, static_cast<size_t>(src1_phys_rows), static_cast<size_t>(src1_phys_cols) };
|
||||
// auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, src0_phys_shape);
|
||||
// auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, src1_phys_shape);
|
||||
// m_params.push_back(input0_param);
|
||||
// m_params.push_back(input1_param);
|
||||
ov::Shape input1_shape = { static_cast<size_t>(node->src[0]->ne[2]),
|
||||
static_cast<size_t>(node->src[0]->ne[1]),
|
||||
static_cast<size_t>(node->src[0]->ne[0])};
|
||||
auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input1_shape);
|
||||
m_params.push_back(input1_param);
|
||||
// ov::Shape input2_shape = { static_cast<size_t>(node->src[1]->ne[2]),
|
||||
// static_cast<size_t>(node->src[1]->ne[1]),
|
||||
// static_cast<size_t>(node->src[1]->ne[0])};
|
||||
ov::Shape input2_shape = { static_cast<size_t>(node->src[1]->ne[2]),
|
||||
static_cast<size_t>(node->src[1]->ne[1]),
|
||||
static_cast<size_t>(node->src[1]->view_src->ne[0])};
|
||||
auto input2_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, input2_shape);
|
||||
m_params.push_back(input2_param);
|
||||
break;
|
||||
} else {
|
||||
std::string src1_name = std::string(node->src[1]->name);
|
||||
inputs[src0_name] = node->src[0];
|
||||
// inputs[src1_name] = node->src[1];
|
||||
// outputs[node_name] = node;
|
||||
src1_name = std::string(node->src[1]->view_src->name);
|
||||
inputs[src1_name] = node->src[1];
|
||||
node_name = std::string(node->view_src->name);
|
||||
outputs[node_name] = node;
|
||||
m_input_names.push_back(src0_name);
|
||||
m_input_names.push_back(src1_name);
|
||||
m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
|
||||
m_op_node_name.emplace_back(src1_name, ggml_op_name(node->op));
|
||||
m_output_names.push_back(node_name);
|
||||
|
||||
ov::Shape input0_shape = { static_cast<size_t>(node->src[0]->ne[2]),
|
||||
static_cast<size_t>(node->src[0]->ne[1]),
|
||||
|
|
@ -150,6 +152,15 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, gg
|
|||
case GGML_OP_VIEW:
|
||||
{
|
||||
inputs[src0_name] = node->src[0];
|
||||
// if (node->ne[0] == 21504 || node->ne[0] == 7
|
||||
// || node->ne[0] == 3072 && node->src[0]->ne[0] == 98304
|
||||
// || node->ne[0] == 1 && node->src[0]->ne[0] == 98304) {
|
||||
// // if (node->ne[0] == 21504 || node->ne[0] == 7) {
|
||||
// node_name = std::string(node->view_src->name);
|
||||
// outputs[node_name] = node;
|
||||
// } else {
|
||||
// outputs[node_name] = node;
|
||||
// }
|
||||
outputs[node_name] = node;
|
||||
m_input_names.push_back(src0_name);
|
||||
m_op_node_name.emplace_back(src0_name, ggml_op_name(node->op));
|
||||
|
|
@ -193,6 +204,11 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, gg
|
|||
}
|
||||
std::string src1_name = std::string(node->src[1]->name);
|
||||
inputs[src0_name] = node->src[0];
|
||||
// if (node->ne[0] == 32 &&node->src[0]->type == GGML_TYPE_I32) {
|
||||
// static_cast<int32_t*>(inputs[src0_name]->data)[0] = 1;
|
||||
// } else if (node->ne[0] == 32 && node->src[0]->type == GGML_TYPE_F16) {
|
||||
// static_cast<uint16_t*>(inputs[src0_name]->data)[0] = static_cast<uint16_t>(1);
|
||||
// }
|
||||
inputs[src1_name] = node->src[1];
|
||||
outputs[node_name] = node;
|
||||
m_input_names.push_back(src0_name);
|
||||
|
|
@ -346,13 +362,17 @@ void ggml_graph_op_print(const struct ggml_cgraph * cgraph) {
|
|||
}
|
||||
|
||||
GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgraph, const int32_t start_index, const int32_t end_index)
|
||||
:m_cgraph(cgraph),
|
||||
m_node(node),
|
||||
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") {
|
||||
:m_cgraph(cgraph),
|
||||
m_node(node),
|
||||
m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") {
|
||||
m_inputs.clear();
|
||||
m_outputs.clear();
|
||||
m_input_names.clear();
|
||||
m_output_names.clear();
|
||||
m_params.clear();
|
||||
m_op_node_name.clear();
|
||||
m_decoders.clear();
|
||||
|
||||
// If first init
|
||||
if (m_node) {
|
||||
set_input_output(m_node, m_inputs, m_outputs);
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
#include "ggml-backend-impl.h"
|
||||
#include <openvino/frontend/manager.hpp>
|
||||
#include <openvino/openvino.hpp>
|
||||
#include <fstream>
|
||||
|
||||
using ov::frontend::ggml::GgmlDecoder;
|
||||
|
||||
|
|
@ -32,32 +33,70 @@ std::vector<std::pair<std::string, ov::Tensor>> get_ggml_graph_input_tensors(std
|
|||
ov::element::Type input_type = ggml_decoder->get_input_type(name);
|
||||
size_t element_size = input_type.size();
|
||||
std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
|
||||
if (op_node_name == "CONT" && input_shape[0] == 1 && (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1])) {
|
||||
if (op_node_name == "CONT" && input_shape[0] == 1 // Except for the kqv_merge node
|
||||
&& (input_shape[1] != 1 && flag || input_shape[2]*element_size!=input_stride[1])
|
||||
) {
|
||||
const size_t num_rows = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[1]);
|
||||
const size_t dim2 = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[0]);
|
||||
size_t phys_stride = static_cast<size_t>(input_stride[1]) / element_size;
|
||||
ov::Shape input_shape = { dim2, num_rows, phys_stride }; // {1, 7, 9216 }
|
||||
// if (!flag) {
|
||||
// std::cout << "CONT input shape: " << input_shape << std::endl;
|
||||
// }
|
||||
input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
|
||||
} else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous
|
||||
std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
|
||||
ov::element::Type input_type = ggml_decoder->get_input_type(name);
|
||||
size_t element_size = input_type.size();
|
||||
// ov::Shape phys_shape;
|
||||
static int iter = 0;
|
||||
if (iter++ % 2 == 0) {
|
||||
// phys_shape = {1, input_shape[1], input_stride[2] / element_size};
|
||||
input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data);
|
||||
} else {
|
||||
ov::Shape flat_shape = {1, 1, input_stride[0] / element_size};
|
||||
input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data);
|
||||
}
|
||||
// } else if (op_node_name == "CPY" && (!check_if_contiguous || input_shape[2] == 1)) { //[TODO]: Temporarily determine whether the node corresponding to the input tensor of the Phi-3 model CPY is continuous
|
||||
// } else if (op_node_name == "CPY") {
|
||||
// std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
|
||||
// ov::element::Type input_type = ggml_decoder->get_input_type(name);
|
||||
// size_t element_size = input_type.size();
|
||||
// // ov::Shape phys_shape;
|
||||
// static int iter = 0;
|
||||
// if (iter++ % 2 == 0) {
|
||||
// // phys_shape = {1, input_shape[1], input_stride[2] / element_size};
|
||||
// input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data);
|
||||
// } else {
|
||||
// ov::Shape flat_shape = {1, 1, input_stride[0] / element_size};
|
||||
// input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data);
|
||||
// }
|
||||
} else {
|
||||
input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data);
|
||||
// if(!flag) {
|
||||
// std::cout << std::left << "[ " << std::setw(2) << inp << " ]: "
|
||||
// << "Input Name: " << std::setw(20) << name
|
||||
// << "Type: " << std::setw(5) << ggml_decoder->get_input_type(name)
|
||||
// << "OP: " << std::setw(10) << op_node_name
|
||||
// << "CONT: " << check_if_contiguous
|
||||
// << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2]
|
||||
// << ", address: "
|
||||
// << std::setw(15) << input_tensor.data() << " "
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[0]
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[1]
|
||||
// << ", ne[0]: "
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right
|
||||
// << std::right
|
||||
// << std::endl;
|
||||
// }
|
||||
// if (op_node_name == "MUL_MAT") {
|
||||
// std::cout << std::left << "[ " << std::setw(2) << inp << " ]: "
|
||||
// << "Input MUL_MAT name: " << std::setw(20) << name
|
||||
// << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2]
|
||||
// << ", address: "
|
||||
// << std::setw(15) << input_tensor.data() << " "
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[0]
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[1]
|
||||
// << ", ne[0]: "
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right
|
||||
// << std::right
|
||||
// << std::endl;
|
||||
// }
|
||||
}
|
||||
|
||||
// input_tensors[name] = input_tensor;
|
||||
input_tensors.emplace_back(name, input_tensor);
|
||||
}
|
||||
// std::cout << "input_names.size(): " << input_names.size() << std::endl;
|
||||
return input_tensors;
|
||||
}
|
||||
|
||||
|
|
@ -117,7 +156,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
// Convert InputModel -> ov::Model
|
||||
std::shared_ptr<ov::Model> model = front_end->convert(input_model);
|
||||
// ov::save_model(model, "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_model.xml");
|
||||
|
||||
|
||||
// auto cloned_model = model->clone();
|
||||
// std::string model_dir = "/home/user/zhan/merge_git_commits/llama.cpp-ov";
|
||||
// auto path_base = model_dir + "/" + cloned_model->get_name();
|
||||
// // ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model);
|
||||
// ov::serialize(cloned_model, path_base + ".xml", path_base + ".bin");
|
||||
|
||||
if (!model) {
|
||||
GGML_LOG_ERROR("Model is not converted \n");
|
||||
} else {
|
||||
|
|
@ -126,9 +171,14 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
#endif
|
||||
}
|
||||
|
||||
// model = core.read_model("/home/user/zhan/merge_git_commits/llama.cpp-ov/replaceWithInputLayer_000_model.xml");
|
||||
// Loading a model to the device
|
||||
// std::cout << "Compile ..." << std::endl;
|
||||
ov::CompiledModel compiled_model = core.compile_model(model);
|
||||
// ov::save_model(compiled_model.get_runtime_model(), "/home/user/zhan/merge_git_commits/llama.cpp-ov/001_compile_model.xml");
|
||||
// std::ofstream output_file("/home/user/zhan/merge_git_commits/llama.cpp-ov/000_compile_model.xml");
|
||||
// compiled_model.export_model(output_file);
|
||||
// output_file.close();
|
||||
|
||||
// Create infer request
|
||||
ov::InferRequest infer_request = compiled_model.create_infer_request();
|
||||
|
|
@ -151,34 +201,130 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
|
|||
// std::cout << std::endl;
|
||||
}
|
||||
|
||||
// std::cout << "Infer ..." << std::endl;
|
||||
infer_request.infer();
|
||||
|
||||
// Set dst data for outputs
|
||||
auto output_names = ggml_decoder->get_output_names();
|
||||
auto output_tensors = get_ggml_graph_output_dst(ggml_decoder);
|
||||
for (size_t i = 0; i < output_names.size(); i++) {
|
||||
// std::string op_name = ggml_decoder->get_node_op_name(output_names[i]);
|
||||
auto output_tensor = infer_request.get_output_tensor(i);
|
||||
// output_tensor.get_shape();
|
||||
std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size());
|
||||
auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]);
|
||||
// std::cout << std::left << "[ " << std::setw(2) << i << " ]: "
|
||||
// << "output_names: " << std::setw(20) << output_names[i]
|
||||
// << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2]
|
||||
// << ", address: "
|
||||
// << std::setw(15) << tensor->data << " "
|
||||
// << std::setw(15) << ((float*)output_tensor.data())[0]
|
||||
// << std::setw(15) << ((float*)output_tensor.data())[1]
|
||||
// << ", ne[0]: "
|
||||
// << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0]] << std::right
|
||||
// << std::setw(15) << ((float*)output_tensor.data())[tensor->ne[0] + 1] << std::right
|
||||
// << std::right
|
||||
// << std::endl;
|
||||
// if(!flag) {
|
||||
// auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[i]);
|
||||
// std::cout << std::left << "[ " << std::setw(2) << i << " ]: "
|
||||
// << "output_names: " << std::setw(20) << output_names[i]
|
||||
// << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2]
|
||||
// << ", address: "
|
||||
// << std::setw(15) << tensor->data << " "
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor.data())[0])
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor.data())[1])
|
||||
// << ", ne[0]: "
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor.data())[tensor->ne[0]]) << std::right
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor.data())[tensor->ne[0] + 1]) << std::right
|
||||
// << std::right
|
||||
// << std::endl;
|
||||
// if (i == 19) {
|
||||
// auto output_tensor_18 = infer_request.get_output_tensor(18);
|
||||
// auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[18]);
|
||||
// std::cout << std::left << " " << std::setw(2) << 18 << " : "
|
||||
// << "output_names: " << std::setw(20) << output_names[18]
|
||||
// << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2]
|
||||
// << ", address: "
|
||||
// << std::setw(15) << tensor->data << " "
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor_18.data())[0])
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor_18.data())[1])
|
||||
// << ", ne[0]: "
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor_18.data())[tensor->ne[0]]) << std::right
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor_18.data())[tensor->ne[0] + 1]) << std::right
|
||||
// << std::right
|
||||
// << std::endl;
|
||||
// }
|
||||
// if(i == 23) {
|
||||
// auto output_tensor_15 = infer_request.get_output_tensor(15);
|
||||
// auto tensor = ggml_decoder->get_output_ggml_tensor(output_names[15]);
|
||||
// std::cout << std::left << " " << std::setw(2) << 15 << " : "
|
||||
// << "output_names: " << std::setw(20) << output_names[15]
|
||||
// << ", shape: " << std::setw(4) << tensor->ne[0] << " " << std::setw(4) << tensor->ne[1] << " " << tensor->ne[2]
|
||||
// << ", address: "
|
||||
// << std::setw(15) << tensor->data << " "
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor_15.data())[0])
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor_15.data())[1])
|
||||
// << ", ne[0]: "
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor_15.data())[tensor->ne[0]]) << std::right
|
||||
// << std::setw(15) << static_cast<float>(((float*)output_tensor_15.data())[tensor->ne[0] + 1]) << std::right
|
||||
// << std::right
|
||||
// << std::endl;
|
||||
// auto cache_k_l0_20 = ggml_decoder->get_input_names()[20];
|
||||
// // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data;
|
||||
// auto input_tensor = input_tensors.at(20).second;
|
||||
// std::cout << std::left << " " << std::setw(2) << 20 << " : "
|
||||
// << "Input Name: " << std::setw(20) << cache_k_l0_20
|
||||
// << ", shape: " << std::setw(4) << input_tensor.get_shape()[0] << " " << std::setw(4) << input_tensor.get_shape()[1] << " " << input_tensor.get_shape()[2]
|
||||
// << ", address: "
|
||||
// << std::setw(15) << input_tensor.data() << " "
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[0]
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[1]
|
||||
// << ", ne[0]: "
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0]] << std::right
|
||||
// << std::setw(15) << ((float*)input_tensor.data())[input_tensor.get_shape()[0] + 1] << std::right
|
||||
// << std::right
|
||||
// << std::endl;
|
||||
|
||||
// auto cache_k_l0_27 = ggml_decoder->get_input_names()[27];
|
||||
// // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data;
|
||||
// auto input_tensor_27 = input_tensors.at(27).second;
|
||||
// std::cout << std::left << " " << std::setw(2) << 27 << " : "
|
||||
// << "Input Name: " << std::setw(20) << cache_k_l0_27
|
||||
// << ", shape: " << std::setw(4) << input_tensor_27.get_shape()[0] << " " << std::setw(4) << input_tensor_27.get_shape()[1] << " " << input_tensor_27.get_shape()[2]
|
||||
// << ", address: "
|
||||
// << std::setw(15) << input_tensor_27.data() << " "
|
||||
// << std::setw(15) << ((float*)input_tensor_27.data())[0]
|
||||
// << std::setw(15) << ((float*)input_tensor_27.data())[1]
|
||||
// << ", ne[0]: "
|
||||
// << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0]] << std::right
|
||||
// << std::setw(15) << ((float*)input_tensor_27.data())[input_tensor_27.get_shape()[0] + 1] << std::right
|
||||
// << std::right
|
||||
// << std::endl;
|
||||
|
||||
// auto cache_k_l0_29 = ggml_decoder->get_input_names()[29];
|
||||
// // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data;
|
||||
// auto input_tensor_29 = input_tensors.at(29).second;
|
||||
// std::cout << std::left << " " << std::setw(2) << 29 << " : "
|
||||
// << "Input Name: " << std::setw(20) << cache_k_l0_29
|
||||
// << ", shape: " << std::setw(4) << input_tensor_29.get_shape()[0] << " " << std::setw(4) << input_tensor_29.get_shape()[1] << " " << input_tensor_29.get_shape()[2]
|
||||
// << ", address: "
|
||||
// << std::setw(15) << input_tensor_29.data() << " "
|
||||
// << std::setw(15) << ((float*)input_tensor_29.data())[0]
|
||||
// << std::setw(15) << ((float*)input_tensor_29.data())[1]
|
||||
// << ", ne[0]: "
|
||||
// << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0]] << std::right
|
||||
// << std::setw(15) << ((float*)input_tensor_29.data())[input_tensor_29.get_shape()[0] + 1] << std::right
|
||||
// << std::right
|
||||
// << std::endl;
|
||||
|
||||
// auto cache_k_l0_30 = ggml_decoder->get_input_names()[30];
|
||||
// // auto input_data = ggml_decoder->get_input_ggml_tensor(cache_k_l0_20)->data;
|
||||
// auto input_tensor_30 = input_tensors.at(30).second;
|
||||
// std::cout << std::left << " " << std::setw(2) << 30 << " : "
|
||||
// << "Input Name: " << std::setw(20) << cache_k_l0_30
|
||||
// << ", shape: " << std::setw(4) << input_tensor_30.get_shape()[0] << " " << std::setw(4) << input_tensor_30.get_shape()[1] << " " << input_tensor_30.get_shape()[2]
|
||||
// << ", address: "
|
||||
// << std::setw(15) << input_tensor_30.data() << " "
|
||||
// << std::setw(15) << ((float*)input_tensor_30.data())[0]
|
||||
// << std::setw(15) << ((float*)input_tensor_30.data())[1]
|
||||
// << ", ne[0]: "
|
||||
// << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0]] << std::right
|
||||
// << std::setw(15) << ((float*)input_tensor_30.data())[input_tensor_30.get_shape()[0] + 1] << std::right
|
||||
// << std::right
|
||||
// << std::endl;
|
||||
// }
|
||||
// }
|
||||
#ifdef GGML_OPENVINO_DEBUG
|
||||
printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data()));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue