Change output for infer request to set output tensor. Support scale, view op.

This commit is contained in:
yumengbo 2024-12-06 07:37:58 +08:00 committed by Mustafa Cavus
parent 31bd816426
commit 5b46dc23be
4 changed files with 78 additions and 37 deletions

View File

@ -10,13 +10,21 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr
// Unary OPs // Unary OPs
case GGML_OP_UNARY: case GGML_OP_UNARY:
case GGML_OP_RESHAPE: case GGML_OP_RESHAPE:
case GGML_OP_VIEW: case GGML_OP_TRANSPOSE:
case GGML_OP_PERMUTE:
case GGML_OP_CONT:
case GGML_OP_CPY:
case GGML_OP_RMS_NORM:
{ {
m_inputs.push_back(m_node->src[0]); m_inputs.push_back(m_node->src[0]);
m_outputs.push_back(m_node); m_outputs.push_back(m_node);
#ifdef GGML_OPENVINO_DEBUG break;
GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); }
#endif // For view, input is m_node itself
case GGML_OP_VIEW:
{
m_inputs.push_back(m_node);
m_outputs.push_back(m_node);
break; break;
} }
// SCALE // SCALE
@ -24,12 +32,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr
{ {
m_inputs.push_back(m_node->src[0]); m_inputs.push_back(m_node->src[0]);
m_outputs.push_back(m_node); m_outputs.push_back(m_node);
#ifdef GGML_OPENVINO_DEBUG
float v;
memcpy(&v, m_node->op_params, sizeof(float));
GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data));
GGML_LOG_INFO("Scale: %f \n", v);
#endif
break; break;
} }
// OPs with 2 inputs // OPs with 2 inputs
@ -39,14 +41,20 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor * node, struct ggml_cgraph * cgr
case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT:
case GGML_OP_SUB: case GGML_OP_SUB:
case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS:
case GGML_OP_SOFT_MAX:
{ {
m_inputs.push_back(m_node->src[0]); m_inputs.push_back(m_node->src[0]);
m_inputs.push_back(m_node->src[1]); m_inputs.push_back(m_node->src[1]);
m_outputs.push_back(m_node); m_outputs.push_back(m_node);
#ifdef GGML_OPENVINO_DEBUG break;
GGML_LOG_INFO("Decoder input 0: %f \n", *(float*)(m_node->src[0]->data)); }
GGML_LOG_INFO("Decoder input 1: %f \n", *(float*)(m_node->src[1]->data)); // OPs with 3 inputs:
#endif case GGML_OP_ROPE:
{
m_inputs.push_back(m_node->src[0]);
m_inputs.push_back(m_node->src[1]);
m_inputs.push_back(m_node->src[2]); // ???
m_outputs.push_back(m_node);
break; break;
} }
default: default:
@ -130,7 +138,6 @@ ov::PartialShape GgmlOvDecoder::get_output_shape(size_t index) const {
ov::element::Type GgmlOvDecoder::get_output_type(size_t index) const { ov::element::Type GgmlOvDecoder::get_output_type(size_t index) const {
// TODO: Change to Output // TODO: Change to Output
ov::element::Type type = ov::element::dynamic; ov::element::Type type = ov::element::dynamic;
// GGML_LOG_DEBUG("%d\n", m_outputs[index]->type);
switch (m_outputs[index]->type) { switch (m_outputs[index]->type) {
case GGML_TYPE_F32: case GGML_TYPE_F32:
type = ov::element::f32; type = ov::element::f32;
@ -179,6 +186,8 @@ const std::string& GgmlOvDecoder::get_op_type() const {
{GGML_OP_ACC, "GGML_OP_ACC"}, {GGML_OP_ACC, "GGML_OP_ACC"},
{GGML_OP_ADD, "GGML_OP_ADD"}, {GGML_OP_ADD, "GGML_OP_ADD"},
{GGML_OP_ADD1, "GGML_OP_ADD1"}, {GGML_OP_ADD1, "GGML_OP_ADD1"},
{GGML_OP_CONT, "GGML_OP_CONT"},
{GGML_OP_CPY, "GGML_OP_CPY"},
{GGML_OP_DIV, "GGML_OP_DIV"}, {GGML_OP_DIV, "GGML_OP_DIV"},
{GGML_OP_DUP, "GGML_OP_DUP"}, {GGML_OP_DUP, "GGML_OP_DUP"},
{GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"}, {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS"},
@ -186,8 +195,12 @@ const std::string& GgmlOvDecoder::get_op_type() const {
{GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"}, {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT"},
{GGML_OP_PERMUTE, "GGML_OP_PERMUTE"}, {GGML_OP_PERMUTE, "GGML_OP_PERMUTE"},
{GGML_OP_RESHAPE, "GGML_OP_RESHAPE"}, {GGML_OP_RESHAPE, "GGML_OP_RESHAPE"},
{GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM"},
{GGML_OP_ROPE, "GGML_OP_ROPE"},
{GGML_OP_SCALE, "GGML_OP_SCALE"}, {GGML_OP_SCALE, "GGML_OP_SCALE"},
{GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX"},
{GGML_OP_SUB, "GGML_OP_SUB"}, {GGML_OP_SUB, "GGML_OP_SUB"},
{GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE"},
{GGML_OP_UNARY, "GGML_OP_UNARY"}, {GGML_OP_UNARY, "GGML_OP_UNARY"},
{GGML_OP_VIEW, "GGML_OP_VIEW"} {GGML_OP_VIEW, "GGML_OP_VIEW"}
}; };

View File

@ -59,6 +59,10 @@ public:
return m_inputs[index]; return m_inputs[index];
} }
// Accessor for the ggml tensor recorded as output number `index` of this decoder.
// Uses plain operator[] on m_outputs; `index` is not validated here, so callers
// are expected to stay below get_output_size().
const ggml_tensor* get_output_ggml_tensor(size_t index) const {
    const ggml_tensor* output = m_outputs[index];
    return output;
}
// virtual const std::vector<size_t>& outputs() const override; // virtual const std::vector<size_t>& outputs() const override;
// virtual size_t output(size_t index) const override; // virtual size_t output(size_t index) const override;

View File

@ -15,16 +15,17 @@ GgmlOvGraphIterator::GgmlOvGraphIterator(struct ggml_cgraph * cgraph)
#endif #endif
} }
void GgmlOvGraphIterator::initialize_decoders() { void GgmlOvGraphIterator::initialize_decoders() {
auto nodes_size = m_cgraph->n_nodes; auto nodes_size = m_cgraph->n_nodes;
// Initialize decoder for each node // Initialize decoder for each node
// m_decoders.resize(static_cast<size_t>(nodes_size)); // m_decoders.resize(static_cast<size_t>(nodes_size));
for (int i = 0; i < nodes_size; ++i) { for (int i = 0; i < nodes_size; ++i) {
// Skip View Op // Skip View Op
if (m_cgraph->nodes[i] ->op == GGML_OP_VIEW || m_cgraph->nodes[i] ->op == GGML_OP_PERMUTE) { // if (m_cgraph->nodes[i] ->op == GGML_OP_PERMUTE
continue; // || m_cgraph->nodes[i] ->op == GGML_OP_CPY ) {
} // continue;
// }
auto decoder = std::make_shared<GgmlOvDecoder>(m_cgraph->nodes[i], m_cgraph); auto decoder = std::make_shared<GgmlOvDecoder>(m_cgraph->nodes[i], m_cgraph);
m_decoders.push_back(decoder); m_decoders.push_back(decoder);
for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) {
@ -33,9 +34,9 @@ GgmlOvGraphIterator::GgmlOvGraphIterator(struct ggml_cgraph * cgraph)
// } // }
} }
for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) { for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) {
if (i == nodes_size - 1 || decoder->is_graph_output(inp)) { // if (i == nodes_size - 1 || decoder->is_graph_output(inp)) {
m_output_names.push_back(decoder->get_output_name(inp)); m_output_names.push_back(decoder->get_output_name(inp));
} // }
} }
} }
@ -71,20 +72,20 @@ std::vector<std::string> GgmlOvGraphIterator::get_output_names() const {
void GgmlOvGraphIterator::dump_graph_iterator() const { void GgmlOvGraphIterator::dump_graph_iterator() const {
for (size_t i = 0; i < m_decoders.size(); ++i) { for (size_t i = 0; i < m_decoders.size(); ++i) {
GGML_LOG_INFO("OP %zu: %s\n", i, m_decoders[i]->get_op_name().c_str()); GGML_LOG_INFO("\nOP %zu: %s\n", i, m_decoders[i]->get_op_name().c_str());
for (size_t inp = 0; inp < m_decoders[i]->get_input_size(); ++inp) { for (size_t inp = 0; inp < m_decoders[i]->get_input_size(); ++inp) {
ov::PartialShape pshape = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_input_shape(inp); ov::PartialShape pshape = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_input_shape(inp);
ov::element::Type ptype = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_input_type(inp); ov::element::Type ptype = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_input_type(inp);
GGML_LOG_INFO("Input name: %s\n", std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_input_name(inp).c_str()); GGML_LOG_INFO("- Input name: %s\n", std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_input_name(inp).c_str());
GGML_LOG_INFO("Input shape: %s\n", pshape.to_string().c_str()); GGML_LOG_INFO(" Input shape: %s\n", pshape.to_string().c_str());
GGML_LOG_INFO("Input type: %s\n", ptype.to_string().c_str()); GGML_LOG_INFO(" Input type: %s\n", ptype.to_string().c_str());
} }
for (size_t outp = 0; outp < std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_size(); ++outp) { for (size_t outp = 0; outp < std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_size(); ++outp) {
ov::PartialShape pshape = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_shape(outp); ov::PartialShape pshape = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_shape(outp);
ov::element::Type ptype = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_type(outp); ov::element::Type ptype = std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_type(outp);
GGML_LOG_INFO("Output name: %s\n", std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_name(outp).c_str()); GGML_LOG_INFO("- Output name: %s\n", std::dynamic_pointer_cast<GgmlOvDecoder>(m_decoders[i])->get_output_name(outp).c_str());
GGML_LOG_INFO("Output shape: %s\n", pshape.to_string().c_str()); GGML_LOG_INFO(" Output shape: %s\n", pshape.to_string().c_str());
GGML_LOG_INFO("Output type: %s\n", ptype.to_string().c_str()); GGML_LOG_INFO(" Output type: %s\n", ptype.to_string().c_str());
} }
} }

View File

@ -18,6 +18,9 @@ std::map<std::string, ov::Tensor> get_ggml_graph_input_tensors(std::shared_ptr<G
for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) { for (size_t inp = 0; inp < decoder->get_input_size(); ++inp) {
if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) { if (std::find(input_names.begin(), input_names.end(), decoder->get_input_name(inp)) != input_names.end()) {
auto input_data = decoder->get_input_ggml_tensor(inp)->data; auto input_data = decoder->get_input_ggml_tensor(inp)->data;
#ifdef GGML_OPENVINO_DEBUG
printf("Subgraph input %d: %g\n", inp, *(double*)(input_data));
#endif
ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data); ov::Tensor input_tensor = ov::Tensor(decoder->get_input_type(inp), decoder->get_input_shape(inp).to_shape(), input_data);
input_tensors[decoder->get_input_name(inp)] = input_tensor; input_tensors[decoder->get_input_name(inp)] = input_tensor;
} }
@ -26,6 +29,27 @@ std::map<std::string, ov::Tensor> get_ggml_graph_input_tensors(std::shared_ptr<G
return input_tensors; return input_tensors;
} }
std::map<std::string, ov::Tensor> get_ggml_graph_output_tensors(std::shared_ptr<GgmlOvGraphIterator> ggml_graph_iterator) {
std::map<std::string, ov::Tensor> output_tensors;
auto output_names = ggml_graph_iterator->get_output_names();
ggml_graph_iterator->reset();
for (; !ggml_graph_iterator->is_end(); ggml_graph_iterator->next()) {
auto decoder = std::dynamic_pointer_cast<GgmlOvDecoder>(ggml_graph_iterator->get_decoder());
for (size_t inp = 0; inp < decoder->get_output_size(); ++inp) {
if (std::find(output_names.begin(), output_names.end(), decoder->get_output_name(inp)) != output_names.end()) {
auto output_data = decoder->get_output_ggml_tensor(inp)->data;
#ifdef GGML_OPENVINO_DEBUG
printf("Output %d: %g\n", inp, *(double*)(output_data));
#endif
ov::Tensor output_tensor = ov::Tensor(decoder->get_output_type(inp), decoder->get_output_shape(inp).to_shape(), output_data);
output_tensors[decoder->get_output_name(inp)] = output_tensor;
}
}
}
return output_tensors;
}
static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
ov::frontend::FrontEnd::Ptr front_end = nullptr; ov::frontend::FrontEnd::Ptr front_end = nullptr;
auto fem = ov::frontend::FrontEndManager(); auto fem = ov::frontend::FrontEndManager();
@ -92,16 +116,15 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
infer_request.set_input_tensor(i, input_tensors[input_names[i]]); infer_request.set_input_tensor(i, input_tensors[input_names[i]]);
} }
infer_request.infer(); // Set output tensor
ov::Tensor output_tensor = infer_request.get_output_tensor(); auto output_names = ggml_graph_iterator->get_output_names();
// Put data in output tensor to the last node -> data in cgraph auto output_tensors = get_ggml_graph_output_tensors(ggml_graph_iterator);
// Get output type for (size_t i = 0; i < output_names.size(); i++) {
ggml_tensor* dst = cgraph->nodes[cgraph->n_nodes - 1]; infer_request.set_output_tensor(i, output_tensors[output_names[i]]);
std::memcpy(dst->data, output_tensor.data(), output_tensor.get_byte_size()); }
#ifdef GGML_OPENVINO_DEBUG
GGML_LOG_INFO("Output: %f\n", *output_tensor.data<float>()); infer_request.infer();
#endif
return GGML_STATUS_SUCCESS; return GGML_STATUS_SUCCESS;
GGML_UNUSED(backend); GGML_UNUSED(backend);