From b02265a5072119cdbdb7ded26a7bb2e8dc26f273 Mon Sep 17 00:00:00 2001
From: zhanmyz
Date: Sat, 15 Mar 2025 19:32:40 +0800
Subject: [PATCH] 1. Integrate the PERMUTE node into the OV Frontend for the
 prompt-processing (first-token) stage. 2. Integrate the VIEW, CONT, and
 RESHAPE nodes into the OV Frontend for the last-token (decode) stage.

---
 ggml/src/ggml-openvino.cpp              | 242 ++++--------------------
 ggml/src/ggml-openvino/ggml-decoder.cpp |  40 ++--
 ggml/src/ggml-openvino/utils.cpp        |  25 ++-
 3 files changed, 83 insertions(+), 224 deletions(-)

diff --git a/ggml/src/ggml-openvino.cpp b/ggml/src/ggml-openvino.cpp
index d2a21511dd..fd24356412 100644
--- a/ggml/src/ggml-openvino.cpp
+++ b/ggml/src/ggml-openvino.cpp
@@ -647,168 +647,6 @@ void ggml_backend_openvino_reshape(ggml_tensor *dst) {
 }
 
 void ggml_backend_openvino_view(ggml_tensor *dst) {
-
-    /*
-    // Case 1: Set the output tensor shape as the same shape of the input tensor [1, 7, 9216], for next CONT node operator
-    if (dst->ne[0] > dst->ne[1] && (dst->ne[0] * dst->nb[0] != dst->nb[1]) && dst->ne[2] == 1) {
-        // if (dst->view_offs == 0) {
-        //     return;
-        // }
-        ov::Core core;
-        ov::Shape input_shape{ static_cast<size_t>(dst->src[0]->ne[2]), static_cast<size_t>(dst->src[0]->ne[1]), static_cast<size_t>(dst->src[0]->ne[0])};
-        ov::Shape out_shape{ static_cast<size_t>(dst->ne[2]), static_cast<size_t>(dst->ne[1]), static_cast<size_t>(dst->ne[0])};
-
-        auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
-
-        // auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
-        //                                                    ov::Shape{input_shape.size()},
-        //                                                    std::vector<int64_t>(input_shape.begin(), input_shape.end()));
-        // auto res = std::make_shared<ov::op::v1::Reshape>(input_param, new_shape_node, false);
-
-        int64_t split_addr = dst->view_offs / dst->nb[0];
-        std::vector<int64_t> begin = { 0, 0, split_addr };
-        std::vector<int64_t> end = { static_cast<int64_t>(dst->src[0]->ne[2]),
-                                     static_cast<int64_t>(dst->src[0]->ne[1]),
-                                     split_addr + static_cast<int64_t>(dst->ne[0]) };
-        std::vector<int64_t> strides = { 1, 1, 1 };
-
-        auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin);
-        auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end);
-        auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides);
-
-        std::vector<int64_t> begin_mask = {0, 0, 0};
-        std::vector<int64_t> end_mask = {0, 0, 0};
-        auto slice = std::make_shared<ov::op::v1::StridedSlice>(
-            input_param,
-            begin_const,
-            end_const,
-            strides_const,
-            begin_mask,
-            end_mask
-        );
-
-        auto model = std::make_shared<ov::Model>(ov::OutputVector{ slice },
-                                                 ov::ParameterVector{ input_param });
-
-        auto compiled_model = core.compile_model(model, "CPU");
-
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data);
-        infer_request.set_input_tensor(0, input_tensor);
-
-        ov::Tensor output_tensor(ov::element::f32, out_shape, dst->data);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-
-    /*
-    // Case 2: Slice contiguous input tensor [98304, 1, 1] to contiguout output tensor [ 21504, 1, 1]
-    if (ggml_is_contiguous(dst) && dst->ne[1] == 1 && (dst->ne[0] * dst->nb[0] == dst->nb[1])) {
-        ov::Core core;
-        ov::Shape input_shape = { static_cast<size_t>(dst->src[0]->ne[2]),
-                                  static_cast<size_t>(dst->src[0]->ne[1]),
-                                  static_cast<size_t>(dst->src[0]->ne[0])};
-        ov::Shape output_shape = { static_cast<size_t>(dst->ne[2]),
-                                   static_cast<size_t>(dst->ne[1]),
-                                   static_cast<size_t>(dst->ne[0])};
-        auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, input_shape);
-
-
-        std::vector<int64_t> begin = { 0, 0, 0 };
-        std::vector<int64_t> end = { static_cast<int64_t>(dst->ne[2]),
-                                     static_cast<int64_t>(dst->ne[1]),
-                                     static_cast<int64_t>(dst->ne[0]) };
-        std::vector<int64_t> strides = { 1, 1, 1 };
-
-        auto begin_const = ov::op::v0::Constant::create(ov::element::i64, { begin.size() }, begin);
-        auto end_const = ov::op::v0::Constant::create(ov::element::i64, { end.size() }, end);
-        auto strides_const = ov::op::v0::Constant::create(ov::element::i64, { strides.size() }, strides);
-
-        std::vector<int64_t> begin_mask = {0, 0, 0};
-        std::vector<int64_t> end_mask = {0, 0, 0};
-        auto slice = std::make_shared<ov::op::v1::StridedSlice>(
-            input_param,
-            begin_const,
-            end_const,
-            strides_const,
-            begin_mask,
-            end_mask
-        );
-
-        std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{ slice },
-                                                                       ov::ParameterVector{ input_param });
-
-        auto compiled_model = core.compile_model(model, "CPU");
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data);
-        ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data);
-        infer_request.set_input_tensor(0, input_tensor);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-    /*
-    // Case 3: Reshape the input tensor [1, 1, 98304] to output tensor [1, 3072, 32](Physical shape)
-    if (dst->ne[0] < dst->ne[1] && dst->ne[2] == 1) {
-        ov::Core core;
-        ov::Shape input_shape = { static_cast<size_t>(dst->src[0]->ne[2]),
-                                  static_cast<size_t>(dst->src[0]->ne[1]),
-                                  static_cast<size_t>(dst->src[0]->ne[0])};
-        ov::Shape output_shape = { static_cast<size_t>(dst->nb[2]),
-                                   static_cast<size_t>(dst->ne[1]),
-                                   static_cast<size_t>(dst->nb[1] / dst->nb[0])};
-        auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, input_shape);
-
-        auto new_shape_node = ov::op::v0::Constant::create(ov::element::i64,
-                                                           ov::Shape{output_shape.size()},
-                                                           std::vector<int64_t>(output_shape.begin(), output_shape.end()));
-        auto res = std::make_shared<ov::op::v1::Reshape>(input_param, new_shape_node, false);
-
-        std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{res},
-                                                                       ov::ParameterVector{input_param});
-        auto compiled_model = core.compile_model(model, "CPU");
-        ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-        ov::Tensor input_tensor(ov::element::f16, input_shape, dst->src[0]->data);
-        ov::Tensor output_tensor(ov::element::f16, output_shape, dst->data);
-        infer_request.set_input_tensor(0, input_tensor);
-        infer_request.set_output_tensor(0, output_tensor);
-
-        infer_request.infer();
-    }
-    */
-
-    /*
-    // Case 4:
-    if (dst->ne[0] != 1 && dst->ne[1] != 1 && dst->ne[2] !=1) {
-
-    }
-    */
-
-    ov::Core core;
-    ov::Shape input_shape{static_cast<size_t>(dst->src[0]->ne[2]), static_cast<size_t>(dst->src[0]->ne[1]), static_cast<size_t>(dst->src[0]->ne[0])};
-    // ov::Shape output_shape{static_cast<size_t>(dst->ne[2]), static_cast<size_t>(dst->ne[1]), static_cast<size_t>(dst->ne[0])};
-    auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
-
-    std::shared_ptr<ov::Model> model = std::make_shared<ov::Model>(ov::OutputVector{input_param},
-                                                                   ov::ParameterVector{input_param});
-    auto compiled_model = core.compile_model(model, "CPU");
-    ov::InferRequest infer_request = compiled_model.create_infer_request();
-
-    ov::Tensor input_tensor(ov::element::f32, input_shape, dst->src[0]->data);
-    // ov::Tensor output_tensor(ov::element::f32, input_shape, dst->data);
-    infer_request.set_input_tensor(0, input_tensor);
-    // infer_request.set_output_tensor(0, output_tensor);
-
-    infer_request.infer();
-
     GGML_UNUSED(dst);
 }
@@ -823,7 +661,7 @@ void ggml_backend_openvino_dup_bytes(struct ggml_tensor *dst) {
     const size_t element_size = ggml_type_size(src0->type);
 
     // Case 1: Both tensors are contiguous
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && (src0->ne[0] * element_size == src0->nb[1])) {
         ov::Shape input_shape = { static_cast<size_t>(src0->ne[2]),
                                   static_cast<size_t>(src0->ne[1]),
@@ -1152,6 +990,7 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     std::vector<int> permute_indices;
     std::vector<int> mul_mat_indices;
+    std::vector<int> add_indices;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (cgraph->nodes[i]->op == GGML_OP_CONT) {
@@ -1168,6 +1007,8 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
             permute_indices.push_back(i);
         } else if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT) {
             mul_mat_indices.push_back(i);
+        } else if (cgraph->nodes[i]->op == GGML_OP_ADD) {
+            add_indices.push_back(i);
         }
     }
 
@@ -1177,48 +1018,49 @@ static enum ggml_status ggml_backend_openvino_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     bool prompt_process_flag = true;
     if (cgraph->nodes[0]->ne[1] == 1) {
         prompt_process_flag = false;
-    }
-    // int end_node = cgraph->n_nodes - 1;
-    // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag);
-    // } else {
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
-            ggml_backend_openvino_permute(cgraph->nodes[i]);
-        // } else if (std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) != mul_mat_indices.end()) {
-        //     ggml_backend_openvino_mul_mat(cgraph->nodes[i]);
-        // } else if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
-        //     ggml_backend_openvino_view(cgraph->nodes[i]);
-        // } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
-        //     ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
-        } else if (std::find(transpose_indices.begin(), transpose_indices.end(), i) != transpose_indices.end()) {
-            ggml_backend_openvino_transpose(cgraph->nodes[i]);
-        // } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
-        //     ggml_backend_openvino_reshape(cgraph->nodes[i]);
-        } else if (std::find(cpy_indices.begin(), cpy_indices.end(), i) != cpy_indices.end()) {
-            ggml_backend_openvino_cpy(cgraph->nodes[i]);
-        } else {
-            // Process a range of nodes with openvino_frontend_compute
-            int start_index = i;
-            while (i < cgraph->n_nodes
-                   && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end()
-                   // && std::find(mul_mat_indices.begin(), mul_mat_indices.end(), i) == mul_mat_indices.end()
-                   // && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
-                   // && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
-                   // && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
-                   && std::find(transpose_indices.begin(), transpose_indices.end(), i) == transpose_indices.end()
-                   && std::find(cpy_indices.begin(), cpy_indices.end(), i) == cpy_indices.end()
-            ) {
-                i++;
+        // int end_node = cgraph->n_nodes - 1;
+        // openvino_frontend_compute(backend, cgraph, 0, end_node, prompt_process_flag);
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (std::find(view_indices.begin(), view_indices.end(), i) != view_indices.end()) {
+                ggml_backend_openvino_view(cgraph->nodes[i]);
+            } else if (std::find(cont_indices.begin(), cont_indices.end(), i) != cont_indices.end()) {
+                ggml_backend_openvino_dup_bytes(cgraph->nodes[i]);
+            } else if (std::find(reshape_indices.begin(), reshape_indices.end(), i) != reshape_indices.end()) {
+                ggml_backend_openvino_reshape(cgraph->nodes[i]);
+            } else {
+                // Process a range of nodes with openvino_frontend_compute
+                int start_index = i;
+                while (i < cgraph->n_nodes
+                       && std::find(view_indices.begin(), view_indices.end(), i) == view_indices.end()
+                       && std::find(cont_indices.begin(), cont_indices.end(), i) == cont_indices.end()
+                       && std::find(reshape_indices.begin(), reshape_indices.end(), i) == reshape_indices.end()
+                ) {
+                    i++;
+                }
+                if (start_index < i) {
+                    openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
+                }
             }
-            if (start_index < i) {
-                openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
+        }
+    } else {
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            if (std::find(permute_indices.begin(), permute_indices.end(), i) != permute_indices.end()) {
+                ggml_backend_openvino_permute(cgraph->nodes[i]);
+            } else {
+                // Process a range of nodes with openvino_frontend_compute
+                int start_index = i;
+                while (i < cgraph->n_nodes
+                       && std::find(permute_indices.begin(), permute_indices.end(), i) == permute_indices.end()
+                ) {
+                    i++;
+                }
+                if (start_index < i) {
+                    openvino_frontend_compute(backend, cgraph, start_index, --i, prompt_process_flag);
+                }
             }
         }
     }
-    // }
-
     return GGML_STATUS_SUCCESS;
 
     GGML_UNUSED(backend);
diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 4483241481..d91338127a 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -26,7 +26,9 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor*>& inputs, std::map<std::string, ggml_tensor*>& outputs) {
-    if (ggml_is_contiguous(node->src[0]) && ggml_is_contiguous(node)) {
+    if (ggml_is_contiguous(node->src[0])
+        && ggml_is_contiguous(node)
+        && (node->src[0]->ne[0] * node->src[0]->nb[0] == node->src[0]->nb[1])) {
         inputs[src0_name] = node->src[0];
         outputs[node_name] = node;
         m_input_names.push_back(src0_name);
@@ -112,22 +114,31 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor*>& inputs, std::map<std::string, ggml_tensor*>& outputs) {
         m_op_node_name = std::string(ggml_op_name(node->op));
         m_output_names.push_back(node_name);
 
-        int src0_elem_size = ggml_type_size(node->src[0]->type);
-        int src1_elem_size = ggml_type_size(node->src[1]->type);
+        // int src0_elem_size = ggml_type_size(node->src[0]->type);
+        // int src1_elem_size = ggml_type_size(node->src[1]->type);
 
-        int src0_logical_rows = node->src[0]->ne[1];
-        int src1_logical_rows = node->src[1]->ne[1];
+        // int src0_logical_rows = node->src[0]->ne[1];
+        // int src1_logical_rows = node->src[1]->ne[1];
 
-        int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size;
-        int src0_phys_rows = src0_logical_rows;
+        // int src0_phys_cols = node->src[0]->nb[0] / src0_elem_size;
+        // int src0_phys_rows = src0_logical_rows;
 
-        int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size;
-        int src1_phys_rows = src1_logical_rows;
-        ov::Shape src0_phys_shape = {1, static_cast<size_t>(src0_phys_rows), static_cast<size_t>(src0_phys_cols) };
-        ov::Shape src1_phys_shape = {1, static_cast<size_t>(src1_phys_rows), static_cast<size_t>(src1_phys_cols) };
-        auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, src0_phys_shape);
-        auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, src1_phys_shape);
+        // int src1_phys_cols = node->src[1]->nb[1] / src1_elem_size;
+        // int src1_phys_rows = src1_logical_rows;
+        // ov::Shape src0_phys_shape = {1, static_cast<size_t>(src0_phys_rows), static_cast<size_t>(src0_phys_cols) };
+        // ov::Shape src1_phys_shape = {1, static_cast<size_t>(src1_phys_rows), static_cast<size_t>(src1_phys_cols) };
+        // auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, src0_phys_shape);
+        // auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, src1_phys_shape);
+        // m_params.push_back(input0_param);
+        // m_params.push_back(input1_param);
+
+        ov::Shape input0_shape = { static_cast<size_t>(node->src[0]->ne[2]),
+                                   static_cast<size_t>(node->src[0]->ne[1]),
+                                   static_cast<size_t>(node->src[0]->ne[0])};
+        auto input0_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input0_shape);
         m_params.push_back(input0_param);
+        ov::Shape input1_shape = { 1, 1, static_cast<size_t>(node->src[1]->nb[2] / node->src[1]->nb[0])};
+        auto input1_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, input1_shape);
         m_params.push_back(input1_param);
 
         m_continuous = false;
@@ -147,7 +158,8 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, std::map<std::string, ggml_tensor*>& inputs, std::map<std::string, ggml_tensor*>& outputs) {
         // ov::Shape input_shape = { static_cast<size_t>(node->src[0]->ne[2]),
         //                           static_cast<size_t>(node->src[0]->ne[1]),
         //                           static_cast<size_t>(node->src[0]->ne[0])};
-        // auto input_param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, input_shape);
+        // auto type = get_input_type(src0_name);
+        // auto input_param = std::make_shared<ov::op::v0::Parameter>(type, input_shape);
         // m_params.push_back(input_param);
 
         // if (node->ne[0] > node->ne[1] && (node->ne[0] * node->nb[0] != node->nb[1]) && node->ne[2] == 1) {
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index a0adc917e7..b8315a0013 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -27,12 +27,12 @@ std::vector<std::pair<std::string, ov::Tensor>> get_ggml_graph_input_tensors(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
         printf("Subgraph input %d: %g\n", inp, *(double*)(input_data));
 #endif
         ov::Tensor input_tensor;
-        auto input_shape = ggml_decoder->get_input_shape(name).to_shape();
+        ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();
 
-        if (flag & op_node_name == "CONT" && input_shape[0] == 1 && input_shape[1] != 1) {
-            std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
-            ov::element::Type input_type = ggml_decoder->get_input_type(name);
-            size_t element_size = input_type.size();
+        ov::element::Type input_type = ggml_decoder->get_input_type(name);
+        size_t element_size = input_type.size();
+        std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
+        if (op_node_name == "CONT" && input_shape[0] == 1 && ((input_shape[1] != 1 && flag) || input_shape[2] * element_size != input_stride[1])) {
             const size_t num_rows = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[1]);
             const size_t dim2 = static_cast<size_t>(ggml_decoder->get_input_shape(name).to_shape()[0]);
             size_t phys_stride = static_cast<size_t>(input_stride[1]) / element_size;
@@ -42,14 +42,14 @@ std::vector<std::pair<std::string, ov::Tensor>> get_ggml_graph_input_tensors(std::shared_ptr<GgmlOvDecoder> ggml_decoder,
             std::vector<size_t> input_stride = ggml_decoder->get_input_stride(name);
             ov::element::Type input_type = ggml_decoder->get_input_type(name);
             size_t element_size = input_type.size();
-            ov::Shape phys_shape;
+            // ov::Shape phys_shape;
             static int iter = 0;
             if (iter++ % 2 == 0) {
-                phys_shape = {1, input_shape[1], input_stride[2] / element_size};
-                input_tensor = ov::Tensor(ov::element::f32, phys_shape, input_data);
+                // phys_shape = {1, input_shape[1], input_stride[2] / element_size};
+                input_tensor = ov::Tensor(ov::element::f32, input_shape, input_data);
             } else {
-                phys_shape = {1, input_shape[1], input_stride[1] / element_size};
-                input_tensor = ov::Tensor(ov::element::f16, phys_shape, input_data);
+                ov::Shape flat_shape = {1, 1, input_stride[0] / element_size};
+                input_tensor = ov::Tensor(ov::element::f16, flat_shape, input_data);
             }
         } else {
             input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), ggml_decoder->get_input_shape(name).to_shape(), input_data);
@@ -161,6 +161,11 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph,
         auto output_tensor = infer_request.get_output_tensor(i);
         // output_tensor.get_shape();
         std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size());
+        // std::cout << std::left << "[ " << std::setw(2) << i << " ]: "
+        //           << "output_names: " << std::setw(20) << output_names[i]
+        //           << " output data: " << std::setw(15) << ((float*)output_tensor.data())[0]
+        //           << std::setw(15) << ((float*)output_tensor.data())[1] << std::right
+        //           << std::endl;
 #ifdef GGML_OPENVINO_DEBUG
         printf("Output %s after: %g\n", output_names[i].c_str(), *(double*)(output_tensor.data()));
 #endif
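
Note on the contiguity guards above: both ggml_backend_openvino_dup_bytes and the decoder's first branch now spell out the row-packing requirement explicitly, taking the fast path only when the byte stride between rows equals the byte size of one row. As a standalone predicate this reads as follows (a minimal sketch; the helper name rows_are_packed is ours, not part of the patch):

    #include "ggml.h"

    // True when dimension 0 is densely packed: the stride between two
    // consecutive rows (nb[1]) equals the byte size of one row of ne[0]
    // elements, so rows carry no trailing padding.
    static bool rows_are_packed(const struct ggml_tensor * t) {
        return (size_t) t->ne[0] * t->nb[0] == t->nb[1];
    }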
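
Both branches of the reworked ggml_backend_openvino_graph_compute follow the same partitioning pattern: nodes whose op needs a dedicated handler are executed one by one, and every maximal run of the remaining nodes is handed to openvino_frontend_compute as a single inclusive range. A condensed sketch of that pattern, with handle_node and compute_range standing in for the real calls (both names are ours):

    #include <algorithm>
    #include <vector>

    // Walk the node list; `special` holds the indices that need a dedicated
    // handler. Every maximal run of non-special nodes becomes one subgraph.
    template <typename HandleNode, typename ComputeRange>
    void partition_and_dispatch(int n_nodes, const std::vector<int> & special,
                                HandleNode handle_node, ComputeRange compute_range) {
        for (int i = 0; i < n_nodes; i++) {
            if (std::find(special.begin(), special.end(), i) != special.end()) {
                handle_node(i);
            } else {
                int start = i;
                while (i < n_nodes &&
                       std::find(special.begin(), special.end(), i) == special.end()) {
                    i++;
                }
                compute_range(start, --i);  // inclusive end index, as in the patch
            }
        }
    }

In the last-token stage `special` corresponds to the VIEW, CONT, and RESHAPE indices; in the prompt-processing stage it corresponds to the PERMUTE indices.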
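
The utils.cpp changes keep the zero-copy staging of ggml buffers but switch the f16 operand to a flattened physical shape: {1, 1, nb[2] / nb[0]} in the decoder and {1, 1, input_stride[0] / element_size} at tensor-creation time. A minimal sketch of that wrapping, assuming the buffer outlives the tensor (the function name is illustrative):

    #include <openvino/openvino.hpp>

    // Wrap an existing buffer in an ov::Tensor without copying. The shape
    // describes the physical element count of the flattened view rather than
    // the logical ggml dimensions; OpenVINO reads the memory in place.
    ov::Tensor wrap_f16_flat(void * data, size_t n_elements) {
        return ov::Tensor(ov::element::f16, ov::Shape{1, 1, n_elements}, data);
    }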