draft NPU support version 2: prefill + kvcache

Yu, Zijun 2025-05-29 17:53:00 +08:00 committed by Mustafa Cavus
parent 7fec223334
commit 34531abce4
7 changed files with 212 additions and 114 deletions

View File

@@ -108,22 +108,25 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
ov::PartialShape input_shape;
if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
if (m_is_static) {
input_shape = ov::PartialShape(get_shape(src));
// if (m_is_first_token) {
// input_shape = ov::PartialShape{1, 1, m_max_token_len};
// } else {
// input_shape = ov::PartialShape{1, 1, 1};
// }
if (m_is_first_token) {
input_shape = ov::PartialShape{1, 1, m_max_token_len};
} else {
input_shape = ov::PartialShape{1, 1, 1};
}
} else {
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
}
} else if (std::string(src->name).find("KQ_mask") == 0) {
} else if (std::string(src->name) == "KQ_mask") {
if (m_is_static) {
input_shape = ov::PartialShape(get_shape(src));
if (m_is_first_token) {
input_shape = ov::PartialShape{1, m_max_token_len, m_max_token_len};
} else {
input_shape = ov::PartialShape{1, 1, m_max_token_len};
}
} else {
auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
input_shape =
ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
}
} else {
input_shape = ov::Shape{get_shape(src)};
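
With the NPU path pinned to static shapes, the commit effectively defines two graph variants per ggml graph: a prefill variant that always sees max_token_len tokens and a kvcache (single-token decode) variant. A minimal sketch of the resulting static input shapes, with pick_static_shapes as a hypothetical helper and max_token_len = 1024 purely for illustration:

#include <cstdint>
#include <openvino/core/partial_shape.hpp>

// Hypothetical helper mirroring the branch above: choose the static shapes
// for the prefill graph (first token) vs. the kvcache/decode graph.
static void pick_static_shapes(bool is_first_token, int64_t max_token_len,
                               ov::PartialShape& inp_tokens_shape,  // also used for inp_pos
                               ov::PartialShape& kq_mask_shape) {
    if (is_first_token) {
        inp_tokens_shape = ov::PartialShape{1, 1, max_token_len};             // e.g. [1, 1, 1024]
        kq_mask_shape    = ov::PartialShape{1, max_token_len, max_token_len}; // e.g. [1, 1024, 1024]
    } else {
        inp_tokens_shape = ov::PartialShape{1, 1, 1};                         // one new token per step
        kq_mask_shape    = ov::PartialShape{1, 1, max_token_len};             // mask over the whole cache
    }
}
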
@@ -208,6 +211,7 @@ void GgmlOvDecoder::set_max_token_len() {
void GgmlOvDecoder::add_extra_inputs() {
int64_t past_token_len;
// attention_size not used for NPU
int64_t attention_size;
for (const auto& node : m_nodes) {
@@ -231,8 +235,7 @@ void GgmlOvDecoder::add_extra_inputs() {
for (const auto& node : m_nodes) {
if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
attention_size = (total_token_len + 31) / 32 * 32;
attention_size = GGML_PAD(total_token_len, 32);
std::string name = "attention_size";
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
param_node->set_friendly_name(name);
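
The attention_size change is behaviour-preserving: GGML_PAD(total_token_len, 32) is assumed here to round its argument up to the next multiple of 32, which is exactly what the hand-written (total_token_len + 31) / 32 * 32 computed. A standalone check, with pad_up as a hypothetical stand-in for GGML_PAD:

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for GGML_PAD: round x up to the next multiple of n.
static int64_t pad_up(int64_t x, int64_t n) {
    return (x + n - 1) / n * n;
}

int main() {
    for (int64_t total_token_len : {1, 31, 32, 33, 100}) {
        // Old expression in add_extra_inputs() vs. the GGML_PAD form.
        assert((total_token_len + 31) / 32 * 32 == pad_up(total_token_len, 32));
    }
    return 0;
}
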

View File

@@ -92,9 +92,12 @@ public:
virtual bool is_static() const override {
return m_is_static;
}
virtual bool is_first_token() const {
virtual bool is_first_token() const override {
return m_is_first_token;
}
virtual int get_max_token_len() const override {
return m_max_token_len;
}
private:
void set_input_output(ggml_tensor* node);
@@ -106,7 +109,7 @@ private:
static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
void set_max_token_len();
int64_t m_max_token_len;
int m_max_token_len;
void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);

View File

@@ -1,5 +1,6 @@
#pragma once
#include <cstdint>
#include <map>
#include <openvino/core/node.hpp>
#include <openvino/frontend/decoder.hpp>
@@ -57,6 +58,8 @@ public:
virtual const std::vector<std::string>& get_model_output_names() const = 0;
virtual bool is_static() const = 0;
virtual bool is_first_token() const = 0;
virtual int get_max_token_len() const = 0;
};
} // namespace ggml

View File

@@ -1,5 +1,6 @@
#pragma once
#include <cstdint>
#include <openvino/frontend/node_context.hpp>
#include "decoder.hpp"
@@ -87,6 +88,12 @@ public:
bool is_static() const {
return m_decoder->is_static();
}
bool is_first_token() const {
return m_decoder->is_first_token();
}
int get_max_token_len() const {
return m_decoder->get_max_token_len();
}
private:
std::shared_ptr<GgmlDecoder> m_decoder;

View File

@@ -8,7 +8,7 @@
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/convert_like.hpp>
#include <openvino/op/convert.hpp>
#include <openvino/op/range.hpp>
#include <openvino/op/reshape.hpp>
#include <openvino/op/scatter_nd_update.hpp>
@@ -34,18 +34,26 @@ OutputVector translate_cpy(const NodeContext& context) {
auto src0 = context.get_input(0);
auto src1 = context.get_input(1);
auto past_token_len = context.get_input("past_token_len");
auto past_token_len_scalar = context.get_input("past_token_len");
src0 = std::make_shared<ov::op::v0::Convert>(src0, context.get_input_type(1));
ov::Output<Node> res;
if (context.is_static() && context.is_first_token()) {
res = src0;
return rename_outputs_with_suffix({res}, context.get_name());
}
auto src0_shape = context.get_input_shape(0).to_shape();
auto output_shape = context.get_output_shape(0).to_shape();
std::vector<size_t> input0_strides = context.get_input_stride(0);
std::vector<size_t> output_strides = context.get_output_stride(0);
auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
src0 = std::make_shared<ov::op::v1::ConvertLike>(src0, src1);
if (op_case == 1) {
// Write K to cache_k
int64_t head_size = src0_shape[2];
@@ -56,32 +64,29 @@ OutputVector translate_cpy(const NodeContext& context) {
auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(src1, reshaped_src1_shape, false);
auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0});
token_len = std::make_shared<ov::op::v1::Reshape>(token_len,
ov::op::v0::Constant::create(ov::element::i64, {0}, {}),
false);
auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
std::shared_ptr<ov::Node> indices;
if (context.is_static()) {
int32_t* op_params = context.get_input_op_params(1);
int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size;
past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
indices = past_token_len_scalar.get_node_shared_ptr();
indices = std::make_shared<ov::op::v0::Unsqueeze>(
indices,
ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{0, 1}));
} else {
auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
indices = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
total_token_len_scalar,
one_scalar,
ov::element::i64);
indices = std::make_shared<ov::op::v0::Unsqueeze>(indices, one);
}
auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
std::shared_ptr<ov::Node> indices =
std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len, one, ov::element::i64);
indices = std::make_shared<ov::op::v0::Unsqueeze>(
indices,
ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{1}));
res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
} else {
// Write V to cache_v
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1});
int64_t total_head_size = src0_shape[1];
auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
@@ -89,36 +94,6 @@ OutputVector translate_cpy(const NodeContext& context) {
auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2});
auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
if (context.is_static()) {
int32_t* op_params = context.get_input_op_params(1);
int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2];
past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
}
auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len, token_len_scalar);
// auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
// src1,
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
// false);
// auto src1_left = std::make_shared<ov::op::v8::Slice>(
// reshaped_src1,
// ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
// std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
// ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
// auto src1_right = std::make_shared<ov::op::v8::Slice>(
// reshaped_src1,
// std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
// ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
// auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
// src0,
// ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
// false);
// auto res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
// 1D tensor of shape [total_head_size], values starting from 0
auto range_row =
@@ -131,8 +106,19 @@ OutputVector translate_cpy(const NodeContext& context) {
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
// 1D tensor of shape [token_len], values starting from past_token_len
auto range_col =
std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len_scalar, one_scalar, element::i64);
std::shared_ptr<ov::Node> range_col;
if (context.is_static()) {
range_col = past_token_len_scalar.get_node_shared_ptr();
range_col = std::make_shared<ov::op::v0::Unsqueeze>(
range_col,
ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{0}));
} else {
auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
range_col = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
total_token_len_scalar,
one_scalar,
ov::element::i64);
}
auto range_col_reshaped =
std::make_shared<ov::op::v0::Unsqueeze>(range_col,
ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
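
Both the K and V branches of translate_cpy build the same token index range, past_token_len .. past_token_len + token_len - 1; in the static decode graph token_len is fixed to 1, so the Range collapses to the single position given by the past_token_len input. The K branch scatters whole rows at those positions, the V branch scatters per-row columns. A plain-C++ sketch of the K-cache update that the ScatterNDUpdate expresses (names and layout are illustrative only):

#include <cstdint>
#include <vector>

// Illustrative only: emulate the K-cache ScatterNDUpdate on host memory.
// cache_k is laid out as [max_tokens][row_width] with row_width = num_heads * head_size.
static void write_k_to_cache(std::vector<float>& cache_k, std::size_t row_width,
                             const std::vector<float>& new_k, // [token_len][row_width]
                             int64_t past_token_len, int64_t token_len) {
    for (int64_t t = 0; t < token_len; ++t) {
        // indices = Range(past_token_len, past_token_len + token_len) in the dynamic graph;
        // in the static decode graph token_len == 1 and the index is just past_token_len.
        const std::size_t row = static_cast<std::size_t>(past_token_len + t);
        for (std::size_t c = 0; c < row_width; ++c) {
            cache_k[row * row_width + c] = new_k[static_cast<std::size_t>(t) * row_width + c];
        }
    }
}
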

View File

@@ -1,5 +1,7 @@
#include "utils.h"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
@@ -13,6 +15,7 @@
#include <openvino/runtime/intel_npu/properties.hpp>
#include <openvino/runtime/tensor.hpp>
#include <unordered_map>
#include <vector>
#include "ggml-impl.h"
#include "ggml.h"
@@ -52,7 +55,6 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
static ov::Core core;
static bool is_first_token = true;
static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
if (device.empty()) {
@@ -66,12 +68,16 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
bool is_static = device == "NPU" ? true : false;
ov::AnyMap config;
if (is_static) {
if (device == "NPU") {
config = {
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"},
{"NPU_USE_NPUW", "YES"},
{"NPUW_DEVICES", "NPU"},
{"NPUW_FOLD", "YES"},
{"NPUW_DQ", "YES"},
{"NPUW_FUNCALL_ASYNC", "YES"},
{"NPUW_HOST_GATHER", "YES"},
{"NPUW_WEIGHTS_BANK", "shared"},
// {"NPU_COMPILER_TYPE", "MLIR"},
};
}
@@ -83,69 +89,128 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
core.set_property(ov::cache_dir(cache_dir));
}
// For CPU and GPU, there is only one compiled model, so only use the first element of the pair
// For NPU, there are prefill model and kvcache model (This is the ideal approach, but not implemented yet,
// currently recompile for every token)
using CachedItem = std::pair<std::shared_ptr<ov::Model>, std::pair<ov::CompiledModel, ov::CompiledModel>>;
static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache;
// CPU and GPU will only use cache_prefill
using CachedItem = std::pair<std::shared_ptr<ov::Model>, ov::CompiledModel>;
static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache_prefill;
static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache_kvcache;
std::shared_ptr<GgmlOvDecoder> ggml_decoder;
std::shared_ptr<ov::Model> model;
ov::CompiledModel compiled_model_prefill;
ov::CompiledModel compiled_model_kvcache;
ov::CompiledModel compiled_model;
int64_t decoder_end_time;
int64_t conversion_end_time;
int64_t compile_end_time;
auto ggml_decoder = get_ggml_decoder(cgraph, is_static, is_first_token);
decoder_end_time = ggml_time_us();
auto it = compiled_cache_prefill.find(cgraph);
bool is_first_token = it == compiled_cache_prefill.end();
if (!is_first_token) {
ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
decoder_end_time = ggml_time_us();
auto it = compiled_cache.find(cgraph);
if (it != compiled_cache.end() && !is_static) {
model = it->second.first;
conversion_end_time = ggml_time_us();
compiled_model_prefill = it->second.second.first;
compiled_model_kvcache = it->second.second.second;
compile_end_time = ggml_time_us();
} else {
ov::frontend::InputModel::Ptr input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
model = ov::frontend::ggml::FrontEnd::convert(input_model);
conversion_end_time = ggml_time_us();
if (getenv("GGML_OPENVINO_DUMP_IR")) {
char timestamped_filename[64];
auto timestamp = (long long)ggml_time_us();
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
ov::serialize(model, timestamped_filename);
if (is_static) {
model = compiled_cache_kvcache[cgraph].first;
compiled_model = compiled_cache_kvcache[cgraph].second;
} else {
model = it->second.first;
compiled_model = it->second.second;
}
compiled_model_prefill = core.compile_model(model, device, config);
compile_end_time = ggml_time_us();
compiled_cache[cgraph] = std::make_pair(model, std::make_pair(compiled_model_prefill, compiled_model_kvcache));
}
ov::InferRequest infer_request;
if (!is_static) {
infer_request = compiled_model_prefill.create_infer_request();
conversion_end_time = ggml_time_us();
compile_end_time = conversion_end_time;
} else {
infer_request = compiled_model_prefill.create_infer_request();
// if (is_first_token) {
// infer_request = compiled_model_prefill.create_infer_request();
// } else {
// infer_request = compiled_model_kvcache.create_infer_request();
// }
if (is_static) {
ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false);
decoder_end_time = ggml_time_us();
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
auto input_model_kvcache = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_kvcache);
model = ov::frontend::ggml::FrontEnd::convert(input_model);
auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
conversion_end_time = ggml_time_us();
compiled_model = core.compile_model(model, device, config);
auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config);
compile_end_time = ggml_time_us();
compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model);
compiled_cache_kvcache[cgraph] = std::make_pair(model_kvcache, compiled_model_kvcache);
if (getenv("GGML_OPENVINO_DUMP_IR")) {
char timestamped_filename[64];
auto timestamp = (long long)ggml_time_us();
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp);
ov::serialize(model, timestamped_filename);
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_kvcache_%lld.xml", timestamp);
ov::serialize(model_kvcache, timestamped_filename);
}
} else {
ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
decoder_end_time = ggml_time_us();
auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
model = ov::frontend::ggml::FrontEnd::convert(input_model);
conversion_end_time = ggml_time_us();
compiled_model = core.compile_model(model, device, config);
compile_end_time = ggml_time_us();
compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model);
if (getenv("GGML_OPENVINO_DUMP_IR")) {
char timestamped_filename[64];
auto timestamp = (long long)ggml_time_us();
snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp);
ov::serialize(model, timestamped_filename);
}
}
}
auto infer_request = compiled_model.create_infer_request();
auto ov_params = model->get_parameters();
for (size_t i = 0; i < ov_params.size(); i++) {
auto param_name = ov_params[i]->get_friendly_name();
ov::Tensor input_tensor;
if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
} else {
} else if (!is_static) {
input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
} else {
if (param_name == "inp_tokens" || param_name == "inp_pos") {
if (is_first_token) {
size_t max_token_len = ggml_decoder->get_max_token_len();
const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
std::vector<int32_t> padded_data = pad_input<int32_t>(input_tensor_ggml, 1, max_token_len, 0);
input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len});
auto* data_ptr = input_tensor.data<int32_t>();
std::copy(padded_data.begin(), padded_data.end(), data_ptr);
} else {
input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
}
} else if (param_name == "KQ_mask") {
size_t max_token_len = ggml_decoder->get_max_token_len();
const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
if (is_first_token) {
std::vector<float> padded_data =
pad_input<float>(input_tensor_ggml, max_token_len, max_token_len, -INFINITY);
set_zero_diagonal(padded_data, max_token_len);
input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, max_token_len, max_token_len});
auto* data_ptr = input_tensor.data<float>();
std::copy(padded_data.begin(), padded_data.end(), data_ptr);
} else {
std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, 1, max_token_len, -INFINITY);
input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len});
auto* data_ptr = input_tensor.data<float>();
std::copy(padded_data.begin(), padded_data.end(), data_ptr);
}
} else {
input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
}
}
infer_request.set_input_tensor(i, input_tensor);
@@ -234,3 +299,9 @@ void print_output_tensor_info(const std::string& name,
break;
}
}
void set_zero_diagonal(std::vector<float>& matrix, size_t dim) {
for (size_t i = 0; i < dim; ++i) {
matrix[i * dim + i] = 0.0f;
}
}
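
The caching scheme in openvino_frontend_compute now keys two maps on the ggml graph pointer: the first call for a graph builds and compiles the prefill model (and, on NPU, the single-token kvcache model as well), and later calls reuse the cached compiled models instead of reconverting. A compact sketch of that control flow, assuming a hypothetical compile callback that stands in for decoder construction, frontend conversion, and core.compile_model:

#include <functional>
#include <unordered_map>
#include <openvino/runtime/compiled_model.hpp>

struct ggml_cgraph; // opaque here; only used as a cache key

// Illustrative only: first-token detection and prefill/kvcache model selection.
inline ov::CompiledModel pick_compiled_model(
    struct ggml_cgraph* cgraph, bool is_static,
    const std::function<ov::CompiledModel(bool /*is_prefill*/)>& compile) {
    static std::unordered_map<struct ggml_cgraph*, ov::CompiledModel> cache_prefill;
    static std::unordered_map<struct ggml_cgraph*, ov::CompiledModel> cache_kvcache;

    const bool is_first_token = cache_prefill.find(cgraph) == cache_prefill.end();
    if (is_first_token) {
        cache_prefill[cgraph] = compile(/*is_prefill=*/true);      // prefill graph
        if (is_static) {
            cache_kvcache[cgraph] = compile(/*is_prefill=*/false); // single-token decode graph
        }
        return cache_prefill[cgraph];                              // prompt goes through prefill
    }
    // Subsequent tokens: NPU switches to the kvcache graph, CPU/GPU reuse the dynamic-shape graph.
    return is_static ? cache_kvcache[cgraph] : cache_prefill[cgraph];
}
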

View File

@@ -1,12 +1,37 @@
#include <algorithm>
#include "ggml-backend-impl.h"
#include "ggml-decoder.h"
enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, std::string& name);
std::map<std::string, void*> get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
size_t checksum(const void* data, size_t size);
void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
void print_output_tensor_info(const std::string& name,
const ov::Tensor& tensor,
std::map<std::string, void*>& output_dst);
std::map<std::string, void*>& output_dst);
template <typename T>
std::vector<T> pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
std::vector<T> padded_data(padded_rows * padded_cols, pad_value);
size_t rows = tensor->ne[1];
size_t cols = tensor->ne[0];
T* data = static_cast<T*>(tensor->data);
for (size_t i = 0; i < std::min(rows, padded_rows); ++i) {
for (size_t j = 0; j < std::min(cols, padded_cols); ++j) {
padded_data[i * padded_cols + j] = data[i * cols + j];
}
}
return padded_data;
}
void set_zero_diagonal(std::vector<float>& matrix, size_t dim);
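
A sketch of how pad_input and set_zero_diagonal combine in the prefill path of openvino_frontend_compute: the ggml KQ_mask is padded up to the static [max_token_len, max_token_len] shape with -INFINITY, the diagonal is zeroed so every padded row keeps at least one finite entry, and the result is copied into a statically shaped ov::Tensor. make_prefill_mask is a hypothetical wrapper, not part of the patch:

#include <cmath>
#include <cstring>
#include <vector>
#include <openvino/runtime/tensor.hpp>

// Illustrative usage only; 'mask' stands in for the ggml KQ_mask tensor and
// max_token_len for ggml_decoder->get_max_token_len().
inline ov::Tensor make_prefill_mask(const ggml_tensor* mask, size_t max_token_len) {
    std::vector<float> padded = pad_input<float>(mask, max_token_len, max_token_len, -INFINITY);
    set_zero_diagonal(padded, max_token_len);

    ov::Tensor t(ov::element::f32, ov::Shape{1, max_token_len, max_token_len});
    std::memcpy(t.data<float>(), padded.data(), padded.size() * sizeof(float));
    return t;
}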