From 4c582ac7a313a27639ab1b06d590b4b80b565864 Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Thu, 26 Jun 2025 13:54:06 +0800
Subject: [PATCH] Stateful transformation for CPU and GPU

---
 ggml/src/ggml-openvino/ggml-decoder.cpp     | 104 +++++++++++-------
 ggml/src/ggml-openvino/ggml-decoder.h       |  40 ++++---
 ggml/src/ggml-openvino/openvino/decoder.hpp |   6 +
 ggml/src/ggml-openvino/openvino/op/cpy.cpp  |  13 +--
 .../openvino/translate_session.cpp          |  69 +++++++++---
 .../openvino/translate_session.hpp          |   2 +-
 ggml/src/ggml-openvino/utils.cpp            | 101 ++++++++++-------
 7 files changed, 215 insertions(+), 120 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 04f68a4950..e30f026e36 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -26,12 +26,13 @@
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
 
-GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token)
-    : m_cgraph(cgraph),
-      m_node(node),
-      m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
-      m_is_static(is_static),
-      m_is_first_token(is_first_token) {
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static,
+                             bool is_first_token) :
+    m_cgraph(cgraph),
+    m_node(node),
+    m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
+    m_is_static(is_static),
+    m_is_first_token(is_first_token) {
     static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
 
     if (m_node) {
@@ -44,10 +45,11 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgrap
     }
 
     if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-        dump_cgraph(m_cgraph);
+        std::string filename = "cgraph.txt";
+        dump_cgraph(m_cgraph, filename);
     }
 
-    set_max_token_len();
+    set_llm_params();
 
     static bool weight_created = false;
     if (!weight_created) {
@@ -105,33 +107,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
             if (m_model_inputs.find(src_name) != m_model_inputs.end()) {
                 continue;
             }
-            ov::PartialShape input_shape;
-            if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
-                if (m_is_static) {
-                    if (m_is_first_token) {
-                        input_shape = ov::PartialShape{1, 1, m_max_token_len};
-                    } else {
-                        input_shape = ov::PartialShape{1, 1, 1};
-                    }
-                } else {
-                    input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
-                }
-            } else if (std::string(src->name) == "KQ_mask") {
-                if (m_is_static) {
-                    if (m_is_first_token) {
-                        input_shape = ov::PartialShape{1, m_max_token_len, m_max_token_len};
-                    } else {
-                        input_shape = ov::PartialShape{1, 1, m_max_token_len};
-                    }
-                } else {
-                    auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
-                    input_shape =
-                        ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
-                }
-            } else {
-                input_shape = ov::Shape{get_shape(src)};
-            }
-            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), input_shape);
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), get_graph_input_shape(src));
             param_node->set_friendly_name(src_name);
             m_model_inputs[src_name] = param_node;
         }
@@ -150,6 +126,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
             auto it = std::find(m_model_output_names.begin(), m_model_output_names.end(), name);
             if (it == m_model_output_names.end()) {
                 m_model_output_names.push_back(name);
+                m_kv_names.push_back(name);
             }
         }
     }
@@ -213,17 +190,54 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
     }
 }
 
-void GgmlOvDecoder::set_max_token_len() {
+void GgmlOvDecoder::set_llm_params() {
     for (int i = 0; i < m_cgraph->n_nodes; i++) {
         auto* node = m_cgraph->nodes[i];
-        if (std::string(node->name) == "cache_k_l0 (view)") {
+        if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") {
             auto* cache_k = node->src[0];
             m_max_token_len = cache_k->ne[1];
-            break;
+        } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Qcur-0") {
+            m_head_size = node->ne[0];
+            m_num_heads = node->ne[1];
+        } else if (node->op == GGML_OP_ROPE && std::string(node->name) == "Kcur-0") {
+            m_num_heads_kv = node->ne[1];
         }
     }
 }
 
+ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
+    ov::PartialShape input_shape;
+    if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
+        if (m_is_static) {
+            if (m_is_first_token) {
+                input_shape = ov::PartialShape{ 1, 1, m_max_token_len };
+            } else {
+                input_shape = ov::PartialShape{ 1, 1, 1 };
+            }
+        } else {
+            input_shape = ov::PartialShape{ 1, 1, ov::Dimension(1, m_max_token_len) };
+        }
+    } else if (std::string(src->name) == "KQ_mask") {
+        if (m_is_static) {
+            if (m_is_first_token) {
+                input_shape = ov::PartialShape{ 1, m_max_token_len, m_max_token_len };
+            } else {
+                input_shape = ov::PartialShape{ 1, 1, m_max_token_len };
+            }
+        } else {
+            auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
+            input_shape = ov::PartialShape{ 1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size) };
+        }
+    } else if (std::string(src->name).find("cache_k") == 0) {
+        input_shape = ov::PartialShape{ m_max_token_len, m_num_heads_kv, m_head_size };
+    } else if (std::string(src->name).find("cache_v") == 0) {
+        input_shape = ov::PartialShape{ m_num_heads_kv, m_head_size, m_max_token_len };
+    } else {
+        input_shape = ov::PartialShape{ get_shape(src) };
+    }
+    return input_shape;
+}
+
 void GgmlOvDecoder::add_extra_inputs() {
     int64_t past_token_len = -1;
     // attention_size not used for NPU
@@ -267,6 +281,16 @@ void GgmlOvDecoder::add_extra_inputs() {
     }
 }
 
+std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const {
+    std::map<std::string, std::string> kv_param_res_names;
+    for (const auto& name : m_kv_names) {
+        if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
+            kv_param_res_names[name] = name;
+        }
+    }
+    return kv_param_res_names;
+}
+
 void GgmlOvDecoder::add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
     static std::mutex weights_mutex;
     auto* nodes = m_cgraph->nodes;
@@ -344,8 +368,8 @@ std::shared_ptr<ov::Node> GgmlOvDecoder::create_weight_node(ggml_tensor* tensor)
     return weight_node;
 }
 
-void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph) {
-    std::ofstream file("cgraph.txt");
+void GgmlOvDecoder::dump_cgraph(const struct ggml_cgraph* cgraph, const std::string& filename) {
+    std::ofstream file(filename);
     if (!file.is_open()) {
         std::cerr << "Failed to open file" << std::endl;
         return;
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index b6b13d1f11..6d3f24b093 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -3,6 +3,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ggml.h"
@@ -89,28 +90,34 @@ public:
         return m_model_output_names;
     }
 
-    virtual bool is_static() const override {
-        return m_is_static;
-    }
-    virtual bool is_first_token() const override {
-        return m_is_first_token;
-    }
-    virtual int get_max_token_len() const override {
-        return m_max_token_len;
-    }
+    virtual int get_max_token_len() const override { return m_max_token_len; }
+
+    virtual int get_num_heads() const override { return m_num_heads; }
+
+    virtual int get_num_heads_kv() const override { return m_num_heads_kv; }
+
+    virtual int get_head_size() const override { return m_head_size; }
+
+    virtual std::map<std::string, std::string> get_kv_param_res_names() const override;
+
+    virtual bool is_static() const override { return m_is_static; }
+
+    virtual bool is_first_token() const override { return m_is_first_token; }
+
+    ov::PartialShape get_graph_input_shape(const ggml_tensor* src) const;
 
 private:
     void set_input_output(ggml_tensor* node);
     void add_extra_inputs();
-    static void dump_cgraph(const struct ggml_cgraph* cgraph);
+    static void dump_cgraph(const struct ggml_cgraph* cgraph, const std::string& filename);
     static std::vector get_shape(const ggml_tensor* tensor);
     static std::vector get_stride(const ggml_tensor* tensor);
     static ov::element::Type get_ov_type(const ggml_tensor* tensor);
+
+    // set max_token_len, num_heads, etc.
+    void set_llm_params();
+
     static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
-
-    void set_max_token_len();
-    int m_max_token_len;
-
     void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
 
     struct ggml_cgraph* m_cgraph;
@@ -129,6 +136,11 @@ private:
     std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
     std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
     std::vector<std::string> m_model_output_names;
+    int m_max_token_len;
+    int m_num_heads;
+    int m_num_heads_kv;
+    int m_head_size;
+    std::vector<std::string> m_kv_names;
     bool m_is_static;
     bool m_is_first_token;
 };
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index 6212568399..3105d0f16f 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 
 namespace ov {
 namespace frontend {
@@ -57,6 +58,11 @@ public:
     virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
     virtual const std::vector<std::string>& get_model_output_names() const = 0;
 
+    virtual int get_num_heads() const = 0;
+    virtual int get_num_heads_kv() const = 0;
+    virtual int get_head_size() const = 0;
+    virtual std::map<std::string, std::string> get_kv_param_res_names() const = 0;
+
     virtual bool is_static() const = 0;
     virtual bool is_first_token() const = 0;
     virtual int get_max_token_len() const = 0;
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index d27f4babb4..b183b97f23 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -57,13 +58,6 @@ OutputVector translate_cpy(const NodeContext& context) {
 
     if (op_case == 1) {
         // Write K to cache_k
-        int64_t head_size = src0_shape[2];
-        int64_t num_heads = src0_shape[1];
-
-        auto reshaped_src1_shape =
-            ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{-1, num_heads, head_size});
-        auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(src1, reshaped_src1_shape, false);
-
         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0});
         auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
 
@@ -80,7 +74,8 @@ OutputVector translate_cpy(const NodeContext& context) {
         }
         indices = std::make_shared<ov::op::v0::Unsqueeze>(indices, one);
 
-        res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
+        auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(src1, indices, src0);
+        res = std::make_shared<ov::op::v1::Reshape>(updated, std::make_shared<ov::op::v3::ShapeOf>(src1), false);
     } else {
         // Write V to cache_v
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
@@ -140,7 +135,7 @@ OutputVector translate_cpy(const NodeContext& context) {
                                                  false);
         auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices_final, flattend_src0);
-        res = std::make_shared(updated, zero);
+        res = std::make_shared<ov::op::v1::Reshape>(updated, std::make_shared<ov::op::v3::ShapeOf>(src1), false);
     }
 
     return rename_outputs_with_suffix({res}, context.get_name());
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 8eda23c1c5..3bf0403a64 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -1,7 +1,12 @@
 #include "translate_session.hpp"
 
 #include 
+#include 
+#include 
+#include 
+#include 
 #include 
+#include 
 
 #include "input_model.hpp"
 
@@ -11,6 +16,41 @@ namespace ggml {
 
 using namespace ov::op;
 
+namespace {
+ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs(
+    const std::shared_ptr<ov::Model>& model, const std::map<std::string, std::string>& kv_param_res_names) {
+    ov::pass::MakeStateful::ParamResPairs pairs;
+    const auto& params = model->get_parameters();
+    const auto& results = model->get_results();
+
+    for (const auto& param_res : kv_param_res_names) {
+        const auto& param_name = param_res.first;
+        const auto& res_name = param_res.second;
+
+        auto param_it = std::find_if(params.begin(), params.end(), [&](const std::shared_ptr<ov::op::v0::Parameter>& node) {
+            return node->get_friendly_name() == param_name;
+        });
+
+        OPENVINO_ASSERT(param_it != params.end(), "The tensor name ", param_name,
+                        " is not associated with any of "
+                        "Parameters in the network.");
+
+        auto res_it = std::find_if(results.begin(), results.end(), [&](const std::shared_ptr<ov::op::v0::Result>& node) {
+            return node->get_friendly_name() == res_name;
+        });
+
+        OPENVINO_ASSERT(res_it != results.end(), "The tensor name ", res_name,
+                        " is not associated with any of "
+                        "Results in the network.");
+
+        std::shared_ptr<ov::op::v0::Parameter> param = *param_it;
+        std::shared_ptr<ov::op::v0::Result> res = *res_it;
+        pairs.emplace_back(param, res);
+    }
+    return pairs;
+}
+}  // namespace
+
 TranslateSession::TranslateSession(const frontend::InputModel::Ptr& input_model,
                                    const std::unordered_map& translator_map)
     : m_input_model(input_model),
@@ -88,25 +128,26 @@ std::shared_ptr<ov::Model> TranslateSession::translate_graph(const frontend::InputMo
         results.push_back(result);
     }
 
-    ov::ParameterVector used_params;
-    for (const auto& param : params) {
-        if (!param->output(0).get_target_inputs().empty()) {
-            used_params.push_back(param);
-        }
-    }
-    if (getenv("GGML_OPENVINO_PROFILING")) {
-        if (auto diff = params.size() - used_params.size()) {
-            std::cout << diff << " parameters are not used in the model." << std::endl;
-        }
-    }
-    resulting_model = std::make_shared<ov::Model>(results, used_params);
+    resulting_model = std::make_shared<ov::Model>(results, params);
+
+    apply_transformations(resulting_model);
+    return resulting_model;
+}
+
+void TranslateSession::apply_transformations(const std::shared_ptr<ov::Model>& model) {
+    auto ggml_model_decoder = std::dynamic_pointer_cast<InputModel>(m_input_model)->get_model_decoder();
 
     ov::pass::Manager manager;
     manager.set_per_pass_validation(true);
     manager.register_pass();
-    manager.run_passes(resulting_model);
 
-    return resulting_model;
+    if (!ggml_model_decoder->is_static()) {
+        const auto kv_param_res_names = ggml_model_decoder->get_kv_param_res_names();
+        const auto kv_param_res_pairs = get_kv_param_res_pairs(model, kv_param_res_names);
+        manager.register_pass<ov::pass::MakeStateful>(kv_param_res_pairs);
+    }
+
+    manager.run_passes(model);
 }
 
 }  // namespace ggml
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.hpp b/ggml/src/ggml-openvino/openvino/translate_session.hpp
index 5c7a9d464d..9167b55fe5 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.hpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.hpp
@@ -16,7 +16,7 @@ public:
     std::shared_ptr<ov::Model> translate_graph(const frontend::InputModel::Ptr& input_model);
 
 private:
-    void print_model_topology();
+    void apply_transformations(const std::shared_ptr<ov::Model>& model);
     const frontend::InputModel::Ptr m_input_model;
     const std::unordered_map& m_translator_map;
     std::shared_ptr<ov::Model> m_ov_model;
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index d20e671064..2620fa5615 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -9,10 +9,13 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -28,11 +31,15 @@ std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool
 }
 
 ov::Tensor convert_ggml_input_to_ov(std::shared_ptr<GgmlOvDecoder> ggml_decoder, const std::string& name) {
-    auto* input_data = ggml_decoder->get_input_ggml_tensor(name)->data;
-    ov::Tensor input_tensor;
-    ov::Shape input_shape = ggml_decoder->get_input_shape(name).to_shape();
-    std::vector input_stride = ggml_decoder->get_input_stride(name);
-    input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
+    const auto* ggml_tensor = ggml_decoder->get_input_ggml_tensor(name);
+    auto* input_data = ggml_tensor->data;
+    ov::Shape input_shape;
+    if (name.find("cache_k") == 0 || name.find("cache_v") == 0) {
+        input_shape = ggml_decoder->get_graph_input_shape(ggml_tensor).to_shape();
+    } else {
+        input_shape = ggml_decoder->get_input_shape(name).to_shape();
+    }
+    auto input_tensor = ov::Tensor(ggml_decoder->get_input_type(name), input_shape, input_data);
     return input_tensor;
 }
 
@@ -82,41 +89,37 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
         core.set_property(ov::cache_dir(cache_dir));
     }
 
-    // CPU and GPU will only use cache_prefill
-    using CachedItem = std::pair<std::shared_ptr<ov::Model>, ov::CompiledModel>;
-    static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache_prefill;
-    static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache_kvcache;
+    static std::unordered_map<struct ggml_cgraph*, std::shared_ptr<ov::InferRequest>> infer_request_cache;
+    static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_input_names_cache;
+    static std::unordered_map<struct ggml_cgraph*, std::vector<std::string>> ov_output_names_cache;
+    // For NPU, store the kvcache model, since we cannot create two infer_requests
+    static std::unordered_map<struct ggml_cgraph*, ov::CompiledModel> compiled_model_cache;
 
     std::shared_ptr<GgmlOvDecoder> ggml_decoder;
-    std::shared_ptr<ov::Model> model;
-    ov::CompiledModel compiled_model;
+    ov::InferRequest infer_request;
 
     int64_t decoder_end_time;
     int64_t conversion_end_time;
     int64_t compile_end_time;
 
-    bool is_first_token = is_prefill(cgraph);
-
-    auto it = compiled_cache_prefill.find(cgraph);
-    if (it != compiled_cache_prefill.end()) {
+    auto it = infer_request_cache.find(cgraph);
+    if (it != infer_request_cache.end()) {
         ggml_decoder = get_ggml_decoder(cgraph, is_static, false);
         decoder_end_time = ggml_time_us();
 
-        if (is_static) {
-            if (is_first_token) {
-                model = compiled_cache_prefill[cgraph].first;
-                compiled_model = compiled_cache_prefill[cgraph].second;
-            } else {
-                model = compiled_cache_kvcache[cgraph].first;
-                compiled_model = compiled_cache_kvcache[cgraph].second;
-            }
-        } else {
-            model = it->second.first;
-            compiled_model = it->second.second;
+        // For NPU: the first time the kvcache model is called, pop its compiled model from the cache
+        if (is_static && compiled_model_cache.find(cgraph) != compiled_model_cache.end()) {
+            infer_request_cache[cgraph] =
+                std::make_shared<ov::InferRequest>(compiled_model_cache[cgraph].create_infer_request());
+            compiled_model_cache.erase(cgraph);
         }
+        infer_request = *infer_request_cache[cgraph];
+
         conversion_end_time = ggml_time_us();
         compile_end_time = conversion_end_time;
     } else {
+        std::shared_ptr<ov::Model> model;
+
         if (is_static) {
             ggml_decoder = get_ggml_decoder(cgraph, is_static, true);
             auto ggml_decoder_kvcache = get_ggml_decoder(cgraph, is_static, false);
@@ -129,12 +132,13 @@
             auto model_kvcache = ov::frontend::ggml::FrontEnd::convert(input_model_kvcache);
             conversion_end_time = ggml_time_us();
 
-            compiled_model = core.compile_model(model, device, config);
+            auto compiled_model = core.compile_model(model, device, config);
             auto compiled_model_kvcache = core.compile_model(model_kvcache, device, config);
+            compiled_model_cache[cgraph] = compiled_model_kvcache;
             compile_end_time = ggml_time_us();
 
-            compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model);
-            compiled_cache_kvcache[cgraph] = std::make_pair(model_kvcache, compiled_model_kvcache);
+            infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
+            infer_request = *infer_request_cache[cgraph];
 
             if (getenv("GGML_OPENVINO_DUMP_IR")) {
                 char timestamped_filename[64];
@@ -152,9 +156,10 @@
             model = ov::frontend::ggml::FrontEnd::convert(input_model);
             conversion_end_time = ggml_time_us();
 
-            compiled_model = core.compile_model(model, device, config);
+            auto compiled_model = core.compile_model(model, device, config);
             compile_end_time = ggml_time_us();
-            compiled_cache_prefill[cgraph] = std::make_pair(model, compiled_model);
+            infer_request_cache[cgraph] = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
+            infer_request = *infer_request_cache[cgraph];
 
             if (getenv("GGML_OPENVINO_DUMP_IR")) {
                 char timestamped_filename[64];
@@ -163,12 +168,23 @@
                 ov::serialize(model, timestamped_filename);
             }
         }
-    }
 
-    auto infer_request = compiled_model.create_infer_request();
-    auto ov_params = model->get_parameters();
-    for (size_t i = 0; i < ov_params.size(); i++) {
-        auto param_name = ov_params[i]->get_friendly_name();
+        std::vector<std::string> ov_input_names;
+        std::vector<std::string> ov_output_names;
+        for (const auto& ov_param : model->get_parameters()) {
+            ov_input_names.push_back(ov_param->get_friendly_name());
+        }
+        for (const auto& ov_output : model->get_results()) {
+            ov_output_names.push_back(ov_output->get_friendly_name());
+        }
+        ov_input_names_cache[cgraph] = ov_input_names;
+        ov_output_names_cache[cgraph] = ov_output_names;
+    }
+
+    const auto& ov_input_names = ov_input_names_cache[cgraph];
+    const auto& ov_output_names = ov_output_names_cache[cgraph];
+    for (size_t i = 0; i < ov_input_names.size(); i++) {
+        auto param_name = ov_input_names[i];
         auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
         infer_request.set_input_tensor(i, input_tensor);
 
@@ -181,14 +197,15 @@
     infer_request.infer();
     auto infer_end_time = ggml_time_us();
 
-    auto output_names = ggml_decoder->get_model_output_names();
-    auto output_tensors = get_ggml_graph_output_dst(ggml_decoder);
-    for (size_t i = 0; i < output_names.size(); i++) {
-        auto output_tensor = infer_request.get_output_tensor(i);
-        std::memcpy(output_tensors[output_names[i]], output_tensor.data(), output_tensor.get_byte_size());
+    auto gguf_tensor_addrs = get_ggml_graph_output_dst(ggml_decoder);
+    for (size_t i = 0; i < ov_output_names.size(); i++) {
+        auto result_name = ov_output_names[i];
+        const auto output_tensor = infer_request.get_output_tensor(i);
+
+        std::memcpy(gguf_tensor_addrs[result_name], output_tensor.data(), output_tensor.get_byte_size());
 
         if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) {
-            print_output_tensor_info(output_names[i], output_tensor, output_tensors);
+            print_output_tensor_info(result_name, output_tensor, gguf_tensor_addrs);
        }
     }
     auto end_time = ggml_time_us();
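
Note: the transformation registered in translate_session.cpp is OpenVINO's stock ov::pass::MakeStateful, which replaces each matched Parameter/Result pair with ReadValue/Assign operations backed by an internal variable, so the KV-cache tensors live inside the model between inferences instead of being fed in and copied out on every call; the per-cgraph ov::InferRequest cached in utils.cpp is what keeps that state alive. A minimal, self-contained sketch of the same pattern (the toy graph, names, and shapes below are illustrative only, not what GgmlOvDecoder emits):

    #include <openvino/op/parameter.hpp>
    #include <openvino/op/relu.hpp>
    #include <openvino/op/result.hpp>
    #include <openvino/openvino.hpp>
    #include <openvino/pass/make_stateful.hpp>
    #include <openvino/pass/manager.hpp>

    int main() {
        // Toy stand-in for one KV-cache slot: a parameter whose next value is
        // produced by a result carrying a matching friendly name.
        auto cache = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{32, 8, 64});
        cache->set_friendly_name("cache_k_l0");
        auto next = std::make_shared<ov::op::v0::Relu>(cache);  // placeholder for the real cache-update subgraph
        auto res = std::make_shared<ov::op::v0::Result>(next);
        res->set_friendly_name("cache_k_l0");
        auto model = std::make_shared<ov::Model>(ov::ResultVector{res}, ov::ParameterVector{cache});

        // Pair each parameter with the result that computes its next value, then
        // run the pass; the pair disappears from the model I/O and becomes state.
        ov::pass::MakeStateful::ParamResPairs pairs{{cache, res}};
        ov::pass::Manager manager;
        manager.register_pass<ov::pass::MakeStateful>(pairs);
        manager.run_passes(model);
        return 0;
    }

After compilation, the state persists across calls on the same ov::InferRequest and can be inspected or reset through infer_request.query_state(), which is why the cache-hit path above reuses one request per cgraph rather than creating a fresh one each call.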