Merge 37f6bca87b into 342d6125bc

2026-03-24 10:09:01 +02:00 · 2026-03-24 10:09:01 +02:00 · eeccc68fdf
parent 342d6125bc 37f6bca87b
commit eeccc68fdf
5 changed files with 371 additions and 20 deletions
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@ -19,7 +19,6 @@
 #include <iomanip>
 #include <map>
 #include <memory>
-#include <mutex>
 #include <openvino/core/dimension.hpp>
 #include <openvino/core/except.hpp>
 #include <openvino/core/node.hpp>
@ -45,6 +44,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
                             std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                             bool is_static,
                             bool is_stateful,
+                             bool model_is_splitted,
                             bool is_prefill,
                             int prefill_chunk_size) :
    m_is_static(is_static),
@ -52,6 +52,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
    m_is_prefill(is_prefill),
    m_naive(false),
    m_prefill_chunk_size(prefill_chunk_size),
+    m_model_is_splitted(model_is_splitted),
    m_cgraph(cgraph),
    m_model_weights(model_weights),
    m_model_params(model_params),
@ -68,6 +69,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
    validate_cgraph();

    set_input_output();
+    compute_node_dynamic_dims();
    compute_model_inputs();
    compute_model_outputs();

@ -330,7 +332,7 @@ void GgmlOvDecoder::validate_cgraph() const {
    }
 }

-ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
+ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index) const {
    if (m_naive) {
        return input!= nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
    }
@ -381,6 +383,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
    } else {
        input_shape = ov::PartialShape{get_shape(input)};
    }
+    if (dynamic_dim_index != -1) {
+        input_shape[3 - dynamic_dim_index] = -1;
+    }
    return input_shape;
 }

@ -443,7 +448,7 @@ void GgmlOvDecoder::compute_model_inputs() {
            if (m_model_weights.find(node_name) == m_model_weights.end()) {
                m_inputs[node_name] = node;
                auto param_node =
-                    std::make_shared<ov::op::v0::Parameter>(get_ov_type(node), get_graph_input_shape(node, nullptr));
+                    std::make_shared<ov::op::v0::Parameter>(get_ov_type(node), get_graph_input_shape(node, nullptr, m_node_dynamic_dims[node]));
                param_node->set_friendly_name(node_name);
                param_node->output(0).get_tensor().set_names({node_name});
                m_model_inputs[node_name] = param_node;
@ -487,7 +492,7 @@ void GgmlOvDecoder::compute_model_inputs() {
                    m_model_params.kv_names.push_back(src_name);
                }
            }
-            ov::PartialShape param_shape = get_graph_input_shape(node, src);
+            ov::PartialShape param_shape = get_graph_input_shape(node, src, m_node_dynamic_dims[src]);
            auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), param_shape);
            param_node->set_friendly_name(src_name);
            param_node->output(0).get_tensor().set_names({src_name});
@ -573,9 +578,6 @@ std::map<std::string, std::string> GgmlOvDecoder::get_kv_param_res_names() const
 }

 std::map<std::string, std::shared_ptr<ov::Node>> GgmlOvDecoder::create_weight_nodes(ggml_cgraph * cgraph, bool naive) {
-    static std::mutex weights_mutex;
-    std::lock_guard<std::mutex> lock(weights_mutex);
-
    std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
    auto * nodes = cgraph->nodes;
    auto n_nodes = cgraph->n_nodes;
@ -973,3 +975,265 @@ const std::string & GgmlOvDecoder::get_op_type() const {
    static const std::string unknown_op = "UNKNOWN_GGML_OP";
    return unknown_op;
 }
+
+void GgmlOvDecoder::compute_node_dynamic_dims() {
+    auto visit_node = [&](auto && self, ggml_tensor * node) -> void {
+        if (!node) {
+            return;
+        }
+
+        if (node->op == GGML_OP_CPY) {
+            m_node_dynamic_dims[node] = -1;
+        }
+
+        if (m_node_dynamic_dims.count(node)) {
+            return;
+        }
+        for (int i = 0; i < GGML_MAX_SRC; i++) {
+            ggml_tensor * src = node->src[i];
+            if (src == nullptr) {
+                continue;
+            }
+            struct ggml_tensor *root_src = nullptr;
+            // if (src->org_src) {
+            //     root_src = src->org_src;
+            // }
+            if (root_src) {
+                if (is_inp_tok(root_src, node) || is_inp_pos(root_src, node) ||
+                    is_output_idx(root_src, node)) {
+                    m_node_dynamic_dims[root_src] = 0;
+                    m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src];
+                    continue;
+                }
+                self(self, root_src);
+                m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src];
+            } else {
+                if (is_inp_tok(src, node) || is_inp_pos(src, node) || is_output_idx(src, node)) {
+                    m_node_dynamic_dims[src] = 0;
+                    continue;
+                }
+                self(self, src);
+            }
+        }
+        switch (node->op) {
+        case GGML_OP_NONE:
+            m_node_dynamic_dims[node] = -1;
+            break;
+        case GGML_OP_GET_ROWS:
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[1]] != -1) {
+                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[1]];
+                auto dynamic_dim_value = node->src[1]->ne[dynamic_dim_idx];
+                if (dynamic_dim_idx == 0) {
+                    m_node_dynamic_dims[node] = 1;
+                } else {
+                    auto dynamic_dim_stride = node->src[1]->nb[dynamic_dim_idx] / ggml_type_size(node->src[1]->type) *
+                                              ggml_type_size(node->src[0]->type);
+                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                        if (dynamic_dim_stride == node->src[0]->nb[i]) {
+                            m_node_dynamic_dims[node] = i;
+                            break;
+                        }
+                    }
+                }
+                OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
+                                "Dynamic dim value mismatch for node: " + std::string(node->name) +
+                                    " and its src[1]: " + std::string(node->src[1]->name));
+            }
+            break;
+        case GGML_OP_MUL:
+        case GGML_OP_MUL_MAT:
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+            }
+            if (m_node_dynamic_dims[node->src[1]] != -1) {
+                m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
+            }
+            break;
+        case GGML_OP_PERMUTE:
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
+                auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
+                for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                    if (node->op_params[i] == dynamic_dim_idx) {
+                        m_node_dynamic_dims[node] = i;
+                        break;
+                    }
+                }
+                OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
+                                "Dynamic dim value mismatch for node: " + std::string(node->name) +
+                                    " and its src[0]: " + std::string(node->src[0]->name));
+            }
+            break;
+        case GGML_OP_VIEW: {
+            // Use stride-based matching: the stride of a VIEW dimension directly
+            // encodes which source dimension it indexes into, so it uniquely
+            // identifies the dynamic dim even when two dims share the same size.
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                auto dynamic_dim_idx   = m_node_dynamic_dims[node->src[0]];
+                auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
+                auto dynamic_dim_stride =
+                    node->src[0]->nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) *
+                    ggml_type_size(node->type);
+                for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                    if (node->nb[i] == dynamic_dim_stride) {
+                        m_node_dynamic_dims[node] = i;
+                        break;
+                    }
+                }
+                OPENVINO_ASSERT(m_node_dynamic_dims[node] != -1 &&
+                                dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
+                                "Dynamic dim value mismatch for node: " + std::string(node->name) +
+                                    " and its src[0]: " + std::string(node->src[0]->name));
+            }
+            break;
+        }
+        case GGML_OP_RESHAPE: {
+            // RESHAPE requires src[0] to be contiguous, so both src and result
+            // have standard compact strides: nb[i] = type_size * prod(ne[0..i-1]).
+            // Match src->nb[dynamic_dim] against result->nb[i] to find the output
+            // dimension whose flat-memory boundary aligns with the source dynamic
+            // boundary. This is unambiguous (result strides are strictly monotone)
+            // and handles merged-lower-dim cases that ne-value matching misses.
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                auto dynamic_dim_idx    = m_node_dynamic_dims[node->src[0]];
+                auto dynamic_dim_stride = node->src[0]->nb[dynamic_dim_idx];
+                for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                    if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
+                        m_node_dynamic_dims[node] = i;
+                        break;
+                    }
+                }
+                if (m_node_dynamic_dims[node] == -1) {
+                    std::cout << "Cannot determine dynamic dim for RESHAPE node: " << node->name << std::endl;
+                }
+            }
+            break;
+        }
+        case GGML_OP_FLASH_ATTN_EXT: {
+            // Output shape is hard-coded in ggml_flash_attn_ext as:
+            //   ne = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] }
+            // i.e. output dim 0 <- v dim 0 (head_size, static)
+            //      output dim 1 <- q dim 2 (n_heads,   static)
+            //      output dim 2 <- q dim 1 (n_tokens,  potentially dynamic)
+            //      output dim 3 <- q dim 3 (batch,     static)
+            // Using the fixed q-dim -> output-dim mapping table.
+            // q is src[0]; the mapping from q's dynamic dim to the output dim is:
+            //   q dim 1 -> output dim 2
+            //   q dim 2 -> output dim 1
+            //   q dim 3 -> output dim 3
+            //   q dim 0 -> output dim 0  (head_size axis, unlikely to be dynamic)
+            constexpr int q_to_out[GGML_MAX_DIMS] = { 0, 2, 1, 3 };
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                auto q_dynamic_dim = m_node_dynamic_dims[node->src[0]];
+                m_node_dynamic_dims[node] = q_to_out[q_dynamic_dim];
+            }
+            break;
+        }
+        case GGML_OP_CONT:
+            m_node_dynamic_dims[node] = -1;
+            if (m_node_dynamic_dims[node->src[0]] != -1) {
+                auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
+                if (ggml_are_same_shape(node, node->src[0])) {
+                    m_node_dynamic_dims[node] = dynamic_dim_idx;
+                } else {
+                    size_t src_logical_nb[GGML_MAX_DIMS];
+                    src_logical_nb[0] = ggml_type_size(node->src[0]->type);
+                    src_logical_nb[1] = src_logical_nb[0] *
+                                        (node->src[0]->ne[0] / ggml_blck_size(node->src[0]->type));
+                    for (int i = 2; i < GGML_MAX_DIMS; i++) {
+                        src_logical_nb[i] = src_logical_nb[i - 1] * node->src[0]->ne[i - 1];
+                    }
+
+                    auto dynamic_dim_stride = src_logical_nb[dynamic_dim_idx] /
+                                              ggml_type_size(node->src[0]->type) *
+                                              ggml_type_size(node->type);
+                    int matched_dim_count = 0;
+                    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+                        if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
+                            m_node_dynamic_dims[node] = i;
+                            matched_dim_count++;
+                        }
+                    }
+
+                    OPENVINO_ASSERT(matched_dim_count == 1,
+                                    "Cannot determine dynamic dim for CONT node: " + std::string(node->name));
+                }
+            }
+            break;
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_ADD:
+        case GGML_OP_GLU:
+        case GGML_OP_ROPE:
+        case GGML_OP_SCALE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ADD_ID:
+            m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
+            break;
+        case GGML_OP_MUL_MAT_ID:
+            m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
+            break;
+        case GGML_OP_CPY:
+        case GGML_OP_SET_ROWS:
+            m_node_dynamic_dims[node] = -1;
+            break;
+        default:
+            std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl;
+            break;
+        }
+    };
+
+    for (int i = 0; i < m_cgraph->n_nodes; i++) {
+        ggml_tensor * node = m_cgraph->nodes[i];
+        visit_node(visit_node, node);
+    }
+
+    // print the nodes in m_cgraph name & shape with the dynamic dim (the dynamic dim is the dimension with -1 in m_node_dynamic_dims) for debugging
+    if (0) {
+        for (int i = 0; i < m_cgraph->n_nodes; i++) {
+            ggml_tensor * node = m_cgraph->nodes[i];
+            int dynamic_dim = m_node_dynamic_dims[node];
+            std::cout << "[" << i << "] " << "node_name: " << node->name << " op: " << ggml_op_name(node->op)
+                      << " shape: [";
+            for (int j = 0; j < 4; j++) {
+                if (j == dynamic_dim) {
+                    std::cout << "*";
+                } else {
+                    std::cout << node->ne[j];
+                }
+                if (j < 3) {
+                    std::cout << ", ";
+                }
+            }
+            std::cout << "]" << std::endl;
+            // print the src name & shape with the dynamic dim for debugging
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                ggml_tensor * src = node->src[j];
+                if (src == nullptr) {
+                    continue;
+                }
+                int src_dynamic_dim = m_node_dynamic_dims[src];
+                std::cout << "    [" << j << "] src_name: " << src->name << " [";
+                for (int k = 0; k < 4; k++) {
+                    if (k == src_dynamic_dim) {
+                        std::cout << "*";
+                    } else {
+                        std::cout << src->ne[k];
+                    }
+                    if (k < 3) {
+                        std::cout << ", ";
+                    }
+                }
+                std::cout << "]" << std::endl;
+            }
+            std::cout << std::endl;
+        }
+    }
+}
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@ -69,6 +69,7 @@ public:
                  std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
                  bool is_static,
                  bool is_stateful = false,
+                  bool model_is_splitted = false,
                  bool is_prefill = false,
                  int prefill_chunk_size = 256);

@ -175,7 +176,11 @@ public:

    virtual bool is_stateful() const override { return m_is_stateful; }

-    ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
+    virtual bool is_splited_model() const override {
+        return m_model_is_splitted;
+    }
+
+    ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index=-1) const;

    static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);

@ -205,6 +210,7 @@ public:
    bool m_is_prefill = false;
    bool m_naive = false;
    int m_prefill_chunk_size = 0;
+    bool m_model_is_splitted = false; // label the cgraph is splited or not

    static ov::Shape get_shape(const ggml_tensor * tensor);
    static std::vector<size_t> get_stride(const ggml_tensor * tensor);
@ -272,6 +278,9 @@ private:
    void compute_model_inputs();
    void compute_model_outputs();

+    // Infer and propagate dynamic-dimension indices for all tensors in the GGML graph.
+    void compute_node_dynamic_dims();
+
    void validate_cgraph() const;

    ggml_cgraph * m_cgraph = nullptr;
@ -284,6 +293,7 @@ private:
    std::map<std::string, ggml_tensor *> m_model_outputs;
    std::vector<std::string> m_model_output_names;
    std::vector<NodeInfo> m_node_info_list;
+    std::map<ggml_tensor *, int> m_node_dynamic_dims;

    ModelParams m_model_params;
    ComputeParams m_compute_params;
--- a/ggml/src/ggml-openvino/openvino/decoder.h
+++ b/ggml/src/ggml-openvino/openvino/decoder.h
@ -66,6 +66,8 @@ public:

    virtual bool is_stateful() const = 0;

+    virtual bool is_splited_model() const = 0;
+
    virtual int is_swa_layer(int layer) const = 0;
 };

--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@ -85,7 +85,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
    bool stateful = r_ctx->stateful;
    static auto is_static = false;

-    if (is_naive(cgraph)) {
+    bool model_is_splitted = is_model_splitted(cgraph);
+
+    if (is_naive(cgraph) && !model_is_splitted) {
        return naive_compute(cgraph, core, device, config);
    }

@ -106,17 +108,23 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
    int64_t infer_end_time;

    {
-        std::lock_guard<std::mutex> lock(r_ctx->ov_compute_mutex);
+        std::shared_ptr<std::mutex> mutex;

        auto it = r_ctx->decoder_cache.find(key);

        cache_hit = it != r_ctx->decoder_cache.end();
        ModelParams old_m_params;
        if (cache_hit) {
-            ggml_decoder = it->second;
+            mutex = it->second->mutex;
+            std::lock_guard<std::mutex> lock(*(mutex));
+            ggml_decoder = it->second->ptr;
            old_m_params = ggml_decoder->get_model_params();
            cache_hit = old_m_params.can_reuse_dynamically(m_params);
+        } else {
+            mutex = std::make_shared<std::mutex>();
+            r_ctx->decoder_cache[key] = std::make_shared<decoder_runtime_ctx>(mutex);
        }
+        std::lock_guard<std::mutex> lock(*(mutex));

        if (cache_hit) {
            std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
@ -175,8 +183,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
            std::shared_ptr<ov::Model> model;
            auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);

-            ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful);
-            decoder_end_time = ggml_time_us();
+        ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful, model_is_splitted);
+        decoder_end_time = ggml_time_us();

            auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
            model = ov::frontend::ggml::FrontEnd::convert(input_model);
@ -200,7 +208,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
            compile_end_time = ggml_time_us();
            infer_request = std::make_shared<ov::InferRequest>(compiled_model.create_infer_request());
            r_ctx->infer_request_cache[key] = infer_request;
-            r_ctx->decoder_cache[key] = ggml_decoder;
+            r_ctx->decoder_cache.at(key)->ptr = ggml_decoder;

            std::vector<std::string> ov_input_names;
            std::vector<std::string> ov_output_names;
@ -306,15 +314,23 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
    int64_t compile_end_time;
    int64_t infer_end_time;

+    std::shared_ptr<std::mutex> mutex;
+
    auto it = r_ctx->decoder_cache.find(key);

    cache_hit = it != r_ctx->decoder_cache.end();
    ModelParams old_m_params;
    if (cache_hit) {
-        ggml_decoder = it->second;
+        mutex = it->second->mutex;
+        std::lock_guard<std::mutex> lock(*(mutex));
+        ggml_decoder = it->second->ptr;
        old_m_params = ggml_decoder->get_model_params();
        cache_hit = old_m_params.can_reuse_statically(m_params);
+    } else {
+        mutex = std::make_shared<std::mutex>();
+        r_ctx->decoder_cache[key] = std::make_shared<decoder_runtime_ctx>(mutex);
    }
+    std::lock_guard<std::mutex> lock(*(mutex));

    if (cache_hit) {
        std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
@ -338,9 +354,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
        auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);

        auto ggml_decoder_prefill = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights,
-                                                                    is_static, stateful, true, prefill_chunk_size);
+                                                                    is_static, stateful, false, true, prefill_chunk_size);
        auto ggml_decoder_decode = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static,
-                                                                   stateful, false, prefill_chunk_size);
+                                                                   stateful, false, false, prefill_chunk_size);
        decoder_end_time = ggml_time_us();

        auto input_model_prefill = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder_prefill);
@ -381,7 +397,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
        model = is_prefill ? model_prefill : model_decode;
        ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode;
        infer_request = is_prefill ? r_ctx->infer_request_cache_prefill[key] : r_ctx->infer_request_cache[key];
-        r_ctx->decoder_cache[key] = ggml_decoder;
+        r_ctx->decoder_cache.at(key)->ptr = ggml_decoder;

        std::vector<std::string> ov_input_names;
        std::vector<std::string> ov_output_names;
@ -470,6 +486,52 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
    return GGML_STATUS_SUCCESS;
 }

+// Detect whether a cgraph is a split subgraph or not.
+// Step 1 compares each node's recorded use_count with actual fan-out references in node->src.
+// Step 2 verifies that node inputs come from model nodes/weights/leafs; external sources imply split.
+bool is_model_splitted(ggml_cgraph * cgraph) {
+    // check the nodes of the model are used by the following nodes, through compare the node's use count and the count of nodes that use it as input. If does not match, return true, else return false.
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        int use_count = cgraph->use_counts[ggml_hash_find(&cgraph->visited_hash_set, node)];
+        // TODO: this is a workround for the tests case from llama.cpp, fix should from the root cause in the future.
+        if ((cgraph->n_nodes <= 1 && use_count==0) || (cgraph->n_nodes <= 1 && node->op == GGML_OP_VIEW && use_count == 1 && node->src[0] != nullptr && node->src[0]->op == GGML_OP_NONE)) {
+            return false;
+        }
+        int input_use_count = 0;
+        for (int j = 0; j < cgraph->n_nodes; j++) {
+            ggml_tensor * other_node = cgraph->nodes[j];
+            for (int k = 0; k < GGML_MAX_SRC; k++) {
+                if (other_node->src[k] == node) {
+                    input_use_count++;
+                }
+            }
+        }
+        if (use_count != input_use_count && node->op != GGML_OP_NONE) {
+            return true;
+        }
+    }
+    // if all nodes's src node's src is not come from the nodes in the model, we think the model is splitted. This is a complementary check for the above check, because for some special case like the output node is not used by any node, the use count and input use count are both 0, we can not determine whether the model is splitted or not just based on the first check.
+    auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, true);
+    std::set<ggml_tensor *> model_nodes(cgraph->nodes, cgraph->nodes + cgraph->n_nodes);
+    // leaf nodes
+    std::set<ggml_tensor *> model_leafs(cgraph->leafs, cgraph->leafs + cgraph->n_leafs);
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            ggml_tensor * src = node->src[j];
+            // the src is also not the model weights, we think the model is splitted.
+            // the src is also not in model leafs, we think the model is splitted.
+            if (src != nullptr && model_nodes.find(src) == model_nodes.end() &&
+                model_weights.find(std::string(src->name)) == model_weights.end() && !model_leafs.empty() == false &&
+                model_leafs.find(src) == model_leafs.end()) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
 bool is_naive(ggml_cgraph * cgraph) {
    constexpr int naive_graph_size_threshold = 20;
    int count = 0;
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@ -40,11 +40,17 @@ struct graph_key_hash {
    }
 };

+struct decoder_runtime_ctx {
+    decoder_runtime_ctx(std::shared_ptr<std::mutex> mutex) :
+        mutex(mutex) {}
+    std::shared_ptr<std::mutex> mutex;
+    std::shared_ptr<GgmlOvDecoder> ptr;
+};
+
 struct ov_runtime_context {
-    std::mutex ov_compute_mutex;
    std::string device;
    bool stateful;
-    std::unordered_map<graph_key, std::shared_ptr<GgmlOvDecoder>, graph_key_hash> decoder_cache;
+    std::unordered_map<graph_key, std::shared_ptr<decoder_runtime_ctx>, graph_key_hash> decoder_cache;
    std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache;
    std::unordered_map<graph_key, std::shared_ptr<ov::InferRequest>, graph_key_hash> infer_request_cache_prefill;
    std::unordered_map<graph_key, std::vector<std::string>, graph_key_hash> ov_input_names_cache;
@ -117,6 +123,13 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder,

 bool is_naive(struct ggml_cgraph * cgraph);

+/**
+ * @brief Heuristically checks whether the given computation graph is a split-model fragment.
+ * @param cgraph Pointer to the GGML computation graph to analyze.
+ * @return true if the graph is identified as split; otherwise false.
+ */
+bool is_model_splitted(struct ggml_cgraph * cgraph);
+
 enum ggml_status naive_compute(struct ggml_cgraph * cgraph,
                               ov::Core & core,
                               const std::string & device,