From 2b6d2daa6ac60a33760503dbbd3b8ad9d7177876 Mon Sep 17 00:00:00 2001 From: Xuejun Zhai Date: Thu, 5 Mar 2026 17:37:21 -0800 Subject: [PATCH] Add interface is_model_splitted() to check the c-graph is splited or not --- ggml/src/ggml-openvino/ggml-decoder.cpp | 4 +- ggml/src/ggml-openvino/ggml-decoder.h | 2 + ggml/src/ggml-openvino/utils.cpp | 55 ++++++++++++++++++++++--- ggml/src/ggml-openvino/utils.h | 2 + 4 files changed, 57 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 0938d2273e..b663cd67f6 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -45,6 +45,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, bool is_static, bool is_stateful, + bool model_is_splitted, bool is_prefill, int prefill_chunk_size) : m_is_static(is_static), @@ -52,6 +53,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_is_prefill(is_prefill), m_naive(false), m_prefill_chunk_size(prefill_chunk_size), + m_model_is_splitted(model_is_splitted), m_cgraph(cgraph), m_model_weights(model_weights), m_model_params(model_params), @@ -972,4 +974,4 @@ const std::string & GgmlOvDecoder::get_op_type(int node_idx) const { const std::string & GgmlOvDecoder::get_op_type() const { static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; -} +} \ No newline at end of file diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 3ae25ddda3..5c44f4b9ac 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -69,6 +69,7 @@ public: std::map> & model_weights, bool is_static, bool is_stateful = false, + bool model_is_splitted = false, bool is_prefill = false, int prefill_chunk_size = 256); @@ -205,6 +206,7 @@ public: bool m_is_prefill = false; bool m_naive = false; int m_prefill_chunk_size = 0; + bool m_model_is_splitted = false; // label the cgraph is splited or not static ov::Shape get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 1b553a0de0..c126005bce 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -85,7 +85,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< bool stateful = r_ctx->stateful; static auto is_static = false; - if (is_naive(cgraph)) { + bool model_is_splitted = is_model_splitted(cgraph); + + if (is_naive(cgraph) && !model_is_splitted) { return naive_compute(cgraph, core, device, config); } @@ -175,8 +177,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); - ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, stateful); - decoder_end_time = ggml_time_us(); + ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, stateful, model_is_splitted); + decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); model = ov::frontend::ggml::FrontEnd::convert(input_model); @@ -338,9 +340,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr(cgraph, m_params, c_params, model_weights, - is_static, stateful, true, prefill_chunk_size); + is_static, stateful, false, true, prefill_chunk_size); auto ggml_decoder_decode = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, - stateful, false, prefill_chunk_size); + stateful, false, false, prefill_chunk_size); decoder_end_time = ggml_time_us(); auto input_model_prefill = std::make_shared(ggml_decoder_prefill); @@ -470,6 +472,49 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrn_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + int use_count = cgraph->use_counts[ggml_hash_find(&cgraph->visited_hash_set, node)]; + // TODO: this is a workround for the tests case from llama.cpp, fix should from the root cause in the future. + if ((cgraph->n_nodes <= 1 && use_count==0) || (cgraph->n_nodes <= 1 && node->op == GGML_OP_VIEW && use_count == 1 && node->src[0] != nullptr && node->src[0]->op == GGML_OP_NONE)) { + return false; + } + int input_use_count = 0; + for (int j = 0; j < cgraph->n_nodes; j++) { + ggml_tensor * other_node = cgraph->nodes[j]; + for (int k = 0; k < GGML_MAX_SRC; k++) { + if (other_node->src[k] == node) { + input_use_count++; + } + } + } + if (use_count != input_use_count && node->op != GGML_OP_NONE) { + return true; + } + } + // if all nodes's src node's src is not come from the nodes in the model, we think the model is splitted. This is a complementary check for the above check, because for some special case like the output node is not used by any node, the use count and input use count are both 0, we can not determine whether the model is splitted or not just based on the first check. + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); + std::set model_nodes(cgraph->nodes, cgraph->nodes + cgraph->n_nodes); + // leaf nodes + std::set model_leafs(cgraph->leafs, cgraph->leafs + cgraph->n_leafs); + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + ggml_tensor * src = node->src[j]; + // the src is also not the model weights, we think the model is splitted. + // the src is also not in model leafs, we think the model is splitted. + if (src != nullptr && model_nodes.find(src) == model_nodes.end() && + model_weights.find(std::string(src->name)) == model_weights.end() && !model_leafs.empty() == false && + model_leafs.find(src) == model_leafs.end()) { + return true; + } + } + } + return false; +} + bool is_naive(ggml_cgraph * cgraph) { constexpr int naive_graph_size_threshold = 20; int count = 0; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 656573d138..b5ab395002 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -117,6 +117,8 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr ggml_decoder, bool is_naive(struct ggml_cgraph * cgraph); +bool is_model_splitted(struct ggml_cgraph * cgraph); + enum ggml_status naive_compute(struct ggml_cgraph * cgraph, ov::Core & core, const std::string & device,