Infer and propagate dynamic-dimension indices for all tensors in the GGML graph, for use by compute_model_inputs()/compute_model_outputs()
This commit is contained in:
parent
b6c83aad55
commit
fbc3128c17
|
|
@ -68,6 +68,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph,
|
|||
validate_cgraph();
|
||||
|
||||
set_input_output();
|
||||
compute_node_dynamic_dims();
|
||||
compute_model_inputs();
|
||||
compute_model_outputs();
|
||||
|
||||
|
|
@ -330,7 +331,7 @@ void GgmlOvDecoder::validate_cgraph() const {
|
|||
}
|
||||
}
|
||||
|
||||
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
|
||||
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index) const {
|
||||
if (m_naive) {
|
||||
return input!= nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
|
||||
}
|
||||
|
|
@ -381,6 +382,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
|
|||
} else {
|
||||
input_shape = ov::PartialShape{get_shape(input)};
|
||||
}
|
||||
if (dynamic_dim_index != -1) {
|
||||
input_shape[3 - dynamic_dim_index] = -1;
|
||||
}
|
||||
return input_shape;
|
||||
}
|
||||
|
||||
|
|
@ -443,7 +447,7 @@ void GgmlOvDecoder::compute_model_inputs() {
|
|||
if (m_model_weights.find(node_name) == m_model_weights.end()) {
|
||||
m_inputs[node_name] = node;
|
||||
auto param_node =
|
||||
std::make_shared<ov::op::v0::Parameter>(get_ov_type(node), get_graph_input_shape(node, nullptr));
|
||||
std::make_shared<ov::op::v0::Parameter>(get_ov_type(node), get_graph_input_shape(node, nullptr, m_node_dynamic_dims[node]));
|
||||
param_node->set_friendly_name(node_name);
|
||||
param_node->output(0).get_tensor().set_names({node_name});
|
||||
m_model_inputs[node_name] = param_node;
|
||||
|
|
@ -487,7 +491,7 @@ void GgmlOvDecoder::compute_model_inputs() {
|
|||
m_model_params.kv_names.push_back(src_name);
|
||||
}
|
||||
}
|
||||
ov::PartialShape param_shape = get_graph_input_shape(node, src);
|
||||
ov::PartialShape param_shape = get_graph_input_shape(node, src, m_node_dynamic_dims[src]);
|
||||
auto param_node = std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), param_shape);
|
||||
param_node->set_friendly_name(src_name);
|
||||
param_node->output(0).get_tensor().set_names({src_name});
|
||||
|
|
@ -973,3 +977,265 @@ const std::string & GgmlOvDecoder::get_op_type() const {
|
|||
static const std::string unknown_op = "UNKNOWN_GGML_OP";
|
||||
return unknown_op;
|
||||
}
|
||||
|
||||
void GgmlOvDecoder::compute_node_dynamic_dims() {
|
||||
auto visit_node = [&](auto && self, ggml_tensor * node) -> void {
|
||||
if (!node) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_CPY) {
|
||||
m_node_dynamic_dims[node] = -1;
|
||||
}
|
||||
|
||||
if (m_node_dynamic_dims.count(node)) {
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
ggml_tensor * src = node->src[i];
|
||||
if (src == nullptr) {
|
||||
continue;
|
||||
}
|
||||
struct ggml_tensor *root_src = nullptr;
|
||||
// if (src->org_src) {
|
||||
// root_src = src->org_src;
|
||||
// }
|
||||
if (root_src) {
|
||||
if (is_inp_tok(root_src, node) || is_inp_pos(root_src, node) ||
|
||||
is_output_idx(root_src, node)) {
|
||||
m_node_dynamic_dims[root_src] = 0;
|
||||
m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src];
|
||||
continue;
|
||||
}
|
||||
self(self, root_src);
|
||||
m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src];
|
||||
} else {
|
||||
if (is_inp_tok(src, node) || is_inp_pos(src, node) || is_output_idx(src, node)) {
|
||||
m_node_dynamic_dims[src] = 0;
|
||||
continue;
|
||||
}
|
||||
self(self, src);
|
||||
}
|
||||
}
|
||||
switch (node->op) {
|
||||
case GGML_OP_NONE:
|
||||
m_node_dynamic_dims[node] = -1;
|
||||
break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
m_node_dynamic_dims[node] = -1;
|
||||
if (m_node_dynamic_dims[node->src[1]] != -1) {
|
||||
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[1]];
|
||||
auto dynamic_dim_value = node->src[1]->ne[dynamic_dim_idx];
|
||||
if (dynamic_dim_idx == 0) {
|
||||
m_node_dynamic_dims[node] = 1;
|
||||
} else {
|
||||
auto dynamic_dim_stride = node->src[1]->nb[dynamic_dim_idx] / ggml_type_size(node->src[1]->type) *
|
||||
ggml_type_size(node->src[0]->type);
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (dynamic_dim_stride == node->src[0]->nb[i]) {
|
||||
m_node_dynamic_dims[node] = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
|
||||
"Dynamic dim value mismatch for node: " + std::string(node->name) +
|
||||
" and its src[1]: " + std::string(node->src[1]->name));
|
||||
}
|
||||
break;
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_MUL_MAT:
|
||||
m_node_dynamic_dims[node] = -1;
|
||||
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
||||
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
|
||||
}
|
||||
if (m_node_dynamic_dims[node->src[1]] != -1) {
|
||||
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
|
||||
}
|
||||
break;
|
||||
case GGML_OP_PERMUTE:
|
||||
m_node_dynamic_dims[node] = -1;
|
||||
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
||||
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
|
||||
auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (node->op_params[i] == dynamic_dim_idx) {
|
||||
m_node_dynamic_dims[node] = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
|
||||
"Dynamic dim value mismatch for node: " + std::string(node->name) +
|
||||
" and its src[0]: " + std::string(node->src[0]->name));
|
||||
}
|
||||
break;
|
||||
case GGML_OP_VIEW: {
|
||||
// Use stride-based matching: the stride of a VIEW dimension directly
|
||||
// encodes which source dimension it indexes into, so it uniquely
|
||||
// identifies the dynamic dim even when two dims share the same size.
|
||||
m_node_dynamic_dims[node] = -1;
|
||||
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
||||
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
|
||||
auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx];
|
||||
auto dynamic_dim_stride =
|
||||
node->src[0]->nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) *
|
||||
ggml_type_size(node->type);
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (node->nb[i] == dynamic_dim_stride) {
|
||||
m_node_dynamic_dims[node] = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
OPENVINO_ASSERT(m_node_dynamic_dims[node] != -1 &&
|
||||
dynamic_dim_value == node->ne[m_node_dynamic_dims[node]],
|
||||
"Dynamic dim value mismatch for node: " + std::string(node->name) +
|
||||
" and its src[0]: " + std::string(node->src[0]->name));
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_RESHAPE: {
|
||||
// RESHAPE requires src[0] to be contiguous, so both src and result
|
||||
// have standard compact strides: nb[i] = type_size * prod(ne[0..i-1]).
|
||||
// Match src->nb[dynamic_dim] against result->nb[i] to find the output
|
||||
// dimension whose flat-memory boundary aligns with the source dynamic
|
||||
// boundary. This is unambiguous (result strides are strictly monotone)
|
||||
// and handles merged-lower-dim cases that ne-value matching misses.
|
||||
m_node_dynamic_dims[node] = -1;
|
||||
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
||||
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
|
||||
auto dynamic_dim_stride = node->src[0]->nb[dynamic_dim_idx];
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
|
||||
m_node_dynamic_dims[node] = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (m_node_dynamic_dims[node] == -1) {
|
||||
std::cout << "Cannot determine dynamic dim for RESHAPE node: " << node->name << std::endl;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_FLASH_ATTN_EXT: {
|
||||
// Output shape is hard-coded in ggml_flash_attn_ext as:
|
||||
// ne = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] }
|
||||
// i.e. output dim 0 <- v dim 0 (head_size, static)
|
||||
// output dim 1 <- q dim 2 (n_heads, static)
|
||||
// output dim 2 <- q dim 1 (n_tokens, potentially dynamic)
|
||||
// output dim 3 <- q dim 3 (batch, static)
|
||||
// Using the fixed q-dim -> output-dim mapping table.
|
||||
// q is src[0]; the mapping from q's dynamic dim to the output dim is:
|
||||
// q dim 1 -> output dim 2
|
||||
// q dim 2 -> output dim 1
|
||||
// q dim 3 -> output dim 3
|
||||
// q dim 0 -> output dim 0 (head_size axis, unlikely to be dynamic)
|
||||
constexpr int q_to_out[GGML_MAX_DIMS] = { 0, 2, 1, 3 };
|
||||
m_node_dynamic_dims[node] = -1;
|
||||
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
||||
auto q_dynamic_dim = m_node_dynamic_dims[node->src[0]];
|
||||
m_node_dynamic_dims[node] = q_to_out[q_dynamic_dim];
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_CONT:
|
||||
m_node_dynamic_dims[node] = -1;
|
||||
if (m_node_dynamic_dims[node->src[0]] != -1) {
|
||||
auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]];
|
||||
if (ggml_are_same_shape(node, node->src[0])) {
|
||||
m_node_dynamic_dims[node] = dynamic_dim_idx;
|
||||
} else {
|
||||
size_t src_logical_nb[GGML_MAX_DIMS];
|
||||
src_logical_nb[0] = ggml_type_size(node->src[0]->type);
|
||||
src_logical_nb[1] = src_logical_nb[0] *
|
||||
(node->src[0]->ne[0] / ggml_blck_size(node->src[0]->type));
|
||||
for (int i = 2; i < GGML_MAX_DIMS; i++) {
|
||||
src_logical_nb[i] = src_logical_nb[i - 1] * node->src[0]->ne[i - 1];
|
||||
}
|
||||
|
||||
auto dynamic_dim_stride = src_logical_nb[dynamic_dim_idx] /
|
||||
ggml_type_size(node->src[0]->type) *
|
||||
ggml_type_size(node->type);
|
||||
int matched_dim_count = 0;
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) {
|
||||
m_node_dynamic_dims[node] = i;
|
||||
matched_dim_count++;
|
||||
}
|
||||
}
|
||||
|
||||
OPENVINO_ASSERT(matched_dim_count == 1,
|
||||
"Cannot determine dynamic dim for CONT node: " + std::string(node->name));
|
||||
}
|
||||
}
|
||||
break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_GLU:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ADD_ID:
|
||||
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]];
|
||||
break;
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]];
|
||||
break;
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_SET_ROWS:
|
||||
m_node_dynamic_dims[node] = -1;
|
||||
break;
|
||||
default:
|
||||
std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl;
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = m_cgraph->nodes[i];
|
||||
visit_node(visit_node, node);
|
||||
}
|
||||
|
||||
// print the nodes in m_cgraph name & shape with the dynamic dim (the dynamic dim is the dimension with -1 in m_node_dynamic_dims) for debugging
|
||||
if (0) {
|
||||
for (int i = 0; i < m_cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = m_cgraph->nodes[i];
|
||||
int dynamic_dim = m_node_dynamic_dims[node];
|
||||
std::cout << "[" << i << "] " << "node_name: " << node->name << " op: " << ggml_op_name(node->op)
|
||||
<< " shape: [";
|
||||
for (int j = 0; j < 4; j++) {
|
||||
if (j == dynamic_dim) {
|
||||
std::cout << "*";
|
||||
} else {
|
||||
std::cout << node->ne[j];
|
||||
}
|
||||
if (j < 3) {
|
||||
std::cout << ", ";
|
||||
}
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
// print the src name & shape with the dynamic dim for debugging
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
ggml_tensor * src = node->src[j];
|
||||
if (src == nullptr) {
|
||||
continue;
|
||||
}
|
||||
int src_dynamic_dim = m_node_dynamic_dims[src];
|
||||
std::cout << " [" << j << "] src_name: " << src->name << " [";
|
||||
for (int k = 0; k < 4; k++) {
|
||||
if (k == src_dynamic_dim) {
|
||||
std::cout << "*";
|
||||
} else {
|
||||
std::cout << src->ne[k];
|
||||
}
|
||||
if (k < 3) {
|
||||
std::cout << ", ";
|
||||
}
|
||||
}
|
||||
std::cout << "]" << std::endl;
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -175,7 +175,7 @@ public:
|
|||
|
||||
virtual bool is_stateful() const override { return m_is_stateful; }
|
||||
|
||||
ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
|
||||
ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index=-1) const;
|
||||
|
||||
static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);
|
||||
|
||||
|
|
@ -272,6 +272,9 @@ private:
|
|||
void compute_model_inputs();
|
||||
void compute_model_outputs();
|
||||
|
||||
// Infer and propagate dynamic-dimension indices for all tensors in the GGML graph.
|
||||
void compute_node_dynamic_dims();
|
||||
|
||||
void validate_cgraph() const;
|
||||
|
||||
ggml_cgraph * m_cgraph = nullptr;
|
||||
|
|
@ -284,6 +287,7 @@ private:
|
|||
std::map<std::string, ggml_tensor *> m_model_outputs;
|
||||
std::vector<std::string> m_model_output_names;
|
||||
std::vector<NodeInfo> m_node_info_list;
|
||||
std::map<ggml_tensor *, int> m_node_dynamic_dims;
|
||||
|
||||
ModelParams m_model_params;
|
||||
ComputeParams m_compute_params;
|
||||
|
|
|
|||
Loading…
Reference in New Issue