FIX: Re-add tensor names in cgraph, Add another case for RESHAPE

2025-05-14 14:06:15 +08:00 · 2025-05-14 14:06:15 +08:00 · 041d220dfa
parent 0d505b4e56
commit 041d220dfa
10 changed files with 77 additions and 34 deletions
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@ -38,6 +38,10 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
            printed = true;
        }

+        if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
+            dump_cgraph(m_cgraph);
+        }
+
        set_max_token_len();
        for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
            auto* cur_node = m_cgraph->nodes[node_n];
@ -47,10 +51,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
        m_model_weights = model_weights;

        add_extra_inputs();
-
-        if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-            dump_cgraph(m_cgraph);
-        }
    }
 }

@ -142,17 +142,40 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,

    if (m_node) {
        switch (node->op) {
+        case GGML_OP_RESHAPE: {
+            if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) {
+                m_op_case = 1;
+            } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) {
+                m_op_case = 2;
+            }
+            break;
+        }
        case GGML_OP_CONT: {
-            // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE
-            m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src);
+            if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) {
+                // The input comes from a PERMUTE
+                m_op_case = 1;
+            } else {
+                // The input comes from a VIEW which is subtensor
+                m_op_case = 2;
+            }
            break;
        }
        case GGML_OP_CPY: {
-            m_continuous = ggml_is_contiguous(node);
+            if (ggml_is_contiguous(node)) {
+                // Write K to cache_k
+                m_op_case = 1;
+            } else {
+                // Write V to cache_v
+                m_op_case = 2;
+            }
            break;
        }
        case GGML_OP_MUL_MAT: {
-            m_continuous = node->src[0]->view_src == nullptr;
+            if (node->src[0]->view_src == nullptr) {
+                m_op_case = 1;
+            } else {
+                m_op_case = 2;
+            }
            break;
        }
        default:
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@ -69,8 +69,8 @@ public:
        return m_outputs.at(name);
    }

-    virtual bool check_if_continuous() const override {
-        return m_continuous;
+    virtual int get_op_case() const override {
+        return m_op_case;
    }

    virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const override {
@ -110,7 +110,7 @@ private:
    std::vector<ggml_tensor*> m_nodes;
    std::string m_op_name;
    mutable std::string m_name;
-    bool m_continuous;
+    int m_op_case = 0;  // 0 = no case selected by set_input_output(); translators accept only 1 or 2
    std::vector<std::pair<std::string, std::string>> m_op_node_name;
    std::map<std::string, std::shared_ptr<ov::Node>> m_model_inputs;
    std::map<std::string, std::shared_ptr<ov::Node>> m_model_extra_inputs;
@ -119,4 +119,4 @@ private:
    std::vector<std::string> m_model_output_names;
 };

-void print_tensor_address_map(const struct ggml_cgraph* cgraph);
+void print_tensor_address_map(const struct ggml_cgraph* cgraph);
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@ -49,7 +49,7 @@ public:

    virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const = 0;

-    virtual bool check_if_continuous() const = 0;
+    virtual int get_op_case() const = 0;

    virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
    virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
@ -59,4 +59,4 @@ public:

 }  // namespace ggml
 }  // namespace frontend
-}  // namespace ov
+}  // namespace ov
--- a/ggml/src/ggml-openvino/openvino/node_context.hpp
+++ b/ggml/src/ggml-openvino/openvino/node_context.hpp
@ -81,8 +81,8 @@ public:
        return m_decoder->get_attribute(name);
    }

-    bool check_if_continuous() const {
-        return m_decoder->check_if_continuous();
+    int get_op_case() const {
+        return m_decoder->get_op_case();
    }

 private:
--- a/ggml/src/ggml-openvino/openvino/op/cont.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp
@ -17,11 +17,13 @@ namespace op {
 OutputVector translate_cont(const NodeContext& context) {
    num_inputs_check(context, 1, 1);

+    int op_case = context.get_op_case();
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
+
    auto src_shape = context.get_input_shape(0).to_shape();
    auto dst_shape = context.get_output_shape(0).to_shape();

-    bool continuous = context.check_if_continuous();
-    if (continuous) {
+    if (op_case == 1) {
        // The input comes from a PERMUTE
        dst_shape[1] = -1;
        auto result = std::make_shared<ov::op::v1::Reshape>(
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@ -22,13 +22,16 @@ namespace op {

 OutputVector translate_cpy(const NodeContext& context) {
    num_inputs_check(context, 2, 2);
+
+    int op_case = context.get_op_case();
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CPY case");
+
    auto src0 = context.get_input(0);
    auto src1 = context.get_input(1);
    auto past_token_len = context.get_input("past_token_len");

    auto src0_shape = context.get_input_shape(0).to_shape();
    auto output_shape = context.get_output_shape(0).to_shape();
-    bool continuous = context.check_if_continuous();

    std::vector<size_t> input0_strides = context.get_input_stride(0);
    std::vector<size_t> output_strides = context.get_output_stride(0);
@ -36,7 +39,7 @@ OutputVector translate_cpy(const NodeContext& context) {
    auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});

    src0 = std::make_shared<ov::op::v1::ConvertLike>(src0, src1);
-    if (continuous) {
+    if (op_case == 1) {
        // Write K to cache_k
        int64_t head_size = src0_shape[2];
        int64_t num_heads = src0_shape[1];
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@ -22,8 +22,10 @@ namespace op {
 OutputVector translate_mulmat(const NodeContext& context) {
    num_inputs_check(context, 2, 2);

-    bool continuous = context.check_if_continuous();
-    if (continuous) {
+    int op_case = context.get_op_case();
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported MULMAT case");
+
+    if (op_case == 1) {
        auto src0 = context.get_input(0);
        auto src1 = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
        auto result_lp = std::make_shared<ov::op::v0::MatMul>(src1, src0, false, true);
--- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
@ -1,6 +1,8 @@
 #include <cstdint>
+#include <memory>
 #include <openvino/core/node.hpp>
 #include <openvino/core/node_output.hpp>
+#include <openvino/frontend/exception.hpp>
 #include <openvino/op/constant.hpp>
 #include <openvino/op/reshape.hpp>
 #include <vector>
@ -19,11 +21,22 @@ OutputVector translate_reshape(const NodeContext& context) {
        return {context.get_input(0)};
    }

+    int op_case = context.get_op_case();
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported RESHAPE case");
+
    auto output_shape = context.get_output_shape(0).to_shape();
-    auto new_shape_node =
-        ov::op::v0::Constant::create(ov::element::i64,
-                                     {3},
-                                     std::vector<int64_t>{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
+    std::shared_ptr<ov::Node> new_shape_node;
+    if (op_case == 1) {
+        new_shape_node =
+            ov::op::v0::Constant::create(ov::element::i64,
+                                         {3},
+                                         std::vector<int64_t>{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
+    } else {
+        new_shape_node =
+            ov::op::v0::Constant::create(ov::element::i64,
+                                         {3},
+                                         std::vector<int64_t>{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]});
+    }
    Output<Node> res = std::make_shared<ov::op::v1::Reshape>(context.get_input(0), new_shape_node, false);
    return {res};
 }
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@ -31,10 +31,6 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
    const auto& ggml_model = std::dynamic_pointer_cast<InputModel>(input_model);
    std::shared_ptr<GgmlDecoder> ggml_model_decoder = ggml_model->get_model_decoder();

-    FRONT_END_GENERAL_CHECK(ggml_model, "nullptr for InputModel is given for translation into OV Model");
-    const auto& model_inputs = ggml_model->get_inputs();
-    const auto& model_outputs = ggml_model->get_outputs();
-
    for (const auto& it : ggml_model_decoder->get_model_inputs()) {
        params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
        (*tensor_map)[it.first] = it.second;
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@ -1275,7 +1275,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {

    if (ubatch.token) {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
-        //cb(inp->tokens, "inp_tokens", -1);
+        cb(inp->tokens, "inp_tokens", -1);
        ggml_set_input(inp->tokens);
        res->t_tokens = inp->tokens;

@ -1327,6 +1327,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
    auto & cur = inp->pos;

    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd());
+    cb(cur, "inp_pos", -1);
    ggml_set_input(cur);

    res->add_input(std::move(inp));
@ -1362,6 +1363,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const {
    auto & cur = inp->out_ids;

    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
+    cb(cur, "inp_out_ids", -1);
    ggml_set_input(cur);

    res->add_input(std::move(inp));
@ -1603,6 +1605,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con

    // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
+    cb(inp->self_kq_mask, "KQ_mask", -1);
    ggml_set_input(inp->self_kq_mask);

    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@ -1661,7 +1664,7 @@ ggml_tensor * llm_graph_context::build_attn(
    }

    if (wo_b) {
-        //cb(cur, "kqv_wo", il);
+        cb(cur, "kqv_wo", il);
    }

    if (wo_b) {
@ -1691,6 +1694,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
        inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);

        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+        cb(inp->self_kq_mask, "KQ_mask", -1);
        ggml_set_input(inp->self_kq_mask);

        inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@ -1818,7 +1822,7 @@ ggml_tensor * llm_graph_context::build_attn(
    }

    if (wo_b) {
-        //cb(cur, "kqv_wo", il);
+        cb(cur, "kqv_wo", il);
    }

    if (wo_b) {
@ -1873,7 +1877,7 @@ ggml_tensor * llm_graph_context::build_attn(
    }

    if (wo_b) {
-        //cb(cur, "kqv_wo", il);
+        cb(cur, "kqv_wo", il);
    }

    if (wo_b) {