diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 43869ec228..0d612c1819 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -38,6 +38,10 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
         printed = true;
     }
 
+    if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
+        dump_cgraph(m_cgraph);
+    }
+
     set_max_token_len();
     for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
         auto* cur_node = m_cgraph->nodes[node_n];
@@ -47,10 +51,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
         m_model_weights = model_weights;
         add_extra_inputs();
-
-        if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
-            dump_cgraph(m_cgraph);
-        }
     }
 }
@@ -142,17 +142,40 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
     if (m_node) {
         switch (node->op) {
+        case GGML_OP_RESHAPE: {
+            if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) {
+                m_op_case = 1;
+            } else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) {
+                m_op_case = 2;
+            }
+            break;
+        }
         case GGML_OP_CONT: {
-            // Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE
-            m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src);
+            if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) {
+                // The input comes from a PERMUTE
+                m_op_case = 1;
+            } else {
+                // The input comes from a VIEW which is subtensor
+                m_op_case = 2;
+            }
             break;
         }
         case GGML_OP_CPY: {
-            m_continuous = ggml_is_contiguous(node);
+            if (ggml_is_contiguous(node)) {
+                // Write K to cache_k
+                m_op_case = 1;
+            } else {
+                // Write V to cache_v
+                m_op_case = 2;
+            }
             break;
         }
         case GGML_OP_MUL_MAT: {
-            m_continuous = node->src[0]->view_src == nullptr;
+            if (node->src[0]->view_src == nullptr) {
+                m_op_case = 1;
+            } else {
+                m_op_case = 2;
+            }
             break;
         }
         default:
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 959e00b65d..b8cc4c4cdf 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -69,8 +69,8 @@ public:
         return m_outputs.at(name);
     }
 
-    virtual bool check_if_continuous() const override {
-        return m_continuous;
+    virtual int get_op_case() const override {
+        return m_op_case;
     }
 
     virtual const std::map>& get_model_inputs() const override {
@@ -110,7 +110,7 @@ private:
     std::vector m_nodes;
     std::string m_op_name;
     mutable std::string m_name;
-    bool m_continuous;
+    int m_op_case;
     std::vector> m_op_node_name;
     std::map> m_model_inputs;
     std::map> m_model_extra_inputs;
@@ -119,4 +119,4 @@
     std::vector m_model_output_names;
 };
 
-void print_tensor_address_map(const struct ggml_cgraph* cgraph);
\ No newline at end of file
+void print_tensor_address_map(const struct ggml_cgraph* cgraph);
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index 3987760a29..b3cf75817f 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -49,7 +49,7 @@ public:
     virtual void visit_subgraph(std::function)> node_visitor) const = 0;
 
-    virtual bool check_if_continuous() const = 0;
+    virtual int get_op_case() const = 0;
 
     virtual const std::map>& get_model_inputs() const = 0;
     virtual const std::map>& get_model_extra_inputs() const = 0;
@@ -59,4 +59,4 @@
 
 } // namespace ggml
 } // namespace frontend
-} // namespace ov
\ No newline at end of file
+} // namespace ov
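Aside, not part of the patch: the classification that the new m_op_case field records can be read as a single free function over the ggml node. A minimal sketch mirroring the switch above, assuming only the public ggml API (classify_op_case is a hypothetical name):

#include "ggml.h"

// Mirror of the logic in GgmlOvDecoder::set_input_output(): map a ggml node to
// the op case the OpenVINO translators dispatch on (0 = unclassified).
static int classify_op_case(const struct ggml_tensor * node) {
    switch (node->op) {
        case GGML_OP_RESHAPE:
            // case 1: the source's fastest dim is split in two; case 2: two dims are merged
            if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) return 1;
            if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) return 2;
            return 0;
        case GGML_OP_CONT:
            // case 1: input comes from a PERMUTE; case 2: from a VIEW (subtensor)
            // (assumes src[0]->view_src is non-NULL, as the decoder does)
            return ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src) ? 1 : 2;
        case GGML_OP_CPY:
            // case 1: write K to cache_k; case 2: write V to cache_v
            return ggml_is_contiguous(node) ? 1 : 2;
        case GGML_OP_MUL_MAT:
            return node->src[0]->view_src == nullptr ? 1 : 2;
        default:
            return 0;
    }
}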
diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp
index e934e2ac36..44f55222e3 100644
--- a/ggml/src/ggml-openvino/openvino/node_context.hpp
+++ b/ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -81,8 +81,8 @@ public:
         return m_decoder->get_attribute(name);
     }
 
-    bool check_if_continuous() const {
-        return m_decoder->check_if_continuous();
+    int get_op_case() const {
+        return m_decoder->get_op_case();
     }
 
 private:
diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp
index e8e9bf0a4e..a052bf06ca 100644
--- a/ggml/src/ggml-openvino/openvino/op/cont.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp
@@ -17,11 +17,13 @@ namespace op {
 OutputVector translate_cont(const NodeContext& context) {
     num_inputs_check(context, 1, 1);
 
+    int op_case = context.get_op_case();
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
+
     auto src_shape = context.get_input_shape(0).to_shape();
     auto dst_shape = context.get_output_shape(0).to_shape();
 
-    bool continuous = context.check_if_continuous();
-    if (continuous) {
+    if (op_case == 1) {
         // The input comes from a PERMUTE
         dst_shape[1] = -1;
         auto result = std::make_shared(
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index 2808d3ee91..4ab1502f81 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -22,13 +22,16 @@ namespace op {
 OutputVector translate_cpy(const NodeContext& context) {
     num_inputs_check(context, 2, 2);
+
+    int op_case = context.get_op_case();
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CPY case");
+
     auto src0 = context.get_input(0);
     auto src1 = context.get_input(1);
     auto past_token_len = context.get_input("past_token_len");
 
     auto src0_shape = context.get_input_shape(0).to_shape();
     auto output_shape = context.get_output_shape(0).to_shape();
-    bool continuous = context.check_if_continuous();
 
     std::vector input0_strides = context.get_input_stride(0);
     std::vector output_strides = context.get_output_stride(0);
@@ -36,7 +39,7 @@ OutputVector translate_cpy(const NodeContext& context) {
     auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
     src0 = std::make_shared(src0, src1);
 
-    if (continuous) {
+    if (op_case == 1) {
         // Write K to cache_k
         int64_t head_size = src0_shape[2];
         int64_t num_heads = src0_shape[1];
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index 3e9c5c5083..5673551f70 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -22,8 +22,10 @@ namespace op {
 OutputVector translate_mulmat(const NodeContext& context) {
     num_inputs_check(context, 2, 2);
 
-    bool continuous = context.check_if_continuous();
-    if (continuous) {
+    int op_case = context.get_op_case();
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported MULMAT case");
+
+    if (op_case == 1) {
         auto src0 = context.get_input(0);
         auto src1 = std::make_shared(context.get_input(1), context.get_input_type(0));
         auto result_lp = std::make_shared(src1, src0, false, true);
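Aside, not part of the patch: the converters above all consume the new hook the same way. A sketch of that shared pattern, assuming the same namespace and headers as the existing op translators (translate_example and the "EXAMPLE" op are hypothetical placeholders):

// Inside namespace ov::frontend::ggml::op, with the same includes as cont.cpp / cpy.cpp.
OutputVector translate_example(const NodeContext& context) {
    num_inputs_check(context, 1, 1);

    // The decoder has already classified the node; reject anything it did not
    // recognize instead of silently emitting a wrong subgraph.
    int op_case = context.get_op_case();
    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported EXAMPLE case");

    if (op_case == 1) {
        // layout-preserving variant
        return {context.get_input(0)};
    }
    // op_case == 2: layout-changing variant would be handled here
    return {context.get_input(0)};
}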
diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
index 06b2bd339e..f6586d674c 100644
--- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp
@@ -1,6 +1,8 @@
 #include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -19,11 +21,22 @@ OutputVector translate_reshape(const NodeContext& context) {
         return {context.get_input(0)};
     }
 
+    int op_case = context.get_op_case();
+    FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported RESHAPE case");
+
     auto output_shape = context.get_output_shape(0).to_shape();
-    auto new_shape_node =
-        ov::op::v0::Constant::create(ov::element::i64,
-                                     {3},
-                                     std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
+    std::shared_ptr new_shape_node;
+    if (op_case == 1) {
+        new_shape_node =
+            ov::op::v0::Constant::create(ov::element::i64,
+                                         {3},
+                                         std::vector{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
+    } else {
+        new_shape_node =
+            ov::op::v0::Constant::create(ov::element::i64,
+                                         {3},
+                                         std::vector{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]});
+    }
     Output res = std::make_shared(context.get_input(0), new_shape_node, false);
     return {res};
 }
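To make the two RESHAPE cases concrete, a worked example with hypothetical sizes (the 128/32/7 numbers are illustrative only; "ne" is ggml order, fastest-varying dimension first):

// case 1 (split): src->ne = {4096, 7, 1, 1}, dst->ne = {128, 32, 7, 1}
//   dst->ne[0] * dst->ne[1] == src->ne[0]   (128 * 32 == 4096)
//   -> the translator emits the target shape {-1, output_shape[1], output_shape[2]}
//
// case 2 (merge): src->ne = {128, 32, 7, 1}, dst->ne = {4096, 7, 1, 1}
//   src->ne[0] * src->ne[1] == dst->ne[0]   (128 * 32 == 4096)
//   -> the translator emits the target shape {output_shape[0], -1, output_shape[2]}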
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 012e9178c6..910a0d8336 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -31,10 +31,6 @@ std::shared_ptr TranslateSession::translate_graph(const frontend::InputMo
     const auto& ggml_model = std::dynamic_pointer_cast(input_model);
     std::shared_ptr ggml_model_decoder = ggml_model->get_model_decoder();
 
-    FRONT_END_GENERAL_CHECK(ggml_model, "nullptr for InputModel is given for translation into OV Model");
-    const auto& model_inputs = ggml_model->get_inputs();
-    const auto& model_outputs = ggml_model->get_outputs();
-
     for (const auto& it : ggml_model_decoder->get_model_inputs()) {
         params.push_back(std::dynamic_pointer_cast(it.second));
         (*tensor_map)[it.first] = it.second;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 944c7e53bd..d4a25ab59b 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1275,7 +1275,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
     if (ubatch.token) {
         inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
-        //cb(inp->tokens, "inp_tokens", -1);
+        cb(inp->tokens, "inp_tokens", -1);
         ggml_set_input(inp->tokens);
 
         res->t_tokens = inp->tokens;
@@ -1327,6 +1327,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
     auto & cur = inp->pos;
 
     cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd());
+    cb(cur, "inp_pos", -1);
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
@@ -1362,6 +1363,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const {
     auto & cur = inp->out_ids;
 
     cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
+    cb(cur, "inp_out_ids", -1);
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
@@ -1603,6 +1605,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
     inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
+    cb(inp->self_kq_mask, "KQ_mask", -1);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1661,7 +1664,7 @@ ggml_tensor * llm_graph_context::build_attn(
     }
 
     if (wo_b) {
-        //cb(cur, "kqv_wo", il);
+        cb(cur, "kqv_wo", il);
     }
 
     if (wo_b) {
@@ -1691,6 +1694,7 @@ static std::unique_ptr build_attn_inp_kv_impl(
     inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
 
     inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
+    cb(inp->self_kq_mask, "KQ_mask", -1);
     ggml_set_input(inp->self_kq_mask);
 
     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@@ -1818,7 +1822,7 @@ ggml_tensor * llm_graph_context::build_attn(
     }
 
     if (wo_b) {
-        //cb(cur, "kqv_wo", il);
+        cb(cur, "kqv_wo", il);
     }
 
     if (wo_b) {
@@ -1873,7 +1877,7 @@ ggml_tensor * llm_graph_context::build_attn(
     }
 
     if (wo_b) {
-        //cb(cur, "kqv_wo", il);
+        cb(cur, "kqv_wo", il);
     }
 
     if (wo_b) {
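Aside, not part of the patch: re-enabling these cb() calls gives the graph inputs stable tensor names ("inp_tokens", "inp_pos", "inp_out_ids", "KQ_mask", plus per-layer names for "kqv_wo"); the assumption here is that the OpenVINO decoder locates these tensors by name. A minimal sketch using the existing ggml lookup API (find_kq_mask is a hypothetical helper):

#include "ggml.h"

// Find the attention mask that build_attn_inp_no_cache() / build_attn_inp_kv_impl()
// now label "KQ_mask"; returns NULL if the graph has no tensor with that name.
static struct ggml_tensor * find_kq_mask(struct ggml_cgraph * cgraph) {
    return ggml_graph_get_tensor(cgraph, "KQ_mask");
}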