FIX: Re-add tensor names in cgraph, Add another case for RESHAPE

This commit is contained in:
Yu, Zijun 2025-05-14 14:06:15 +08:00 committed by Mustafa Cavus
parent 0d505b4e56
commit 041d220dfa
10 changed files with 77 additions and 34 deletions

View File

@ -38,6 +38,10 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
printed = true;
}
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
dump_cgraph(m_cgraph);
}
set_max_token_len();
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
auto* cur_node = m_cgraph->nodes[node_n];
@ -47,10 +51,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
m_model_weights = model_weights;
add_extra_inputs();
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
dump_cgraph(m_cgraph);
}
}
}
@ -142,17 +142,40 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
if (m_node) {
switch (node->op) {
case GGML_OP_RESHAPE: {
if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) {
m_op_case = 1;
} else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) {
m_op_case = 2;
}
break;
}
case GGML_OP_CONT: {
// Currently only two cases: the input comes either from a VIEW, which is a subtensor, or from a PERMUTE
m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src);
if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) {
// The input comes from a PERMUTE
m_op_case = 1;
} else {
// The input comes from a VIEW, which is a subtensor
m_op_case = 2;
}
break;
}
case GGML_OP_CPY: {
m_continuous = ggml_is_contiguous(node);
if (ggml_is_contiguous(node)) {
// Write K to cache_k
m_op_case = 1;
} else {
// Write V to cache_v
m_op_case = 2;
}
break;
}
case GGML_OP_MUL_MAT: {
m_continuous = node->src[0]->view_src == nullptr;
if (node->src[0]->view_src == nullptr) {
m_op_case = 1;
} else {
m_op_case = 2;
}
break;
}
default:

View File

@ -69,8 +69,8 @@ public:
return m_outputs.at(name);
}
virtual bool check_if_continuous() const override {
return m_continuous;
virtual int get_op_case() const override {
return m_op_case;
}
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const override {
@ -110,7 +110,7 @@ private:
std::vector<ggml_tensor*> m_nodes;
std::string m_op_name;
mutable std::string m_name;
bool m_continuous;
int m_op_case;
std::vector<std::pair<std::string, std::string>> m_op_node_name;
std::map<std::string, std::shared_ptr<ov::Node>> m_model_inputs;
std::map<std::string, std::shared_ptr<ov::Node>> m_model_extra_inputs;
@ -119,4 +119,4 @@ private:
std::vector<std::string> m_model_output_names;
};
void print_tensor_address_map(const struct ggml_cgraph* cgraph);
void print_tensor_address_map(const struct ggml_cgraph* cgraph);

View File

@ -49,7 +49,7 @@ public:
virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const = 0;
virtual bool check_if_continuous() const = 0;
virtual int get_op_case() const = 0;
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
@ -59,4 +59,4 @@ public:
} // namespace ggml
} // namespace frontend
} // namespace ov
} // namespace ov

View File

@ -81,8 +81,8 @@ public:
return m_decoder->get_attribute(name);
}
bool check_if_continuous() const {
return m_decoder->check_if_continuous();
int get_op_case() const {
return m_decoder->get_op_case();
}
private:

View File

@ -17,11 +17,13 @@ namespace op {
OutputVector translate_cont(const NodeContext& context) {
num_inputs_check(context, 1, 1);
int op_case = context.get_op_case();
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
auto src_shape = context.get_input_shape(0).to_shape();
auto dst_shape = context.get_output_shape(0).to_shape();
bool continuous = context.check_if_continuous();
if (continuous) {
if (op_case == 1) {
// The input comes from a PERMUTE
dst_shape[1] = -1;
auto result = std::make_shared<ov::op::v1::Reshape>(

View File

@ -22,13 +22,16 @@ namespace op {
OutputVector translate_cpy(const NodeContext& context) {
num_inputs_check(context, 2, 2);
int op_case = context.get_op_case();
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CPY case");
auto src0 = context.get_input(0);
auto src1 = context.get_input(1);
auto past_token_len = context.get_input("past_token_len");
auto src0_shape = context.get_input_shape(0).to_shape();
auto output_shape = context.get_output_shape(0).to_shape();
bool continuous = context.check_if_continuous();
std::vector<size_t> input0_strides = context.get_input_stride(0);
std::vector<size_t> output_strides = context.get_output_stride(0);
@ -36,7 +39,7 @@ OutputVector translate_cpy(const NodeContext& context) {
auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
src0 = std::make_shared<ov::op::v1::ConvertLike>(src0, src1);
if (continuous) {
if (op_case == 1) {
// Write K to cache_k
int64_t head_size = src0_shape[2];
int64_t num_heads = src0_shape[1];

View File

@ -22,8 +22,10 @@ namespace op {
OutputVector translate_mulmat(const NodeContext& context) {
num_inputs_check(context, 2, 2);
bool continuous = context.check_if_continuous();
if (continuous) {
int op_case = context.get_op_case();
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported MULMAT case");
if (op_case == 1) {
auto src0 = context.get_input(0);
auto src1 = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
auto result_lp = std::make_shared<ov::op::v0::MatMul>(src1, src0, false, true);

View File

@ -1,6 +1,8 @@
#include <cstdint>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/core/node_output.hpp>
#include <openvino/frontend/exception.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/reshape.hpp>
#include <vector>
@ -19,11 +21,22 @@ OutputVector translate_reshape(const NodeContext& context) {
return {context.get_input(0)};
}
int op_case = context.get_op_case();
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported RESHAPE case");
auto output_shape = context.get_output_shape(0).to_shape();
auto new_shape_node =
ov::op::v0::Constant::create(ov::element::i64,
{3},
std::vector<int64_t>{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
std::shared_ptr<ov::Node> new_shape_node;
if (op_case == 1) {
new_shape_node =
ov::op::v0::Constant::create(ov::element::i64,
{3},
std::vector<int64_t>{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
} else {
new_shape_node =
ov::op::v0::Constant::create(ov::element::i64,
{3},
std::vector<int64_t>{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]});
}
Output<Node> res = std::make_shared<ov::op::v1::Reshape>(context.get_input(0), new_shape_node, false);
return {res};
}

View File

@ -31,10 +31,6 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
const auto& ggml_model = std::dynamic_pointer_cast<InputModel>(input_model);
std::shared_ptr<GgmlDecoder> ggml_model_decoder = ggml_model->get_model_decoder();
FRONT_END_GENERAL_CHECK(ggml_model, "nullptr for InputModel is given for translation into OV Model");
const auto& model_inputs = ggml_model->get_inputs();
const auto& model_outputs = ggml_model->get_outputs();
for (const auto& it : ggml_model_decoder->get_model_inputs()) {
params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
(*tensor_map)[it.first] = it.second;

View File

@ -1275,7 +1275,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
if (ubatch.token) {
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
//cb(inp->tokens, "inp_tokens", -1);
cb(inp->tokens, "inp_tokens", -1);
ggml_set_input(inp->tokens);
res->t_tokens = inp->tokens;
@ -1327,6 +1327,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
auto & cur = inp->pos;
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd());
cb(cur, "inp_pos", -1);
ggml_set_input(cur);
res->add_input(std::move(inp));
@ -1362,6 +1363,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const {
auto & cur = inp->out_ids;
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
cb(cur, "inp_out_ids", -1);
ggml_set_input(cur);
res->add_input(std::move(inp));
@ -1603,6 +1605,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
cb(inp->self_kq_mask, "KQ_mask", -1);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@ -1661,7 +1664,7 @@ ggml_tensor * llm_graph_context::build_attn(
}
if (wo_b) {
//cb(cur, "kqv_wo", il);
cb(cur, "kqv_wo", il);
}
if (wo_b) {
@ -1691,6 +1694,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
cb(inp->self_kq_mask, "KQ_mask", -1);
ggml_set_input(inp->self_kq_mask);
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
@ -1818,7 +1822,7 @@ ggml_tensor * llm_graph_context::build_attn(
}
if (wo_b) {
//cb(cur, "kqv_wo", il);
cb(cur, "kqv_wo", il);
}
if (wo_b) {
@ -1873,7 +1877,7 @@ ggml_tensor * llm_graph_context::build_attn(
}
if (wo_b) {
//cb(cur, "kqv_wo", il);
cb(cur, "kqv_wo", il);
}
if (wo_b) {