FIX: Re-add tensor names in cgraph, Add another case for RESHAPE
This commit is contained in:
parent
0d505b4e56
commit
041d220dfa
|
|
@ -38,6 +38,10 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
printed = true;
|
||||
}
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
|
||||
dump_cgraph(m_cgraph);
|
||||
}
|
||||
|
||||
set_max_token_len();
|
||||
for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
|
||||
auto* cur_node = m_cgraph->nodes[node_n];
|
||||
|
|
@ -47,10 +51,6 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
|
|||
m_model_weights = model_weights;
|
||||
|
||||
add_extra_inputs();
|
||||
|
||||
if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) {
|
||||
dump_cgraph(m_cgraph);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -142,17 +142,40 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
|
|||
|
||||
if (m_node) {
|
||||
switch (node->op) {
|
||||
case GGML_OP_RESHAPE: {
|
||||
if (node->ne[0] * node->ne[1] == node->src[0]->ne[0]) {
|
||||
m_op_case = 1;
|
||||
} else if (node->src[0]->ne[0] * node->src[0]->ne[1] == node->ne[0]) {
|
||||
m_op_case = 2;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_CONT: {
|
||||
// Currently only two cases, either the input comes from a VIEW which is subtensor or from a PERMUTE
|
||||
m_continuous = ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src);
|
||||
if (ggml_nelements(node->src[0]) == ggml_nelements(node->src[0]->view_src)) {
|
||||
// The input comes from a PERMUTE
|
||||
m_op_case = 1;
|
||||
} else {
|
||||
// The input comes from a VIEW which is subtensor
|
||||
m_op_case = 2;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_CPY: {
|
||||
m_continuous = ggml_is_contiguous(node);
|
||||
if (ggml_is_contiguous(node)) {
|
||||
// Write K to cache_k
|
||||
m_op_case = 1;
|
||||
} else {
|
||||
// Write V to cache_v
|
||||
m_op_case = 2;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case GGML_OP_MUL_MAT: {
|
||||
m_continuous = node->src[0]->view_src == nullptr;
|
||||
if (node->src[0]->view_src == nullptr) {
|
||||
m_op_case = 1;
|
||||
} else {
|
||||
m_op_case = 2;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -69,8 +69,8 @@ public:
|
|||
return m_outputs.at(name);
|
||||
}
|
||||
|
||||
virtual bool check_if_continuous() const override {
|
||||
return m_continuous;
|
||||
virtual int get_op_case() const override {
|
||||
return m_op_case;
|
||||
}
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const override {
|
||||
|
|
@ -110,7 +110,7 @@ private:
|
|||
std::vector<ggml_tensor*> m_nodes;
|
||||
std::string m_op_name;
|
||||
mutable std::string m_name;
|
||||
bool m_continuous;
|
||||
int m_op_case;
|
||||
std::vector<std::pair<std::string, std::string>> m_op_node_name;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_inputs;
|
||||
std::map<std::string, std::shared_ptr<ov::Node>> m_model_extra_inputs;
|
||||
|
|
@ -119,4 +119,4 @@ private:
|
|||
std::vector<std::string> m_model_output_names;
|
||||
};
|
||||
|
||||
void print_tensor_address_map(const struct ggml_cgraph* cgraph);
|
||||
void print_tensor_address_map(const struct ggml_cgraph* cgraph);
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ public:
|
|||
|
||||
virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const = 0;
|
||||
|
||||
virtual bool check_if_continuous() const = 0;
|
||||
virtual int get_op_case() const = 0;
|
||||
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_inputs() const = 0;
|
||||
virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
|
||||
|
|
@ -59,4 +59,4 @@ public:
|
|||
|
||||
} // namespace ggml
|
||||
} // namespace frontend
|
||||
} // namespace ov
|
||||
} // namespace ov
|
||||
|
|
|
|||
|
|
@ -81,8 +81,8 @@ public:
|
|||
return m_decoder->get_attribute(name);
|
||||
}
|
||||
|
||||
bool check_if_continuous() const {
|
||||
return m_decoder->check_if_continuous();
|
||||
int get_op_case() const {
|
||||
return m_decoder->get_op_case();
|
||||
}
|
||||
|
||||
private:
|
||||
|
|
|
|||
|
|
@ -17,11 +17,13 @@ namespace op {
|
|||
OutputVector translate_cont(const NodeContext& context) {
|
||||
num_inputs_check(context, 1, 1);
|
||||
|
||||
int op_case = context.get_op_case();
|
||||
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CONT case");
|
||||
|
||||
auto src_shape = context.get_input_shape(0).to_shape();
|
||||
auto dst_shape = context.get_output_shape(0).to_shape();
|
||||
|
||||
bool continuous = context.check_if_continuous();
|
||||
if (continuous) {
|
||||
if (op_case == 1) {
|
||||
// The input comes from a PERMUTE
|
||||
dst_shape[1] = -1;
|
||||
auto result = std::make_shared<ov::op::v1::Reshape>(
|
||||
|
|
|
|||
|
|
@ -22,13 +22,16 @@ namespace op {
|
|||
|
||||
OutputVector translate_cpy(const NodeContext& context) {
|
||||
num_inputs_check(context, 2, 2);
|
||||
|
||||
int op_case = context.get_op_case();
|
||||
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported CPY case");
|
||||
|
||||
auto src0 = context.get_input(0);
|
||||
auto src1 = context.get_input(1);
|
||||
auto past_token_len = context.get_input("past_token_len");
|
||||
|
||||
auto src0_shape = context.get_input_shape(0).to_shape();
|
||||
auto output_shape = context.get_output_shape(0).to_shape();
|
||||
bool continuous = context.check_if_continuous();
|
||||
|
||||
std::vector<size_t> input0_strides = context.get_input_stride(0);
|
||||
std::vector<size_t> output_strides = context.get_output_stride(0);
|
||||
|
|
@ -36,7 +39,7 @@ OutputVector translate_cpy(const NodeContext& context) {
|
|||
auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
|
||||
|
||||
src0 = std::make_shared<ov::op::v1::ConvertLike>(src0, src1);
|
||||
if (continuous) {
|
||||
if (op_case == 1) {
|
||||
// Write K to cache_k
|
||||
int64_t head_size = src0_shape[2];
|
||||
int64_t num_heads = src0_shape[1];
|
||||
|
|
|
|||
|
|
@ -22,8 +22,10 @@ namespace op {
|
|||
OutputVector translate_mulmat(const NodeContext& context) {
|
||||
num_inputs_check(context, 2, 2);
|
||||
|
||||
bool continuous = context.check_if_continuous();
|
||||
if (continuous) {
|
||||
int op_case = context.get_op_case();
|
||||
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported MULMAT case");
|
||||
|
||||
if (op_case == 1) {
|
||||
auto src0 = context.get_input(0);
|
||||
auto src1 = std::make_shared<ov::op::v0::Convert>(context.get_input(1), context.get_input_type(0));
|
||||
auto result_lp = std::make_shared<ov::op::v0::MatMul>(src1, src0, false, true);
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <openvino/core/node.hpp>
|
||||
#include <openvino/core/node_output.hpp>
|
||||
#include <openvino/frontend/exception.hpp>
|
||||
#include <openvino/op/constant.hpp>
|
||||
#include <openvino/op/reshape.hpp>
|
||||
#include <vector>
|
||||
|
|
@ -19,11 +21,22 @@ OutputVector translate_reshape(const NodeContext& context) {
|
|||
return {context.get_input(0)};
|
||||
}
|
||||
|
||||
int op_case = context.get_op_case();
|
||||
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2, "Unsupported RESHAPE case");
|
||||
|
||||
auto output_shape = context.get_output_shape(0).to_shape();
|
||||
auto new_shape_node =
|
||||
ov::op::v0::Constant::create(ov::element::i64,
|
||||
{3},
|
||||
std::vector<int64_t>{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
|
||||
std::shared_ptr<ov::Node> new_shape_node;
|
||||
if (op_case == 1) {
|
||||
new_shape_node =
|
||||
ov::op::v0::Constant::create(ov::element::i64,
|
||||
{3},
|
||||
std::vector<int64_t>{-1, (int64_t)output_shape[1], (int64_t)output_shape[2]});
|
||||
} else {
|
||||
new_shape_node =
|
||||
ov::op::v0::Constant::create(ov::element::i64,
|
||||
{3},
|
||||
std::vector<int64_t>{(int64_t)output_shape[0], -1, (int64_t)output_shape[2]});
|
||||
}
|
||||
Output<Node> res = std::make_shared<ov::op::v1::Reshape>(context.get_input(0), new_shape_node, false);
|
||||
return {res};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -31,10 +31,6 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
|
|||
const auto& ggml_model = std::dynamic_pointer_cast<InputModel>(input_model);
|
||||
std::shared_ptr<GgmlDecoder> ggml_model_decoder = ggml_model->get_model_decoder();
|
||||
|
||||
FRONT_END_GENERAL_CHECK(ggml_model, "nullptr for InputModel is given for translation into OV Model");
|
||||
const auto& model_inputs = ggml_model->get_inputs();
|
||||
const auto& model_outputs = ggml_model->get_outputs();
|
||||
|
||||
for (const auto& it : ggml_model_decoder->get_model_inputs()) {
|
||||
params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
|
||||
(*tensor_map)[it.first] = it.second;
|
||||
|
|
|
|||
|
|
@ -1275,7 +1275,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
|
|||
|
||||
if (ubatch.token) {
|
||||
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
|
||||
//cb(inp->tokens, "inp_tokens", -1);
|
||||
cb(inp->tokens, "inp_tokens", -1);
|
||||
ggml_set_input(inp->tokens);
|
||||
res->t_tokens = inp->tokens;
|
||||
|
||||
|
|
@ -1327,6 +1327,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
|
|||
auto & cur = inp->pos;
|
||||
|
||||
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, (int64_t)n_tokens*hparams.n_pos_per_embd());
|
||||
cb(cur, "inp_pos", -1);
|
||||
ggml_set_input(cur);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
|
@ -1362,6 +1363,7 @@ ggml_tensor * llm_graph_context::build_inp_out_ids() const {
|
|||
auto & cur = inp->out_ids;
|
||||
|
||||
cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
|
||||
cb(cur, "inp_out_ids", -1);
|
||||
ggml_set_input(cur);
|
||||
|
||||
res->add_input(std::move(inp));
|
||||
|
|
@ -1603,6 +1605,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
|
|||
|
||||
// note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
|
||||
cb(inp->self_kq_mask, "KQ_mask", -1);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
|
|
@ -1661,7 +1664,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
|||
}
|
||||
|
||||
if (wo_b) {
|
||||
//cb(cur, "kqv_wo", il);
|
||||
cb(cur, "kqv_wo", il);
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
|
|
@ -1691,6 +1694,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
|
|||
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||
cb(inp->self_kq_mask, "KQ_mask", -1);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
|
|
@ -1818,7 +1822,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
|||
}
|
||||
|
||||
if (wo_b) {
|
||||
//cb(cur, "kqv_wo", il);
|
||||
cb(cur, "kqv_wo", il);
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
|
|
@ -1873,7 +1877,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
|||
}
|
||||
|
||||
if (wo_b) {
|
||||
//cb(cur, "kqv_wo", il);
|
||||
cb(cur, "kqv_wo", il);
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue