From 7bda5021f982c8c5d7835766bd49cea7d36f1439 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Thu, 14 Aug 2025 15:40:36 +0800 Subject: [PATCH] Fix NPU --- ggml/src/ggml-openvino/ggml-decoder.cpp | 37 ++++++++++++++++++- ggml/src/ggml-openvino/ggml-decoder.h | 1 + .../ggml-openvino/openvino/op/set_rows.cpp | 30 ++++++++++++--- ggml/src/ggml-openvino/utils.cpp | 3 ++ 4 files changed, 65 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index c952fb8eaf..472dd157ef 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -193,6 +193,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) { } break; } + case GGML_OP_SET_ROWS: { + if (std::string(node->name).find("cache_k") == 0) { + m_op_case = 1; + } else { + m_op_case = 2; + } + break; + } case GGML_OP_PERMUTE: { if (node->src[0]->view_src == nullptr) { // Permute Qcur @@ -274,8 +282,18 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size}; } else if (name.find("cache_v") == 0) { input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size}; - } else if (get_tensor_used_op(src)->op == GGML_OP_SET_ROWS) { + } else if (const auto* op = get_tensor_used_op(src); op->op == GGML_OP_SET_ROWS) { input_shape = ov::PartialShape{1, 1, -1}; + if (m_is_static) { + if (m_is_first_token) { + // Dummy static shape, since the indices are not used in this case + input_shape = ov::PartialShape{1}; + } else if (std::string(op->name).find("cache_k") == 0) { + input_shape = ov::PartialShape{1, 1, 1}; + } else { + input_shape = ov::PartialShape{1, 1, m_num_heads_kv * m_head_size}; + } + } } else if (src->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work input_shape = ov::PartialShape{get_shape(src->view_src)}; @@ -316,6 +334,7 @@ void GgmlOvDecoder::add_extra_inputs() { if (node->op == GGML_OP_SET_ROWS && std::string(node->name).find("cache_k") == 0) { assert(node->src[1]->type == GGML_TYPE_I64); past_token_len = *(int64_t*) (node->src[1]->data); + break; } } @@ -366,6 +385,22 @@ const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) throw std::runtime_error("Tensor not found in cgraph"); } +const ggml_tensor* GgmlOvDecoder::get_tensor_from_name(const std::string& name) const { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + const auto* node = m_cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + const auto* src = node->src[j]; + if (src == nullptr) { + break; + } + if (std::string(src->name) == name) { + return src; + } + } + } + return nullptr; +} + std::map GgmlOvDecoder::get_kv_param_res_names() const { std::map kv_param_res_names; for (const auto& name : m_kv_names) { diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index f6a4f74163..ae378273d3 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -119,6 +119,7 @@ public: static std::map> create_weight_nodes(struct ggml_cgraph* cgraph); const ggml_tensor* get_tensor_used_op(const ggml_tensor* tensor) const; + const ggml_tensor* get_tensor_from_name(const std::string& name) const; void clear_model_weights() { m_model_weights.clear(); } diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index b6caa372b8..758454cd9d 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "../node_context.hpp" #include "../op_table.hpp" @@ -25,21 +26,40 @@ OutputVector translate_set_rows(const NodeContext& context) { num_inputs_check(context, 2, 2); auto data = context.get_input(0); - auto indices = context.get_input(1); - auto dst = context.get_input(context.get_output_name()); + data = std::make_shared(data, context.get_output_type(0)); + auto dst_shape = context.get_output_shape(0).to_shape(); FRONT_END_OP_CONVERSION_CHECK(dst_shape[0] == 1, "Unsupported shape in SET_ROWS"); - auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); + if (context.is_static() && context.is_first_token()) { + Output res; + if (context.get_op_case() == 2) { + res = std::make_shared( + data, + ov::op::v0::Constant::create( + ov::element::i64, + {3}, + {context.get_context_size(), context.get_num_heads_kv(), context.get_head_size()}), + false); + res = std::make_shared( + res, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 2, 0})); + } else { + res = data; + } + return rename_outputs_with_suffix({res}, context.get_name()); + } + auto indices = context.get_input(1); + auto dst = context.get_input(context.get_output_name()); + + auto zero = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {0}); auto dst_reshaped = std::make_shared( dst, ov::op::v0::Constant::create(ov::element::i64, {2}, {(int64_t) dst_shape[1], (int64_t) dst_shape[2]}), false); auto indices_reshaped = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1})); - auto data_converted = std::make_shared(data, context.get_output_type(0)); - auto data_reshaped = std::make_shared(data_converted, zero); + auto data_reshaped = std::make_shared(data, zero); auto updated = std::make_shared(dst_reshaped, indices_reshaped, data_reshaped, zero); auto res = std::make_shared(updated, std::make_shared(dst), false); return rename_outputs_with_suffix({res}, context.get_name()); diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index cf0fc4dfd3..83ab7353a9 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -328,6 +328,9 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons std::copy(padded_data.begin(), padded_data.end(), data_ptr); } + } else if (const auto* op = ggml_decoder->get_tensor_used_op(ggml_decoder->get_tensor_from_name(param_name)); + op->op == GGML_OP_SET_ROWS && is_static && is_first_token) { + input_tensor = ov::Tensor(ov::element::i64, ov::Shape{1}); } else { input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name); }