From 9de874cb7b936333da396fd73b8b49c7109d48fb Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Tue, 16 Sep 2025 16:30:45 +0800
Subject: [PATCH] Support iSWA

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       | 103 ++++++++++++------
 ggml/src/ggml-openvino/ggml-decoder.h         |  13 ++-
 ggml/src/ggml-openvino/openvino/decoder.hpp   |   2 +
 .../ggml-openvino/openvino/node_context.hpp   |  13 +--
 .../openvino/op/flash_attn_ext.cpp            |   9 +-
 .../src/ggml-openvino/openvino/op/permute.cpp |  38 ++-----
 .../openvino/translate_session.cpp            |  21 +++-
 ggml/src/ggml-openvino/utils.cpp              |   2 +-
 src/llama-graph.cpp                           |   4 +-
 9 files changed, 124 insertions(+), 81 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index bc528e0cfb..e3dd5e0c1d 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -30,17 +30,21 @@
 #include
 #include
 #include
+#include

 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
 #include "ggml-quants.hpp"

 GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
-                             int context_size, int num_heads, int num_heads_kv, int head_size) :
+                             int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size,
+                             const std::vector<int>& swa_layers) :
     m_cgraph(cgraph),
     m_node(node),
     m_op_name(std::string(node->name)),
     m_context_size(context_size),
+    m_context_size_swa(context_size_swa),
+    m_swa_layers(swa_layers),
     m_num_heads(num_heads),
     m_num_heads_kv(num_heads_kv),
     m_head_size(head_size),
@@ -204,11 +208,14 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
         if (node->src[0]->op != GGML_OP_VIEW) {
             m_op_case = 1;
         } else if (ggml_is_contiguous(node->src[0])) {
-            // Permute cache_k (view)
-            m_op_case = 2;
-        } else {
-            // Permute cache_v (view), deprecated, cache_v will also fall to case 2
-            m_op_case = 3;
+            // Permute kv cache (view)
+            std::string src_name(node->view_src->name);
+            int layer = extract_layer_from_name(src_name);
+            if (!is_swa_layer(layer)) {
+                m_op_case = 2;
+            } else {
+                m_op_case = 3;
+            }
         }
         break;
     }
@@ -239,13 +246,34 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node, bool naive) {
     }
 }

+int extract_layer_from_name(const std::string& name) {
+    size_t pos1 = name.find("_l");
+    assert(pos1 != std::string::npos);
+    pos1 += 2;
+    size_t pos2 = name.find(' ', pos1);
+    if (pos2 == std::string::npos) {
+        pos2 = name.length();
+    }
+    std::string layer_str = name.substr(pos1, pos2 - pos1);
+    int layer = std::stoi(layer_str);
+    return layer;
+}
+
 void GgmlOvDecoder::set_llm_params() {
     for (int i = 0; i < m_cgraph->n_nodes; i++) {
         auto* node = m_cgraph->nodes[i];
         std::string name = std::string(node->name);
-        if (node->op == GGML_OP_VIEW && std::string(node->name) == "cache_k_l0 (view)") {
-            auto* cache_k = node->src[0];
-            m_context_size = cache_k->ne[1];
+        if (node->op == GGML_OP_FLASH_ATTN_EXT) {
+            auto* cache_k = node->src[1];
+            cache_k = cache_k->view_src ? cache_k->view_src : cache_k;
+            int layer = extract_layer_from_name(cache_k->name);
+
+            if (std::string(node->src[3]->name).find("swa") != std::string::npos) {
+                m_swa_layers.push_back(layer);
+                m_context_size_swa = cache_k->ne[1];
+            } else {
+                m_context_size = cache_k->ne[1];
+            }
         } else if (node->op == GGML_OP_ROPE &&
                    (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0)) {
             m_head_size = node->ne[0];
@@ -269,11 +297,11 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
                 input_shape = ov::PartialShape{1, 1, 1};
             }
         } else {
-            input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
+            input_shape = ov::PartialShape{1, 1, -1};
         }
     } else if (name == "inp_out_ids" && !m_is_static) {
-        input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
-    } else if (name == "KQ_mask") {
+        input_shape = ov::PartialShape{1, 1, -1};
+    } else if (name.find("KQ_mask") == 0) {
         if (m_is_static) {
             if (m_is_first_token) {
                 input_shape = ov::PartialShape{1, m_context_size, m_context_size};
@@ -281,13 +309,12 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
                 input_shape = ov::PartialShape{1, 1, m_context_size};
             }
         } else {
-            auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD);
-            input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
+            input_shape = ov::PartialShape{1, -1, -1};
         }
-    } else if (name.find("cache_k") == 0) {
-        input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
-    } else if (name.find("cache_v") == 0) {
-        input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
+    } else if (name.find("cache_") == 0) {
+        int layer = extract_layer_from_name(name);
+        bool is_swa = is_swa_layer(layer);
+        input_shape = ov::PartialShape{is_swa ? m_context_size_swa : m_context_size, m_num_heads_kv, m_head_size};
     } else if (const auto* op = get_tensor_used_op(src); op && op->op == GGML_OP_SET_ROWS) {
         input_shape = ov::PartialShape{1, 1, m_is_static ? 1 : -1};
     } else if (src->op == GGML_OP_VIEW) {
@@ -305,35 +332,35 @@ void GgmlOvDecoder::add_extra_inputs() {
     // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
     // Not used for NPU
     int64_t attention_size = -1;
+    int64_t attention_size_swa = -1;
     for (const auto& node : m_nodes) {
-        if (node->op == GGML_OP_SOFT_MAX) {
-            auto* mask = node->src[1];
-            if (std::string(mask->name).find("KQ_mask") != 0) {
-                throw std::runtime_error("Unexpected softmax node: " + std::string(mask->name));
-            }
-            attention_size = mask->ne[0];
-            break;
-        }
         if (node->op == GGML_OP_FLASH_ATTN_EXT) {
             auto* mask = node->src[3];
-            if (std::string(mask->name).find("KQ_mask") != 0) {
+            std::string mask_name(mask->name);
+            if (mask_name.find("KQ_mask") != 0) {
                 throw std::runtime_error("Unexpected flash attention node: " + std::string(mask->name));
             }
-            attention_size = mask->ne[0];
+            if (mask_name.find("swa") != std::string::npos) {
+                attention_size_swa = mask->ne[0];
+            } else {
+                attention_size = mask->ne[0];
+            }
         }
     }

-    {
-        std::string name = "attention_size";
+    auto create_attention_size_input = [this](const std::string& name, int64_t size) {
         auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
         param_node->set_friendly_name(name);
         param_node->output(0).get_tensor().set_names({name});
         m_model_extra_inputs[name] = param_node;

         auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
-        *tensor->data<int64_t>() = attention_size;
+        *tensor->data<int64_t>() = size;
         m_model_extra_input_values[name] = tensor;
-    }
+    };
+
+    create_attention_size_input("attention_size", attention_size);
+    create_attention_size_input("attention_size_swa", attention_size_swa);
 }

 const ggml_tensor* GgmlOvDecoder::get_tensor_used_op(const ggml_tensor* tensor) const {
@@ -706,8 +733,16 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const {

 void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
     for (const auto& node : m_nodes) {
-        auto decoder = std::make_shared<GgmlOvDecoder>(
-            node, m_cgraph, m_is_static, m_is_first_token, m_context_size, m_num_heads, m_num_heads_kv, m_head_size);
+        auto decoder = std::make_shared<GgmlOvDecoder>(node,
+                                                       m_cgraph,
+                                                       m_is_static,
+                                                       m_is_first_token,
+                                                       m_context_size,
+                                                       m_context_size_swa,
+                                                       m_num_heads,
+                                                       m_num_heads_kv,
+                                                       m_head_size,
+                                                       m_swa_layers);
         node_visitor(decoder);
     }
 }
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 4ba147da20..35e79ecefc 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -19,7 +19,8 @@ public:
     // Node decoder, called in GgmlOvDecoder::visit_subgraph
     GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token,
-                  int context_size, int num_heads, int num_heads_kv, int head_size);
+                  int context_size, int context_size_swa, int num_heads, int num_heads_kv, int head_size,
+                  const std::vector<int>& swa_layers);

     // Naive graph decoder
     GgmlOvDecoder(struct ggml_cgraph* cgraph, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);

@@ -101,6 +102,12 @@ public:
     virtual int get_context_size() const override { return m_context_size; }

+    virtual int get_context_size_swa() const override { return m_context_size_swa; }
+
+    virtual int is_swa_layer(int layer) const override {
+        return std::find(m_swa_layers.begin(), m_swa_layers.end(), layer) != m_swa_layers.end();
+    }
+
     virtual int get_num_heads() const override { return m_num_heads; }

     virtual int get_num_heads_kv() const override { return m_num_heads_kv; }
@@ -156,6 +163,8 @@ private:
     std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
     std::vector<std::string> m_model_output_names;
     int m_context_size;
+    int m_context_size_swa;
+    std::vector<int> m_swa_layers;
     int m_num_heads;
     int m_num_heads_kv;
     int m_head_size;
@@ -166,3 +175,5 @@
 };

 void print_tensor_address_map(const struct ggml_cgraph* cgraph);
+
+int extract_layer_from_name(const std::string& name);
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index a3387ba394..6f11ff1283 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -67,6 +67,8 @@ public:
     virtual bool is_static() const = 0;
     virtual bool is_first_token() const = 0;
     virtual int get_context_size() const = 0;
+    virtual int get_context_size_swa() const = 0;
+    virtual int is_swa_layer(int layer) const = 0;
 };

 } // namespace ggml
diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp
index cc1b5c0332..a64ae098ab 100644
--- a/ggml/src/ggml-openvino/openvino/node_context.hpp
+++ b/ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -2,6 +2,7 @@

 #include
 #include
+#include

 #include "decoder.hpp"
@@ -30,6 +31,8 @@ public:
         return m_translate_session;
     }

+    const std::vector<std::string>& get_input_names() const { return m_input_names; }
+
     size_t get_input_size() const override {
         return m_decoder->get_input_size();
     }
@@ -101,15 +104,7 @@ public:
         return m_decoder->is_first_token();
     }

-    int get_num_heads() const { return m_decoder->get_num_heads(); }
-
-    int get_num_heads_kv() const { return m_decoder->get_num_heads_kv(); }
-
-    int get_head_size() const { return m_decoder->get_head_size(); }
-
-    int get_context_size() const { return m_decoder->get_context_size(); }
-
-    private:
+private:
     std::shared_ptr<GgmlDecoder> m_decoder;
     std::shared_ptr& m_tensor_map;
     TranslateSession* m_translate_session;
diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
index d97603d98a..8b67778fb9 100644
--- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include

 #include "../node_context.hpp"
 #include "../op_table.hpp"
@@ -32,8 +33,12 @@ OutputVector translate_flash_attn_ext(const NodeContext& context) {
     auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, std::vector<float>{scale});

     ov::Output<ov::Node> mask_sliced;
-    if (context.has_input("KQ_mask_sliced")) {
-        mask_sliced = context.get_input("KQ_mask_sliced");
+    std::string mask_name = "KQ_mask_sliced";
+    if (context.get_input_names()[3].find("swa") != std::string::npos) {
+        mask_name = "KQ_mask_swa_sliced";
+    }
+    if (context.has_input(mask_name)) {
+        mask_sliced = context.get_input(mask_name);
     } else {
         auto token_len = get_dimensions(q, {1});
         auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index fcb091016a..086b1e4cdb 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -29,43 +29,29 @@ OutputVector translate_permute(const NodeContext& context) {
                                                       ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
     } else {
         auto src = context.get_input(0);
-        auto attention_size = context.get_input("attention_size");
+        Output<Node> attention_size;
         if (context.is_static()) {
             attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
+        } else if (op_case == 2) {
+            attention_size = context.get_input("attention_size");
+        } else {
+            attention_size = context.get_input("attention_size_swa");
         }

         auto src_shape_ = context.get_input_shape(0).to_shape();
         std::vector<int64_t> src_shape(src_shape_.begin(), src_shape_.end());
-        std::shared_ptr<ov::Node> src_reshaped;
-        if (op_case == 2) {
-            src_reshaped = std::make_shared<ov::op::v1::Reshape>(
-                src,
-                ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
-                false);
-        } else {
-            src_reshaped = std::make_shared<ov::op::v1::Reshape>(
-                src,
-                ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{src_shape[1], src_shape[0], -1}),
-                false);
-        }
+        auto src_reshaped = std::make_shared<ov::op::v1::Reshape>(
+            src,
+            ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{-1, src_shape[1], src_shape[2]}),
+            false);

         auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-        auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-        std::shared_ptr<ov::Node> slice_axis;
-        if (op_case == 2) {
-            slice_axis = zero;
-        } else {
-            slice_axis = two;
-        }
-        auto src_slice = std::make_shared<ov::op::v8::Slice>(src_reshaped, zero, attention_size, one, slice_axis);
+        auto src_slice = std::make_shared<ov::op::v8::Slice>(src_reshaped, zero, attention_size, one, zero);

-        if (op_case == 2) {
-            res = std::make_shared<ov::op::v1::Transpose>(src_slice, ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
-        } else {
-            res = src_slice;
-        }
+        res = std::make_shared<ov::op::v1::Transpose>(src_slice,
+                                                      ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 0, 2}));
     }
     return rename_outputs_with_suffix({res}, context.get_name());
 }
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 3b8c30361a..9c82fe5f85 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -78,13 +78,22 @@ void add_token_len(TensorMap& tensor_map) {
 }

 void add_sliced_mask(TensorMap& tensor_map) {
-    auto mask = tensor_map.at("KQ_mask").get_node_shared_ptr();
     auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
-    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
-    auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
-    std::shared_ptr<ov::Node> mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
-    mask_sliced->set_friendly_name("KQ_mask_sliced");
-    tensor_map.insert({"KQ_mask_sliced", mask_sliced->output(0)});
+
+    auto create_sliced_mask = [&](const std::string& mask_name, const std::string& sliced_name) {
+        if (tensor_map.find(mask_name) != tensor_map.end()) {
+            auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+            auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+            auto mask = tensor_map.at(mask_name).get_node_shared_ptr();
+            std::shared_ptr<ov::Node> mask_sliced =
+                std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, one);
+            mask_sliced->set_friendly_name(sliced_name);
+            tensor_map.insert({sliced_name, mask_sliced->output(0)});
+        }
+    };
+
+    create_sliced_mask("KQ_mask", "KQ_mask_sliced");
+    create_sliced_mask("KQ_mask_swa", "KQ_mask_swa_sliced");
 }

 void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 7b696769fb..8724404098 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -362,7 +362,7 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons
             input_tensor = convert_ggml_input_to_ov(ggml_decoder, param_name);
         }

-    } else if (param_name == "KQ_mask") {
+    } else if (param_name.find("KQ_mask") == 0) {
         size_t context_size = ggml_decoder->get_context_size();
         const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
         if (is_first_token) {
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index e9fbff5995..257d86cd3e 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1605,7 +1605,7 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
     inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
-    cb(inp->self_kq_mask, "KQ_mask", -1);
+    cb(inp->self_kq_mask, "self_kq_mask", -1);
     ggml_set_input(inp->self_kq_mask);

     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

@@ -1694,7 +1694,7 @@ static std::unique_ptr build_attn_inp_kv_impl(
     inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);

     inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
-    ggml_set_name(inp->self_kq_mask, "KQ_mask");
+    ggml_set_name(inp->self_kq_mask, "self_kq_mask");
     ggml_set_input(inp->self_kq_mask);

     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
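
Illustrative note (not part of the patch): the iSWA bookkeeping above rests on two pieces, parsing the layer index out of a kv-cache tensor name via extract_layer_from_name (names such as "cache_k_l0" or "cache_v_l3 (view)"), and checking that index against the list of SWA layers that the decoder collects from the GGML_OP_FLASH_ATTN_EXT nodes whose mask name contains "swa". A minimal standalone C++ sketch of that behaviour follows; the swa_layers contents are made up for illustration only.

    // Sketch only: mirrors extract_layer_from_name / is_swa_layer from the patch.
    #include <algorithm>
    #include <cassert>
    #include <iostream>
    #include <string>
    #include <vector>

    // Same parsing logic as the free function added in ggml-decoder.cpp:
    // find "_l", then read digits up to the next space (or end of string).
    static int extract_layer_from_name(const std::string& name) {
        size_t pos1 = name.find("_l");
        assert(pos1 != std::string::npos);
        pos1 += 2;
        size_t pos2 = name.find(' ', pos1);
        if (pos2 == std::string::npos) {
            pos2 = name.length();
        }
        return std::stoi(name.substr(pos1, pos2 - pos1));
    }

    int main() {
        // Hypothetical SWA layer list; in the patch it is filled from the
        // FLASH_ATTN_EXT nodes whose KQ_mask name contains "swa".
        std::vector<int> swa_layers = {1, 3, 5};

        for (const std::string name : {"cache_k_l0", "cache_v_l3 (view)", "cache_k_l5"}) {
            int layer = extract_layer_from_name(name);
            bool is_swa = std::find(swa_layers.begin(), swa_layers.end(), layer) != swa_layers.end();
            std::cout << name << " -> layer " << layer << (is_swa ? " (SWA)" : " (full context)") << '\n';
        }
        return 0;
    }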