diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index dbc3780027..c00efaf6ae 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -311,6 +311,11 @@ void GgmlOvDecoder::set_llm_params() {
             } else {
                 m_attention_size = mask->ne[0];
             }
+            if (m_is_static) {
+                m_attention_size = m_ctx_per_seq;
+                m_attention_size_swa = m_ctx_per_seq_swa;
+                m_token_len_per_seq = 1;
+            }
         } else if (node->op == GGML_OP_ROPE) {
             if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) {
@@ -330,7 +335,7 @@ void GgmlOvDecoder::validate_cgraph() const {
     if (m_n_seq > 1 && m_is_static == true) {
-        throw std::runtime_error("n_seq > 1 is not supported on NPU");
+        throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
     }
 }
@@ -371,18 +376,24 @@ void GgmlOvDecoder::add_extra_inputs() {
     // Extra inputs:
     // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned,
     // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
-    // Not used for NPU.
     // 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch
-    auto create_1d_input = [this](const std::string & name, int64_t size) {
-        auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
-        param_node->set_friendly_name(name);
-        param_node->output(0).get_tensor().set_names({name});
-        m_model_extra_inputs[name] = param_node;
+    auto create_1d_input = [this](const std::string & name, int64_t value) {
+        if (m_is_static) {
+            auto constant =
+                std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
+            constant->set_friendly_name(name);
+            m_model_extra_inputs[name] = constant;
+        } else {
+            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
+            param_node->set_friendly_name(name);
+            param_node->output(0).get_tensor().set_names({name});
+            m_model_extra_inputs[name] = param_node;
 
-        auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
-        *tensor->data<int64_t>() = size;
-        m_model_extra_input_values[name] = tensor;
+            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
+            *tensor->data<int64_t>() = value;
+            m_model_extra_input_values[name] = tensor;
+        }
     };
 
     create_1d_input("attention_size", m_attention_size);
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 772342a2ae..d156e48e3c 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -56,9 +56,7 @@ OutputVector translate_permute(const NodeContext & context) {
         int64_t n_seq = cache_shape[1].get_length();
 
         Output<Node> attention_size;
-        if (context.is_static()) {
-            attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
-        } else if (op_case == 2) {
+        if (op_case == 2) {
             attention_size = context.get_input("attention_size");
         } else {
             attention_size = context.get_input("attention_size_swa");
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index a28946c617..d12701acdc 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -154,7 +154,9 @@ std::shared_ptr<ov::Model> TranslateSession::translate_graph(const frontend::InputMo
     }
 
     for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) {
-        params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
+        if (auto param = std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second)) {
+            params.push_back(param);
+        }
         (*tensor_map)[it.first] = it.second;
     }
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 5b9ecb5f4f..6e1d7393c7 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -129,27 +129,22 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
         ov_input_names_cache[cgraph] = ov_input_names;
         ov_output_names_cache[cgraph] = ov_output_names;
 
-        // // Set output tensors (for NPU) and kvcache i/o tensors once and for all
-        // // Note: does not seem to improve perf on CPU/GPU, but it breaks llama-bench, so disabled it
-        // for (size_t i = 0; i < ov_output_names.size(); i++) {
-        //     auto output_name = ov_output_names[i];
-        //     if (is_static || output_name.find("cache") == 0) {
-        //         auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
-        //         infer_request->set_output_tensor(i, output_tensor);
-        //     }
-        // }
-        // for (size_t i = 0; i < ov_input_names.size(); i++) {
-        //     auto param_name = ov_input_names[i];
-        //     if (param_name.find("cache") == 0) {
-        //         ov::Tensor input_tensor;
-        //         if (is_static) {
-        //             input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
-        //         } else {
-        //             input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
-        //         }
-        //         infer_request->set_input_tensor(i, input_tensor);
-        //     }
-        // }
+        // Set output tensors (for NPU) and kvcache i/o tensors once and for all
+        // Note: does not seem to improve perf on CPU/GPU, but breaks llama-bench, so it is disabled for CPU/GPU
+        if (is_static) {
+            for (size_t i = 0; i < ov_output_names.size(); i++) {
+                auto output_name = ov_output_names[i];
+                auto output_tensor = get_ov_output_tensor(ggml_decoder, output_name);
+                infer_request->set_output_tensor(i, output_tensor);
+            }
+            for (size_t i = 0; i < ov_input_names.size(); i++) {
+                auto param_name = ov_input_names[i];
+                if (param_name.find("cache") == 0) {
+                    auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
+                    infer_request->set_input_tensor(i, input_tensor);
+                }
+            }
+        }
     }
 }
@@ -336,7 +331,8 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr<GgmlOvDecoder> ggml_decode
     const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
     const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
 
-    if (param_name == "inp_pos" || param_name == "inp_tokens" || op->op == GGML_OP_SET_ROWS) {
+    if (param_name == "inp_pos" || param_name == "inp_tokens" ||
+        (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
         ov::Shape input_shape = {1, 1, 1, 1};
         ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
         // copy the j-th value from ggml_tensor
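
The core idea behind the add_extra_inputs / translate_graph changes above: when the graph is compiled for a static-shape target (NPU), a scalar "extra input" such as attention_size is known at compile time and is folded into the model as a Constant, so only genuine Parameter nodes are registered as model inputs and no runtime tensor needs to be fed for it. A minimal, self-contained sketch of that pattern follows; it is not code from this patch, and the helper name make_extra_input is made up for illustration.

// Illustrative sketch only (not part of this patch): bake a 1-D i64 "extra input"
// into the graph as a Constant when shapes are static (NPU), or expose it as a
// named Parameter that is fed an ov::Tensor per inference otherwise.
#include <cstdint>
#include <memory>
#include <string>
#include <openvino/openvino.hpp>
#include <openvino/op/constant.hpp>
#include <openvino/op/parameter.hpp>

std::shared_ptr<ov::Node> make_extra_input(bool is_static, const std::string & name, int64_t value) {
    if (is_static) {
        // Value is fixed at compile time: fold it in, no runtime input needed.
        auto constant = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {value});
        constant->set_friendly_name(name);
        return constant;
    }
    // Value changes between inferences: keep it as a named model input.
    auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
    param->set_friendly_name(name);
    param->output(0).get_tensor().set_names({name});
    return param;
}

On the dynamic path the caller still has to allocate a one-element i64 ov::Tensor holding the value and pass it at inference time (e.g. via ov::InferRequest::set_tensor), which is exactly the m_model_extra_input_values bookkeeping that the static NPU path no longer needs.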