This commit is contained in:
Yu, Zijun 2025-11-24 11:31:28 +08:00 committed by Mustafa Cavus
parent ae404f7cbb
commit 531941b348
4 changed files with 43 additions and 36 deletions

View File

@ -311,6 +311,11 @@ void GgmlOvDecoder::set_llm_params() {
} else { } else {
m_attention_size = mask->ne[0]; m_attention_size = mask->ne[0];
} }
if (m_is_static) {
m_attention_size = m_ctx_per_seq;
m_attention_size_swa = m_ctx_per_seq_swa;
m_token_len_per_seq = 1;
}
} else if (node->op == GGML_OP_ROPE) { } else if (node->op == GGML_OP_ROPE) {
if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) { if (name.find("Qcur-0") == 0 || std::string(node->src[0]->name).find("Qcur-0") == 0) {
@ -330,7 +335,7 @@ void GgmlOvDecoder::set_llm_params() {
void GgmlOvDecoder::validate_cgraph() const { void GgmlOvDecoder::validate_cgraph() const {
if (m_n_seq > 1 && m_is_static == true) { if (m_n_seq > 1 && m_is_static == true) {
throw std::runtime_error("n_seq > 1 is not supported on NPU"); throw std::runtime_error("n_seq > 1 is not supported on NPU. Try setting -np 1.");
} }
} }
@ -371,18 +376,24 @@ void GgmlOvDecoder::add_extra_inputs() {
// Extra inputs: // Extra inputs:
// 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned, // 1. `attention_size`, used in FLASH_ATTN where the shape of the matmul's are 256 aligned,
// see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding. // see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
// Not used for NPU.
// 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch // 2. `n_seq_active` and `seq_active_start`, used in FLASH_ATTN_EXT to indicate the active sequences in the batch
auto create_1d_input = [this](const std::string & name, int64_t size) { auto create_1d_input = [this](const std::string & name, int64_t value) {
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1}); if (m_is_static) {
param_node->set_friendly_name(name); auto constant =
param_node->output(0).get_tensor().set_names({name}); std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{value});
m_model_extra_inputs[name] = param_node; constant->set_friendly_name(name);
m_model_extra_inputs[name] = constant;
} else {
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
param_node->set_friendly_name(name);
param_node->output(0).get_tensor().set_names({name});
m_model_extra_inputs[name] = param_node;
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1}); auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
*tensor->data<int64_t>() = size; *tensor->data<int64_t>() = value;
m_model_extra_input_values[name] = tensor; m_model_extra_input_values[name] = tensor;
}
}; };
create_1d_input("attention_size", m_attention_size); create_1d_input("attention_size", m_attention_size);

View File

@ -56,9 +56,7 @@ OutputVector translate_permute(const NodeContext & context) {
int64_t n_seq = cache_shape[1].get_length(); int64_t n_seq = cache_shape[1].get_length();
Output<Node> attention_size; Output<Node> attention_size;
if (context.is_static()) { if (op_case == 2) {
attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX});
} else if (op_case == 2) {
attention_size = context.get_input("attention_size"); attention_size = context.get_input("attention_size");
} else { } else {
attention_size = context.get_input("attention_size_swa"); attention_size = context.get_input("attention_size_swa");

View File

@ -154,7 +154,9 @@ std::shared_ptr<Model> TranslateSession::translate_graph(const frontend::InputMo
} }
for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) { for (const auto & it : ggml_model_decoder->get_model_extra_inputs()) {
params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second)); if (std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second)) {
params.push_back(std::dynamic_pointer_cast<ov::op::v0::Parameter>(it.second));
}
(*tensor_map)[it.first] = it.second; (*tensor_map)[it.first] = it.second;
} }

View File

@ -129,27 +129,22 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
ov_input_names_cache[cgraph] = ov_input_names; ov_input_names_cache[cgraph] = ov_input_names;
ov_output_names_cache[cgraph] = ov_output_names; ov_output_names_cache[cgraph] = ov_output_names;
// // Set output tensors (for NPU) and kvcache i/o tensors once and for all // Set output tensors (for NPU) and kvcache i/o tensors once and for all
// // Note: does not seem to improve perf on CPU/GPU, but it breaks llama-bench, so disabled it // Note: does not seem to improve perf on CPU/GPU, but breaks llama-bench, so disabled it for CPU/GPU
// for (size_t i = 0; i < ov_output_names.size(); i++) { if (is_static) {
// auto output_name = ov_output_names[i]; for (size_t i = 0; i < ov_output_names.size(); i++) {
// if (is_static || output_name.find("cache") == 0) { auto output_name = ov_output_names[i];
// auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
// infer_request->set_output_tensor(i, output_tensor); infer_request->set_output_tensor(i, output_tensor);
// } }
// } for (size_t i = 0; i < ov_input_names.size(); i++) {
// for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i];
// auto param_name = ov_input_names[i]; if (param_name.find("cache") == 0) {
// if (param_name.find("cache") == 0) { auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
// ov::Tensor input_tensor; infer_request->set_input_tensor(i, input_tensor);
// if (is_static) { }
// input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); }
// } else { }
// input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
// }
// infer_request->set_input_tensor(i, input_tensor);
// }
// }
} }
} }
@ -336,7 +331,8 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr<GgmlOvDecoder> ggml_decode
const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name);
const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor);
if (param_name == "inp_pos" || param_name == "inp_tokens" || op->op == GGML_OP_SET_ROWS) { if (param_name == "inp_pos" || param_name == "inp_tokens" ||
(op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) {
ov::Shape input_shape = {1, 1, 1, 1}; ov::Shape input_shape = {1, 1, 1, 1};
ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
// copy the j-th value from ggml_tensor // copy the j-th value from ggml_tensor