Fix llama-bench

Yu, Zijun 2025-11-20 16:23:17 +08:00 committed by Mustafa Cavus
parent 072dde0b2b
commit ae404f7cbb
4 changed files with 40 additions and 30 deletions


@@ -301,7 +301,9 @@ void GgmlOvDecoder::set_llm_params() {
m_n_seq_active = mask->ne[3];
auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type);
m_seq_active_start = ((size_t *) cache_k_view->op_params)[0] / seq_size;
size_t offset;
memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
m_seq_active_start = offset / seq_size;
m_token_len_per_seq = node->ne[2];
if (mask_name.find("swa") != std::string::npos) {
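The replaced line read the view offset by casting op_params to a size_t pointer, which assumes the buffer is suitably aligned and typed for size_t; the new lines copy the bytes out with memcpy, matching how ggml itself writes view parameters into op_params. A minimal standalone sketch of the same pattern (the struct below is a simplified stand-in for ggml_tensor, not the real definition):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Simplified stand-in for ggml's op_params: an int32 array that ops reuse
// as an opaque byte buffer (e.g. a view op stores a byte offset in it).
struct fake_tensor {
    int32_t op_params[16];
};

// Portable read: copy sizeof(size_t) bytes out of the buffer instead of
// dereferencing it through a size_t*, so alignment and aliasing rules hold.
static size_t read_view_offset(const fake_tensor & t) {
    size_t offset = 0;
    std::memcpy(&offset, t.op_params, sizeof(offset));
    return offset;
}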
@@ -346,6 +348,13 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
input_shape = ov::PartialShape{-1, 1, -1, -1};
}
} else if (name.find("cache_") == 0) {
input_shape = ov::PartialShape{get_shape(input)};
if (!m_is_static) {
// do not fix ctx size to make llama-bench work
input_shape[2] = -1;
}
} else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) {
input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1};

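With this change, the cache_* inputs keep a dynamic context dimension whenever the model is not compiled statically, so the compiled model is not pinned to one context size; the in-code comment ties this to llama-bench, which presumably exercises different context lengths across runs. A minimal sketch of marking one dimension of an ov::PartialShape dynamic, with illustrative extents rather than values taken from a real model:

#include <openvino/core/partial_shape.hpp>
#include <iostream>

int main() {
    // Concrete cache-like shape; the numbers are made up for illustration.
    ov::PartialShape cache_shape{1, 8, 4096, 128};
    // Relax the per-sequence context dimension: assigning -1 constructs a
    // dynamic ov::Dimension, so the compiled model accepts any extent there.
    cache_shape[2] = -1;
    std::cout << cache_shape << std::endl;  // the relaxed dimension now prints as dynamic
}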

@@ -52,7 +52,7 @@ OutputVector translate_permute(const NodeContext & context) {
auto output_shape = context.get_output_shape(0).to_shape();
int64_t head_size = output_shape[3];
int64_t n_heads = output_shape[1];
int64_t ctx_per_seq = cache_shape[2].get_length();
int64_t ctx_per_seq = cache_shape[2].is_static() ? cache_shape[2].get_length() : -1;
int64_t n_seq = cache_shape[1].get_length();
Output<Node> attention_size;

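Since the cache shape can now carry a dynamic context dimension, the translator only calls get_length() when the dimension is static and otherwise propagates -1; ov::Dimension::get_length() is not valid on a dynamic dimension. A small helper sketching the same guard:

#include <openvino/core/dimension.hpp>
#include <cstdint>

// Return the extent if it is known at graph-build time, otherwise -1,
// mirroring the ctx_per_seq handling above.
static int64_t length_or_minus_one(const ov::Dimension & d) {
    return d.is_static() ? d.get_length() : -1;
}

The neighbouring n_seq read (cache_shape[1]) is left unguarded, so the number of sequences is still assumed to stay static.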

@@ -18,6 +18,7 @@
#include <openvino/op/slice.hpp>
#include <openvino/op/squeeze.hpp>
#include <openvino/op/transpose.hpp>
#include <vector>
namespace ov {
namespace frontend {
@@ -48,8 +49,13 @@ OutputVector translate_set_rows(const NodeContext & context) {
if (auto dst_reshape = std::dynamic_pointer_cast<ov::op::v1::Reshape>(dst.get_node_shared_ptr())) {
// Fix the case of multiple sequences, reshape back to original shape [1, n_seq, ctx_per_seq, emb]
res = std::make_shared<ov::op::v1::Reshape>(
res, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_reshape->get_input_shape(0)), false);
// ctx_per_seq is not fixed due to llama-bench compatibility
auto dst_shape_partial = dst_reshape->get_input_partial_shape(0);
std::vector<int64_t> dst_shape = {dst_shape_partial[0].get_length(), dst_shape_partial[1].get_length(),
dst_shape_partial[2].is_static() ? dst_shape_partial[2].get_length() : -1,
dst_shape_partial[3].get_length()};
res = std::make_shared<ov::op::v1::Reshape>(res, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_shape),
false);
}
return rename_outputs_with_suffix({res}, context.get_name());
}
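Rather than copying the destination's fully static input shape into the Reshape target, the translator now builds the target from the partial shape and substitutes -1 for the possibly dynamic ctx_per_seq entry; ov::op::v1::Reshape interprets a single -1 in the pattern as "infer this extent from the element count". A minimal sketch of that pattern with made-up extents:

#include <openvino/op/constant.hpp>
#include <openvino/op/parameter.hpp>
#include <openvino/op/reshape.hpp>
#include <memory>
#include <vector>

int main() {
    // Input whose middle extent is unknown until runtime.
    auto data = std::make_shared<ov::op::v0::Parameter>(
        ov::element::f32, ov::PartialShape{1, -1, 128});

    // Target pattern: the -1 entry lets Reshape infer that dimension from
    // the total element count; special_zero=false treats 0 entries literally.
    std::vector<int64_t> target = {1, 2, -1, 128};
    auto pattern = ov::op::v0::Constant::create(ov::element::i64, {4}, target);
    auto reshaped = std::make_shared<ov::op::v1::Reshape>(data, pattern, false);
}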


@@ -129,26 +129,27 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
ov_input_names_cache[cgraph] = ov_input_names;
ov_output_names_cache[cgraph] = ov_output_names;
// Set output tensors (for NPU) and kvcache i/o tensors once and for all
for (size_t i = 0; i < ov_output_names.size(); i++) {
auto output_name = ov_output_names[i];
if (is_static || output_name.find("cache") == 0) {
auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
infer_request->set_output_tensor(i, output_tensor);
}
}
for (size_t i = 0; i < ov_input_names.size(); i++) {
auto param_name = ov_input_names[i];
if (param_name.find("cache") == 0) {
ov::Tensor input_tensor;
if (is_static) {
input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
} else {
input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
}
infer_request->set_input_tensor(i, input_tensor);
}
}
// // Set output tensors (for NPU) and kvcache i/o tensors once and for all
// // Note: does not seem to improve perf on CPU/GPU, but it breaks llama-bench, so disabled it
// for (size_t i = 0; i < ov_output_names.size(); i++) {
// auto output_name = ov_output_names[i];
// if (is_static || output_name.find("cache") == 0) {
// auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
// infer_request->set_output_tensor(i, output_tensor);
// }
// }
// for (size_t i = 0; i < ov_input_names.size(); i++) {
// auto param_name = ov_input_names[i];
// if (param_name.find("cache") == 0) {
// ov::Tensor input_tensor;
// if (is_static) {
// input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
// } else {
// input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
// }
// infer_request->set_input_tensor(i, input_tensor);
// }
// }
}
}
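The disabled block tried to bind the kv-cache (and, for NPU, output) tensors once when the graph was first compiled; per its own comment this brought no measurable gain on CPU/GPU and broke llama-bench, presumably because the bench rebuilds graphs and reallocates buffers between runs, leaving the cached bindings stale. With it gone, tensors are bound on every compute call, as in the loops in the following hunks. A minimal sketch of that per-call binding, where get_tensor_for is a hypothetical stand-in for the backend's get_ov_input_tensor lookup:

#include <openvino/runtime/infer_request.hpp>
#include <openvino/runtime/tensor.hpp>
#include <functional>
#include <string>
#include <vector>

// Re-bind every parameter on every call instead of caching the bindings.
// get_tensor_for maps a parameter name to the ov::Tensor wrapping the
// current buffer for this graph (hypothetical helper).
static void bind_and_infer(ov::InferRequest & request,
                           const std::vector<std::string> & input_names,
                           const std::function<ov::Tensor(const std::string &)> & get_tensor_for) {
    for (size_t i = 0; i < input_names.size(); ++i) {
        request.set_input_tensor(i, get_tensor_for(input_names[i]));
    }
    request.infer();
}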
@@ -158,9 +159,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
if (!is_static) {
for (size_t i = 0; i < ov_input_names.size(); i++) {
auto param_name = ov_input_names[i];
if (param_name.find("cache") == 0) {
continue;
}
auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
infer_request->set_input_tensor(i, input_tensor);
@@ -188,9 +186,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
for (int j = 0; j < input_len; j++) {
for (size_t i = 0; i < ov_input_names.size(); i++) {
auto param_name = ov_input_names[i];
if (param_name.find("cache") == 0) {
continue;
}
auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len);
infer_request->set_input_tensor(i, input_tensor);
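With the "cache" prefix skip removed from both loops, the dynamic path and the static (NPU) prompt path re-bind every parameter, cache tensors included, on each iteration. A rough sketch of the static loop shape, with fetch_static_tensor as a hypothetical stand-in for get_ov_input_tensor_static and a per-position inference assumed:

#include <openvino/runtime/infer_request.hpp>
#include <openvino/runtime/tensor.hpp>
#include <functional>
#include <string>
#include <vector>

// Per-token binding loop for the static path: every parameter, cache
// tensors included, is set for each position j before running inference.
static void run_static_prompt(ov::InferRequest & request,
                              const std::vector<std::string> & input_names,
                              int input_len,
                              const std::function<ov::Tensor(const std::string &, int, int)> & fetch_static_tensor) {
    for (int j = 0; j < input_len; ++j) {
        for (size_t i = 0; i < input_names.size(); ++i) {
            request.set_input_tensor(i, fetch_static_tensor(input_names[i], j, input_len));
        }
        request.infer();
    }
}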