From ae404f7cbb177f3d8c4f445dcb2f697f8a3ef28a Mon Sep 17 00:00:00 2001
From: "Yu, Zijun"
Date: Thu, 20 Nov 2025 16:23:17 +0800
Subject: [PATCH] Fix llama-bench

---
 ggml/src/ggml-openvino/ggml-decoder.cpp       | 11 ++++-
 .../src/ggml-openvino/openvino/op/permute.cpp |  2 +-
 .../ggml-openvino/openvino/op/set_rows.cpp    | 10 +++-
 ggml/src/ggml-openvino/utils.cpp              | 47 +++++++++----------
 4 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 0f913bdd75..dbc3780027 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -301,7 +301,9 @@ void GgmlOvDecoder::set_llm_params() {
         m_n_seq_active = mask->ne[3];
         auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type);
-        m_seq_active_start = ((size_t *) cache_k_view->op_params)[0] / seq_size;
+        size_t offset;
+        memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
+        m_seq_active_start = offset / seq_size;
         m_token_len_per_seq = node->ne[2];
 
         if (mask_name.find("swa") != std::string::npos) {
@@ -346,6 +348,13 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
             input_shape = ov::PartialShape{-1, 1, -1, -1};
         }
 
+    } else if (name.find("cache_") == 0) {
+        input_shape = ov::PartialShape{get_shape(input)};
+        if (!m_is_static) {
+            // do not fix the ctx size, so that llama-bench works
+            input_shape[2] = -1;
+        }
+
     } else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) {
         input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1};
 
diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp
index 2fe2325d6a..772342a2ae 100644
--- a/ggml/src/ggml-openvino/openvino/op/permute.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp
@@ -52,7 +52,7 @@ OutputVector translate_permute(const NodeContext & context) {
     auto output_shape = context.get_output_shape(0).to_shape();
     int64_t head_size = output_shape[3];
     int64_t n_heads = output_shape[1];
-    int64_t ctx_per_seq = cache_shape[2].get_length();
+    int64_t ctx_per_seq = cache_shape[2].is_static() ? cache_shape[2].get_length() : -1;
     int64_t n_seq = cache_shape[1].get_length();
 
     Output<Node> attention_size;
diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
index d71aca1d7f..a323e5ed38 100644
--- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 
 namespace ov {
 namespace frontend {
@@ -48,8 +49,13 @@ OutputVector translate_set_rows(const NodeContext & context) {
     if (auto dst_reshape = std::dynamic_pointer_cast<ov::op::v1::Reshape>(dst.get_node_shared_ptr())) {
         // Fix the case of multiple sequences, reshape back to original shape [1, n_seq, ctx_per_seq, emb]
-        res = std::make_shared<ov::op::v1::Reshape>(
-            res, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_reshape->get_input_shape(0)), false);
+        // ctx_per_seq is left dynamic for llama-bench compatibility
+        auto dst_shape_partial = dst_reshape->get_input_partial_shape(0);
+        std::vector<int64_t> dst_shape = {dst_shape_partial[0].get_length(), dst_shape_partial[1].get_length(),
+                                          dst_shape_partial[2].is_static() ? dst_shape_partial[2].get_length() : -1,
+                                          dst_shape_partial[3].get_length()};
+        res = std::make_shared<ov::op::v1::Reshape>(res, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_shape),
+                                                    false);
     }
     return rename_outputs_with_suffix({res}, context.get_name());
 }
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 63e808c038..5b9ecb5f4f 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -129,26 +129,27 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
         ov_input_names_cache[cgraph] = ov_input_names;
         ov_output_names_cache[cgraph] = ov_output_names;
 
-        // Set output tensors (for NPU) and kvcache i/o tensors once and for all
-        for (size_t i = 0; i < ov_output_names.size(); i++) {
-            auto output_name = ov_output_names[i];
-            if (is_static || output_name.find("cache") == 0) {
-                auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
-                infer_request->set_output_tensor(i, output_tensor);
-            }
-        }
-        for (size_t i = 0; i < ov_input_names.size(); i++) {
-            auto param_name = ov_input_names[i];
-            if (param_name.find("cache") == 0) {
-                ov::Tensor input_tensor;
-                if (is_static) {
-                    input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
-                } else {
-                    input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
-                }
-                infer_request->set_input_tensor(i, input_tensor);
-            }
-        }
+        // // Set output tensors (for NPU) and kvcache i/o tensors once and for all
+        // // Note: this does not seem to improve perf on CPU/GPU, and it breaks llama-bench, so it is disabled
+        // for (size_t i = 0; i < ov_output_names.size(); i++) {
+        //     auto output_name = ov_output_names[i];
+        //     if (is_static || output_name.find("cache") == 0) {
+        //         auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
+        //         infer_request->set_output_tensor(i, output_tensor);
+        //     }
+        // }
+        // for (size_t i = 0; i < ov_input_names.size(); i++) {
+        //     auto param_name = ov_input_names[i];
+        //     if (param_name.find("cache") == 0) {
+        //         ov::Tensor input_tensor;
+        //         if (is_static) {
+        //             input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
+        //         } else {
+        //             input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
+        //         }
+        //         infer_request->set_input_tensor(i, input_tensor);
+        //     }
+        // }
     }
 }
@@ -158,9 +159,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
     if (!is_static) {
         for (size_t i = 0; i < ov_input_names.size(); i++) {
             auto param_name = ov_input_names[i];
-            if (param_name.find("cache") == 0) {
-                continue;
-            }
 
             auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
             infer_request->set_input_tensor(i, input_tensor);
@@ -188,9 +186,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
     for (int j = 0; j < input_len; j++) {
         for (size_t i = 0; i < ov_input_names.size(); i++) {
             auto param_name = ov_input_names[i];
-            if (param_name.find("cache") == 0) {
-                continue;
-            }
 
             auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len);
             infer_request->set_input_tensor(i, input_tensor);
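
A note on the op_params change in ggml-decoder.cpp: ggml stores per-op parameters in
ggml_tensor::op_params, which is int32_t storage, so the old read
((size_t *) cache_k_view->op_params)[0] type-puns that storage and is undefined behavior
(strict aliasing, and the pointer may be under-aligned for size_t on some targets). The
memcpy into a properly typed local is the well-defined equivalent. Below is a minimal
standalone sketch of the pattern, not part of the patch; fake_tensor and the offset value
are hypothetical stand-ins:

// Standalone sketch (not from the patch): reading a size_t out of int32_t storage.
#include <cstdint>
#include <cstdio>
#include <cstring>

struct fake_tensor {           // hypothetical stand-in for ggml_tensor
    int32_t op_params[16];     // op parameters kept as raw int32_t words, as in ggml
};

int main() {
    fake_tensor t{};
    size_t view_offset = 12288;  // hypothetical byte offset of a KV-cache view
    std::memcpy(t.op_params, &view_offset, sizeof(view_offset));  // writer side

    // Reader side, mirroring the patched code in set_llm_params():
    size_t offset;
    std::memcpy(&offset, t.op_params, sizeof(size_t));
    // The replaced form, ((size_t *) t.op_params)[0], is undefined behavior.

    std::printf("offset = %zu\n", offset);
    return 0;
}

On the shape changes: in OpenVINO, a PartialShape dimension written as -1 is dynamic, and
Dimension::get_length() throws on a dynamic dimension, which is why the patch guards each
get_length() call with is_static() in permute.cpp and set_rows.cpp once the KV-cache
context dimension is no longer fixed.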