Fix llama-bench

parent 072dde0b2b
commit ae404f7cbb
@@ -301,7 +301,9 @@ void GgmlOvDecoder::set_llm_params() {
     m_n_seq_active = mask->ne[3];
     auto seq_size = cache_k->ne[0] * cache_k->ne[1] * ggml_type_size(cache_k->type);
-    m_seq_active_start = ((size_t *) cache_k_view->op_params)[0] / seq_size;
+    size_t offset;
+    memcpy(&offset, cache_k_view->op_params, sizeof(size_t));
+    m_seq_active_start = offset / seq_size;
    m_token_len_per_seq = node->ne[2];

    if (mask_name.find("swa") != std::string::npos) {
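The cast-and-index read of `op_params` is replaced by `memcpy`. In ggml, `op_params` is an `int32_t` array, so dereferencing it through a `size_t` pointer is a strict-aliasing violation and a potentially misaligned 8-byte load; `memcpy` performs the same byte-wise read portably. A minimal sketch of the idiom (the struct is a hypothetical stand-in for `ggml_tensor`):

```cpp
#include <cstdint>
#include <cstring>

// Hypothetical stand-in for the relevant part of ggml_tensor: op_params is
// an int32_t array, so it is only guaranteed 4-byte alignment.
struct tensor_like {
    int32_t op_params[16];
};

// Reads the leading size_t from op_params without casting the pointer.
// ((size_t *) t->op_params)[0] would alias int32_t storage as size_t and can
// misbehave on alignment-strict targets; memcpy expresses the same bytes safely.
size_t read_view_offset(const tensor_like * t) {
    size_t offset;
    std::memcpy(&offset, t->op_params, sizeof(offset));
    return offset;
}
```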
@@ -346,6 +348,13 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
            input_shape = ov::PartialShape{-1, 1, -1, -1};
        }

+    } else if (name.find("cache_") == 0) {
+        input_shape = ov::PartialShape{get_shape(input)};
+        if (!m_is_static) {
+            // do not fix ctx size to make llama-bench work
+            input_shape[2] = -1;
+        }
+
    } else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) {
        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1};
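The new `cache_` branch stops pinning the context dimension of the kv-cache inputs at compile time, presumably because llama-bench exercises several context configurations against one compiled model, so a fixed ctx size would no longer match between runs. A sketch of the shape construction, assuming the 4-D cache layout of the patch with ctx on axis 2:

```cpp
#include <openvino/core/partial_shape.hpp>
#include <vector>

// Builds the graph input shape for a kv-cache tensor. In dynamic mode the
// context dimension (axis 2) is left unknown so one compiled model can
// serve the varying context sizes llama-bench drives.
ov::PartialShape cache_input_shape(const std::vector<int64_t> & dims, bool is_static) {
    ov::PartialShape shape(std::vector<ov::Dimension>(dims.begin(), dims.end()));
    if (!is_static) {
        shape[2] = ov::Dimension::dynamic();  // same effect as assigning -1
    }
    return shape;
}
```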
@@ -52,7 +52,7 @@ OutputVector translate_permute(const NodeContext & context) {
    auto output_shape = context.get_output_shape(0).to_shape();
    int64_t head_size = output_shape[3];
    int64_t n_heads = output_shape[1];
-    int64_t ctx_per_seq = cache_shape[2].get_length();
+    int64_t ctx_per_seq = cache_shape[2].is_static() ? cache_shape[2].get_length() : -1;
    int64_t n_seq = cache_shape[1].get_length();

    Output<Node> attention_size;
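This follows from the decoder change above: `cache_shape[2]` may now be dynamic, and `ov::Dimension::get_length()` requires a static dimension (it throws otherwise). The guard maps "unknown until runtime" to -1, the sentinel the surrounding code already uses. The pattern in isolation:

```cpp
#include <openvino/core/dimension.hpp>
#include <cstdint>

// get_length() throws for a dynamic ov::Dimension, so probe with is_static()
// first and fall back to -1 as the "resolved at runtime" sentinel.
int64_t length_or_dynamic(const ov::Dimension & dim) {
    return dim.is_static() ? dim.get_length() : -1;
}
```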
@@ -18,6 +18,7 @@
 #include <openvino/op/slice.hpp>
 #include <openvino/op/squeeze.hpp>
 #include <openvino/op/transpose.hpp>
+#include <vector>

 namespace ov {
 namespace frontend {
@@ -48,8 +49,13 @@ OutputVector translate_set_rows(const NodeContext & context) {

    if (auto dst_reshape = std::dynamic_pointer_cast<ov::op::v1::Reshape>(dst.get_node_shared_ptr())) {
        // Fix the case of multiple sequences, reshape back to original shape [1, n_seq, ctx_per_seq, emb]
-        res = std::make_shared<ov::op::v1::Reshape>(
-            res, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_reshape->get_input_shape(0)), false);
+        // ctx_per_seq is not fixed due to llama-bench compatibility
+        auto dst_shape_partial = dst_reshape->get_input_partial_shape(0);
+        std::vector<int64_t> dst_shape = {dst_shape_partial[0].get_length(), dst_shape_partial[1].get_length(),
+                                          dst_shape_partial[2].is_static() ? dst_shape_partial[2].get_length() : -1,
+                                          dst_shape_partial[3].get_length()};
+        res = std::make_shared<ov::op::v1::Reshape>(res, ov::op::v0::Constant::create(ov::element::i64, {4}, dst_shape),
+                                                    false);
    }
    return rename_outputs_with_suffix({res}, context.get_name());
}
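`get_input_shape(0)` requires a fully static shape, which no longer holds once ctx_per_seq is dynamic, hence the switch to `get_input_partial_shape(0)` plus an explicit target vector (and the new `<vector>` include above). `v1::Reshape` accepts a single -1 in its target pattern and infers that dimension from the element count at runtime. A reduced sketch, with `n_seq` and `emb` as hypothetical known dimensions:

```cpp
#include <openvino/op/constant.hpp>
#include <openvino/op/reshape.hpp>
#include <cstdint>
#include <memory>
#include <vector>

// Reshapes a 4-D value back to [1, n_seq, ctx_per_seq, emb] when ctx_per_seq
// is only known at runtime: Reshape infers the single -1 entry, and
// special_zero=false keeps 0 from being treated as "copy input dim".
std::shared_ptr<ov::Node> reshape_keep_dynamic_ctx(const ov::Output<ov::Node> & value,
                                                   int64_t n_seq, int64_t emb) {
    std::vector<int64_t> pattern{1, n_seq, -1, emb};  // -1: inferred ctx_per_seq
    auto target = ov::op::v0::Constant::create(ov::element::i64, {4}, pattern);
    return std::make_shared<ov::op::v1::Reshape>(value, target, false);
}
```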
@@ -129,26 +129,27 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
        ov_input_names_cache[cgraph] = ov_input_names;
        ov_output_names_cache[cgraph] = ov_output_names;

-        // Set output tensors (for NPU) and kvcache i/o tensors once and for all
-        for (size_t i = 0; i < ov_output_names.size(); i++) {
-            auto output_name = ov_output_names[i];
-            if (is_static || output_name.find("cache") == 0) {
-                auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
-                infer_request->set_output_tensor(i, output_tensor);
-            }
-        }
-        for (size_t i = 0; i < ov_input_names.size(); i++) {
-            auto param_name = ov_input_names[i];
-            if (param_name.find("cache") == 0) {
-                ov::Tensor input_tensor;
-                if (is_static) {
-                    input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
-                } else {
-                    input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
-                }
-                infer_request->set_input_tensor(i, input_tensor);
-            }
-        }
+        // // Set output tensors (for NPU) and kvcache i/o tensors once and for all
+        // // Note: does not seem to improve perf on CPU/GPU, but it breaks llama-bench, so disabled it
+        // for (size_t i = 0; i < ov_output_names.size(); i++) {
+        //     auto output_name = ov_output_names[i];
+        //     if (is_static || output_name.find("cache") == 0) {
+        //         auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]);
+        //         infer_request->set_output_tensor(i, output_tensor);
+        //     }
+        // }
+        // for (size_t i = 0; i < ov_input_names.size(); i++) {
+        //     auto param_name = ov_input_names[i];
+        //     if (param_name.find("cache") == 0) {
+        //         ov::Tensor input_tensor;
+        //         if (is_static) {
+        //             input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0);
+        //         } else {
+        //             input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
+        //         }
+        //         infer_request->set_input_tensor(i, input_tensor);
+        //     }
+        // }
    }
}
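The one-time binding of kv-cache and NPU output tensors is kept only as a commented-out reference. Per the in-code note it bought nothing on CPU/GPU and broke llama-bench, plausibly because llama-bench creates a fresh context per test, so host buffers captured at first compile can go stale. With the block disabled, tensors are rebound on every call; a minimal sketch of that pattern, with `host_ptr` standing in for whatever buffer ggml currently owns:

```cpp
#include <openvino/runtime/infer_request.hpp>

// Per-call binding: wrap the current host buffer in a zero-copy ov::Tensor
// view and hand it to the request. Re-wrapping each call stays correct even
// if the buffer moved since the previous inference.
void bind_input(ov::InferRequest & request, size_t idx, ov::element::Type type,
                const ov::Shape & shape, void * host_ptr) {
    ov::Tensor view(type, shape, host_ptr);
    request.set_input_tensor(idx, view);
}
```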
@@ -158,9 +159,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
    if (!is_static) {
        for (size_t i = 0; i < ov_input_names.size(); i++) {
            auto param_name = ov_input_names[i];
-            if (param_name.find("cache") == 0) {
-                continue;
-            }
            auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name);
            infer_request->set_input_tensor(i, input_tensor);
@@ -188,9 +186,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph *
    for (int j = 0; j < input_len; j++) {
        for (size_t i = 0; i < ov_input_names.size(); i++) {
            auto param_name = ov_input_names[i];
-            if (param_name.find("cache") == 0) {
-                continue;
-            }
            auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len);
            infer_request->set_input_tensor(i, input_tensor);
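Both per-inference loops (dynamic and static paths) drop the early `continue` for names starting with "cache": with the up-front binding disabled above, skipping them here would leave the cache tensors unbound entirely. The prefix test itself is the usual `find` idiom:

```cpp
#include <string>

// True when `name` starts with `prefix`: std::string::find returning 0 means
// the match is anchored at the first character (C++20 name.starts_with(prefix)
// would say the same thing more directly).
bool has_prefix(const std::string & name, const std::string & prefix) {
    return name.find(prefix) == 0;
}
```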