From 38254cf592ba3f7fbf69510e48ed63708559ee71 Mon Sep 17 00:00:00 2001 From: "Yu, Zijun" Date: Mon, 1 Dec 2025 13:47:43 +0800 Subject: [PATCH] NPU prefill chunking --- ggml/src/ggml-openvino/ggml-decoder.cpp | 17 +- ggml/src/ggml-openvino/ggml-decoder.h | 10 +- ggml/src/ggml-openvino/utils.cpp | 303 +++++++++++++++++++----- ggml/src/ggml-openvino/utils.h | 39 ++- 4 files changed, 286 insertions(+), 83 deletions(-) diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 3eda21a562..c7035c1580 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -37,8 +37,12 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, - bool is_static) : + bool is_static, + bool is_prefill, + int prefill_chunk_size) : m_is_static(is_static), + m_is_prefill(is_prefill), + m_prefill_chunk_size(prefill_chunk_size), m_cgraph(cgraph), m_model_weights(model_weights) { if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { @@ -341,12 +345,16 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co auto name = std::string(input->name); ov::PartialShape input_shape; - if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") { + if (name == "inp_tokens" || name == "inp_pos") { + int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1; + input_shape = ov::PartialShape{1, 1, 1, len}; + + } else if (name == "inp_out_ids") { input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1}; } else if (name.find("KQ_mask") == 0) { if (m_is_static) { - input_shape = ov::PartialShape{1, 1, 1, m_ctx}; + input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_ctx}; } else { input_shape = ov::PartialShape{-1, 1, -1, -1}; } @@ -359,7 +367,8 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } } else if (op && op->op == GGML_OP_SET_ROWS && op->src[1] == input) { - input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1}; + int len = m_is_static ? (m_is_prefill ? 
m_prefill_chunk_size : 1) : -1; + input_shape = ov::PartialShape{1, 1, 1, len}; } else if (input->op == GGML_OP_VIEW) { // This case is added to make test-backend-ops work diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 6e2bf0486d..11f35f038e 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -26,7 +26,9 @@ public: // Graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, - bool is_static); + bool is_static, + bool is_prefill = false, + int prefill_chunk_size = 256); // Naive graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights); @@ -159,6 +161,10 @@ public: void clear_model_weights() { m_model_weights.clear(); } + bool m_is_static = false; + bool m_is_prefill = false; + int m_prefill_chunk_size = 0; + private: void set_input_output(ggml_tensor * node, bool naive = false); void add_extra_inputs(); @@ -171,8 +177,6 @@ private: void set_llm_params(); void validate_cgraph() const; - bool m_is_static = false; - ggml_cgraph * m_cgraph = nullptr; std::vector m_nodes; std::map m_inputs; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 6e1d7393c7..ae8916cc58 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -48,8 +48,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } return device; }; - static std::string device = get_device(); - bool is_static = device == "NPU" ? true : false; + auto get_prefill_chunk_size = [] { + const char * chunk_size_str = getenv("GGML_OPENVINO_PREFILL_CHUNK_SIZE"); + if (chunk_size_str && atoi(chunk_size_str) > 0) { + return atoi(chunk_size_str); + } + return 256; + }; + + static const auto device = get_device(); + static const auto is_static = device == "NPU" ? 
true : false; + static const auto prefill_chunk_size = get_prefill_chunk_size(); ov::AnyMap config; @@ -71,12 +80,16 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * static std::mutex cache_mutex; static std::unordered_map> infer_request_cache; + static std::unordered_map> infer_request_cache_prefill; static std::unordered_map> ov_input_names_cache; static std::unordered_map> ov_output_names_cache; std::shared_ptr ggml_decoder; std::shared_ptr infer_request; + const auto * inp_pos = get_inp_pos_tensor(cgraph); + const auto is_prefill = get_is_prefill(inp_pos); + int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; @@ -88,36 +101,78 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * auto it = infer_request_cache.find(cgraph); if (it != infer_request_cache.end()) { std::map> model_weights; - ggml_decoder = std::make_shared(cgraph, model_weights, is_static); + ggml_decoder = + std::make_shared(cgraph, model_weights, is_static, is_prefill, prefill_chunk_size); decoder_end_time = ggml_time_us(); infer_request = infer_request_cache[cgraph]; + if (is_static && is_prefill) { + infer_request = infer_request_cache_prefill[cgraph]; + } conversion_end_time = ggml_time_us(); compile_end_time = conversion_end_time; } else { std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, get_types_to_requant(device)); - ggml_decoder = std::make_shared(cgraph, model_weights, is_static); - decoder_end_time = ggml_time_us(); + if (!is_static) { + ggml_decoder = std::make_shared(cgraph, model_weights, is_static); + decoder_end_time = ggml_time_us(); - auto input_model = std::make_shared(ggml_decoder); - model = ov::frontend::ggml::FrontEnd::convert(input_model); - ggml_decoder->clear_model_weights(); - conversion_end_time = ggml_time_us(); + auto input_model = std::make_shared(ggml_decoder); + model = ov::frontend::ggml::FrontEnd::convert(input_model); + ggml_decoder->clear_model_weights(); + conversion_end_time = ggml_time_us(); - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long) ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); + ov::serialize(model, timestamped_filename); + } + + auto compiled_model = core.compile_model(model, device, get_ov_compile_config(device)); + compile_end_time = ggml_time_us(); + infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); + infer_request = infer_request_cache[cgraph]; + } else { + auto ggml_decoder_prefill = + std::make_shared(cgraph, model_weights, is_static, true, prefill_chunk_size); + auto ggml_decoder_decode = + std::make_shared(cgraph, model_weights, is_static, false, prefill_chunk_size); + decoder_end_time = ggml_time_us(); + + auto input_model_prefill = std::make_shared(ggml_decoder_prefill); + auto input_model_decode = std::make_shared(ggml_decoder_decode); + + auto model_prefill = ov::frontend::ggml::FrontEnd::convert(input_model_prefill); + ggml_decoder_prefill->clear_model_weights(); + auto model_decode = ov::frontend::ggml::FrontEnd::convert(input_model_decode); + ggml_decoder_decode->clear_model_weights(); + conversion_end_time = 
ggml_time_us(); + + if (getenv("GGML_OPENVINO_DUMP_IR")) { + char timestamped_filename[64]; + auto timestamp = (long long) ggml_time_us(); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); + ov::serialize(model_prefill, timestamped_filename); + snprintf(timestamped_filename, sizeof(timestamped_filename), "model_decode_%lld.xml", timestamp); + ov::serialize(model_decode, timestamped_filename); + } + + auto compiled_model_prefill = core.compile_model(model_prefill, device, get_ov_compile_config(device)); + auto compiled_model_decode = core.compile_model(model_decode, device, get_ov_compile_config(device)); + infer_request_cache_prefill[cgraph] = + std::make_shared(compiled_model_prefill.create_infer_request()); + infer_request_cache[cgraph] = + std::make_shared(compiled_model_decode.create_infer_request()); + compile_end_time = ggml_time_us(); + + model = is_prefill ? model_prefill : model_decode; + ggml_decoder = is_prefill ? ggml_decoder_prefill : ggml_decoder_decode; + infer_request = is_prefill ? infer_request_cache_prefill[cgraph] : infer_request_cache[cgraph]; } - auto compiled_model = core.compile_model(model, device, get_ov_compile_config(device)); - compile_end_time = ggml_time_us(); - infer_request_cache[cgraph] = std::make_shared(compiled_model.create_infer_request()); - infer_request = infer_request_cache[cgraph]; - std::vector ov_input_names; std::vector ov_output_names; for (const auto & ov_param : model->get_parameters()) { @@ -131,20 +186,22 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * // Set output tensors (for NPU) and kvcache i/o tensors once and for all // Note: does not seem to improve perf on CPU/GPU, but breaks llama-bench, so disabled it for CPU/GPU - if (is_static) { - for (size_t i = 0; i < ov_output_names.size(); i++) { - auto output_name = ov_output_names[i]; - auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); - infer_request->set_output_tensor(i, output_tensor); - } - for (size_t i = 0; i < ov_input_names.size(); i++) { - auto param_name = ov_input_names[i]; - if (param_name.find("cache") == 0) { - auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, 0, 0); - infer_request->set_input_tensor(i, input_tensor); - } - } - } + // if (is_static) { + // for (size_t i = 0; i < ov_input_names.size(); i++) { + // auto param_name = ov_input_names[i]; + // if (param_name.find("cache") == 0) { + // auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name); + // infer_request->set_input_tensor(i, input_tensor); + // } + // } + // for (size_t i = 0; i < ov_output_names.size(); i++) { + // auto output_name = ov_output_names[i]; + // if (output_name.find("cache") == 0) { + // auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + // infer_request->set_output_tensor(i, output_tensor); + // } + // } + // } } } @@ -177,11 +234,39 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } } } else { - auto input_len = ggml_decoder->get_input_len(); - for (int j = 0; j < input_len; j++) { + if (is_prefill) { + auto inp_len = inp_pos->ne[0]; + for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) { + for (size_t i = 0; i < ov_input_names.size(); i++) { + auto param_name = ov_input_names[i]; + auto input_tensor = get_ov_input_tensor_static_prefill(ggml_decoder, param_name, chunk_index); + infer_request->set_input_tensor(i, input_tensor); + + if 
(getenv("GGML_OPENVINO_DEBUG_INPUT")) { + const auto input_tensor = infer_request->get_input_tensor(i); + print_input_tensor_info(param_name, input_tensor); + } + } + + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + + infer_request->infer(); + + if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + for (size_t i = 0; i < ov_output_names.size(); i++) { + const auto output_tensor = infer_request->get_output_tensor(i); + print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); + } + } + } + infer_end_time = ggml_time_us(); + } else { for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; - auto input_tensor = get_ov_input_tensor_static(ggml_decoder, param_name, j, input_len); + auto input_tensor = get_ov_input_tensor_static_decode(ggml_decoder, param_name); infer_request->set_input_tensor(i, input_tensor); if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { @@ -190,7 +275,13 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } } + for (size_t i = 0; i < ov_output_names.size(); i++) { + auto output_tensor = get_ov_output_tensor(ggml_decoder, ov_output_names[i]); + infer_request->set_output_tensor(i, output_tensor); + } + infer_request->infer(); + infer_end_time = ggml_time_us(); if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { for (size_t i = 0; i < ov_output_names.size(); i++) { @@ -199,7 +290,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, ggml_cgraph * } } } - infer_end_time = ggml_time_us(); } if (getenv("GGML_OPENVINO_PROFILING")) { @@ -324,21 +414,84 @@ ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, cons return input_tensor; } -ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decoder, - const std::string & param_name, - int j, - int input_len) { +ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml_decoder, + const std::string & param_name) { const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); if (param_name == "inp_pos" || param_name == "inp_tokens" || (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) { + assert(ggml_tensor->ne[0] == 1); ov::Shape input_shape = {1, 1, 1, 1}; ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); - // copy the j-th value from ggml_tensor + if (ggml_tensor->type == GGML_TYPE_I32) { + *input_tensor.data() = *((int32_t *) ggml_tensor->data); + } else if (ggml_tensor->type == GGML_TYPE_I64) { + *input_tensor.data() = *((int64_t *) ggml_tensor->data); + } else { + throw std::runtime_error("Unexpected tensor type for " + param_name); + } + return input_tensor; + } + + if (param_name == "inp_out_ids") { + ov::Shape input_shape = {1, 1, 1, 1}; + ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + int32_t inp_out_id = *((int32_t *) ggml_tensor->data); + assert(ggml_tensor->ne[0] == 1); + assert(inp_out_id == 0); + *input_tensor.data() = inp_out_id; + return input_tensor; + } + + if (param_name.find("KQ_mask") == 0) { + size_t context_size = ggml_decoder->get_ctx_size(); + std::vector padded_data = pad_input(ggml_tensor, 1, context_size, -INFINITY); + ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size}); + auto * data_ptr = input_tensor.data(); + std::copy(padded_data.begin(), padded_data.begin() + context_size, 
data_ptr); + return input_tensor; + } + + return get_ov_input_tensor(ggml_decoder, param_name); +} + +ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggml_decoder, + const std::string & param_name, + int chunk_index) { + const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(param_name); + const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); + + const size_t input_len = ggml_decoder->get_input_len(); + const size_t chunk_size = ggml_decoder->m_prefill_chunk_size; + const size_t chunk_valid_size = std::min(chunk_size, input_len - chunk_index * chunk_size); + const size_t chunk_pad_size = chunk_size - chunk_valid_size; + + if (param_name == "inp_pos" || param_name == "inp_tokens" || + (op->op == GGML_OP_SET_ROWS && op->src[1] == ggml_tensor)) { + ov::Shape input_shape = {1, 1, 1, chunk_size}; + ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape); + // copy the chunk_index-th chunk from ggml_tensor size_t element_size = ggml_type_size(ggml_tensor->type); - void * input_data = (char *) ggml_tensor->data + j * element_size; - std::memcpy(input_tensor.data(), input_data, element_size); + void * input_data = (char *) ggml_tensor->data + chunk_index * chunk_size * element_size; + std::memcpy(input_tensor.data(), input_data, chunk_valid_size * element_size); + // pad the rest with last_value + 1, so that kv's of padded positions are inserted + // to the next row after the valids row in the kvcache + if (chunk_pad_size > 0) { + if (ggml_tensor->type == GGML_TYPE_I32) { + int32_t last_value = + *((int32_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1)); + int32_t * output_data = input_tensor.data(); + std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1); + } else if (ggml_tensor->type == GGML_TYPE_I64) { + int64_t last_value = + *((int64_t *) ggml_tensor->data + (chunk_index * chunk_size + chunk_valid_size - 1)); + int64_t * output_data = input_tensor.data(); + std::fill(output_data + chunk_valid_size, output_data + chunk_size, last_value + 1); + } else { + throw std::runtime_error("Unexpected tensor type for " + param_name); + } + } return input_tensor; } @@ -348,25 +501,26 @@ ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decode if (ggml_tensor->ne[0] == 0) { *input_tensor.data() = 0; } else if (ggml_tensor->ne[0] == 1) { - if (j == input_len - 1) { - *input_tensor.data() = *((int32_t *) ggml_tensor->data); - } else { - *input_tensor.data() = 0; - } + int32_t inp_out_id = *((int32_t *) ggml_tensor->data) % chunk_size; + *input_tensor.data() = inp_out_id; } else { - throw std::runtime_error("Static graph inp_out_ids unexpected ne[0] > 1"); + throw std::runtime_error("NPU does not support outputing logits for multiple tokens at once."); } return input_tensor; } if (param_name.find("KQ_mask") == 0) { + size_t cols = ggml_tensor->ne[0]; + size_t rows = ggml_tensor->ne[1]; + float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols; + size_t chunk_valid_rows = std::min(chunk_size, rows - chunk_index * chunk_size); size_t context_size = ggml_decoder->get_ctx_size(); - const auto * input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name); - std::vector padded_data = pad_input(input_tensor_ggml, input_len, context_size, -INFINITY); - ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size}); - // copy the j-th row of padded_data + std::vector padded_data = + pad_input(ggml_data, chunk_valid_rows, cols, chunk_size, 
context_size, -INFINITY); + set_zero_diagonal(padded_data, chunk_size, context_size); + ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, chunk_size, context_size}); auto * data_ptr = input_tensor.data(); - std::copy(padded_data.begin() + j * context_size, padded_data.begin() + (j + 1) * context_size, data_ptr); + std::copy(padded_data.begin(), padded_data.begin() + chunk_size * context_size, data_ptr); return input_tensor; } @@ -401,9 +555,28 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; switch (tensor.get_element_type()) { - case ov::element::f32: - std::cout << *(tensor.data()) << std::endl; + case ov::element::f32: { + if (name.find("KQ_mask") == std::string::npos) { + std::cout << *(tensor.data()) << std::endl; + } else { + size_t rows = tensor.get_shape()[2]; + size_t cols = tensor.get_shape()[3]; + auto * data = tensor.data(); + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < cols; ++j) { + float val = data[i * cols + j]; + if (std::isinf(val) && val < 0) { + std::cout << std::setw(5) << "-inf"; + } else { + std::cout << std::setw(5) << val; + } + } + std::cout << std::endl; + } + } + break; + } case ov::element::f16: std::cout << *(tensor.data()) << std::endl; break; @@ -414,7 +587,10 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor std::cout << std::endl; break; case ov::element::i64: - std::cout << *(tensor.data()) << std::endl; + for (size_t i = 0; i < tensor.get_size(); ++i) { + std::cout << tensor.data()[i] << " "; + } + std::cout << std::endl; break; default: break; @@ -471,9 +647,10 @@ void print_output_tensor_info(const std::string & name, const ov::Tensor & tenso } } -void set_zero_diagonal(std::vector & matrix, size_t dim) { - for (size_t i = 0; i < dim; ++i) { - matrix[i * dim + i] = 0.0f; +void set_zero_diagonal(std::vector & matrix, size_t rows, size_t cols) { + for (size_t i = 0; i < rows; ++i) { + size_t diag_col = std::min(i, cols - 1); + matrix[i * cols + diag_col] = 0.0f; } } @@ -494,8 +671,8 @@ const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) { throw std::runtime_error("get_inp_pos_tensor: inp_pos not found in cgraph"); } -bool get_is_first_token(const ggml_tensor * inp_pos) { - return *(int32_t *) inp_pos->data == 0; +bool get_is_prefill(const ggml_tensor * inp_pos) { + return inp_pos->ne[0] > 1; } #pragma GCC diagnostic pop diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 999fc53f32..31f86d0999 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include +#include #include enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); @@ -14,35 +15,47 @@ void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, void * output_dst); template -std::vector pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) { - std::vector padded_data(padded_rows * padded_cols, pad_value); - size_t rows = tensor->ne[1]; - size_t cols = tensor->ne[0]; - T * data = static_cast(tensor->data); +std::vector pad_input(const T * data, + size_t rows, + size_t cols, + size_t padded_rows, + size_t padded_cols, + T pad_value) { + std::vector padded(padded_rows * padded_cols, pad_value); 
for (size_t i = 0; i < std::min(rows, padded_rows); ++i) { for (size_t j = 0; j < std::min(cols, padded_cols); ++j) { - padded_data[i * padded_cols + j] = data[i * cols + j]; + padded[i * padded_cols + j] = data[i * cols + j]; } } - return padded_data; + + return padded; } -void set_zero_diagonal(std::vector & matrix, size_t dim); +template +std::vector pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_cols, T pad_value) { + return pad_input(reinterpret_cast(tensor->data), + static_cast(tensor->ne[1]), // rows + static_cast(tensor->ne[0]), // cols + padded_rows, padded_cols, pad_value); +} + +void set_zero_diagonal(std::vector & matrix, size_t rows, size_t cols); const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph); -bool get_is_first_token(const ggml_tensor * inp_pos); +bool get_is_prefill(const ggml_tensor * inp_pos); ov::AnyMap get_ov_compile_config(const std::string & device); std::map get_types_to_requant(const std::string & device); ov::Tensor get_ov_input_tensor(std::shared_ptr ggml_decoder, const std::string & param_name); -ov::Tensor get_ov_input_tensor_static(std::shared_ptr ggml_decoder, - const std::string & param_name, - int j, - int input_len); +ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml_decoder, + const std::string & param_name); +ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggml_decoder, + const std::string & param_name, + int chunk_index); ov::Tensor get_ov_output_tensor(std::shared_ptr ggml_decoder, const std::string & result_name);
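
For reference, a minimal standalone sketch of the chunked prefill input preparation this patch introduces (padding a partial last chunk's positions with last_value + 1, and building a chunk_size x ctx KQ_mask padded with -inf and a zeroed diagonal). It assumes a toy chunk size of 4 and context of 8; prepare_chunk_positions and prepare_chunk_mask are illustrative names only, not helpers from this patch, which works on ggml_tensor / ov::Tensor directly in get_ov_input_tensor_static_prefill().

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Pad the chunk_index-th slice of `pos` up to chunk_size entries.
// The tail is filled with last_value + 1, so the KV entries written for the
// padded slots land one row past the last valid row of the KV cache.
static std::vector<int32_t> prepare_chunk_positions(const std::vector<int32_t> & pos,
                                                    size_t chunk_index,
                                                    size_t chunk_size) {
    const size_t begin = chunk_index * chunk_size;
    const size_t valid = std::min(chunk_size, pos.size() - begin);
    std::vector<int32_t> out(chunk_size);
    std::copy(pos.begin() + begin, pos.begin() + begin + valid, out.begin());
    std::fill(out.begin() + valid, out.end(), out[valid - 1] + 1);
    return out;
}

// Build a [chunk_size x ctx] mask for the chunk: valid rows are copied from the
// full mask, padded rows stay -inf except one zeroed "diagonal" entry so the
// softmax over a padded row remains finite (mirrors set_zero_diagonal()).
static std::vector<float> prepare_chunk_mask(const std::vector<float> & mask,  // [n_tokens x n_kv]
                                             size_t n_kv,
                                             size_t chunk_index,
                                             size_t chunk_size,
                                             size_t ctx) {
    const size_t n_tokens = mask.size() / n_kv;
    const size_t begin    = chunk_index * chunk_size;
    const size_t valid    = std::min(chunk_size, n_tokens - begin);
    std::vector<float> out(chunk_size * ctx, -INFINITY);
    for (size_t i = 0; i < valid; ++i) {
        std::copy(mask.begin() + (begin + i) * n_kv,
                  mask.begin() + (begin + i) * n_kv + std::min(n_kv, ctx),
                  out.begin() + i * ctx);
    }
    for (size_t i = 0; i < chunk_size; ++i) {
        out[i * ctx + std::min(i, ctx - 1)] = 0.0f;
    }
    return out;
}

int main() {
    const size_t chunk_size = 4, ctx = 8;
    // A 6-token prompt: the second chunk has 2 valid and 2 padded positions.
    std::vector<int32_t> pos = {0, 1, 2, 3, 4, 5};
    // Full causal mask (6 x 6): row i allows columns 0..i.
    std::vector<float> mask(6 * 6, -INFINITY);
    for (size_t i = 0; i < 6; ++i)
        for (size_t j = 0; j <= i; ++j) mask[i * 6 + j] = 0.0f;

    for (size_t chunk = 0; chunk * chunk_size < pos.size(); ++chunk) {
        auto chunk_pos  = prepare_chunk_positions(pos, chunk, chunk_size);
        auto chunk_mask = prepare_chunk_mask(mask, 6, chunk, chunk_size, ctx);
        printf("chunk %zu positions:", chunk);
        for (int32_t p : chunk_pos) printf(" %d", p);
        printf("\n");
        (void) chunk_mask;  // would be copied into the static-shape KQ_mask tensor
    }
    return 0;
}

The second chunk prints "4 5 6 6": both padded slots get last_value + 1, matching the patch's comment that the KV entries of padded positions are inserted into the row after the last valid row, while the zeroed diagonal keeps each padded mask row from being entirely -inf.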
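
Usage note (the invocation below is an assumption; only the environment variable and its fallback come from the patch): get_prefill_chunk_size() reads GGML_OPENVINO_PREFILL_CHUNK_SIZE and falls back to 256 when it is unset or not a positive integer, so on NPU a run such as `GGML_OPENVINO_PREFILL_CHUNK_SIZE=512 ./llama-cli ...` would compile the prefill model with a 512-token static chunk while the decode model keeps its single-token shape.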