diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 7bb092a65c..29be4dbae8 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -108,22 +108,25 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
         ov::PartialShape input_shape;
         if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
             if (m_is_static) {
-                input_shape = ov::PartialShape(get_shape(src));
-                // if (m_is_first_token) {
-                //     input_shape = ov::PartialShape{1, 1, m_max_token_len};
-                // } else {
-                //     input_shape = ov::PartialShape{1, 1, 1};
-                // }
+                if (m_is_first_token) {
+                    input_shape = ov::PartialShape{1, 1, m_max_token_len};
+                } else {
+                    input_shape = ov::PartialShape{1, 1, 1};
+                }
             } else {
                 input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
             }
-        } else if (std::string(src->name).find("KQ_mask") == 0) {
+        } else if (std::string(src->name) == "KQ_mask") {
             if (m_is_static) {
-                input_shape = ov::PartialShape(get_shape(src));
+                if (m_is_first_token) {
+                    input_shape = ov::PartialShape{1, m_max_token_len, m_max_token_len};
+                } else {
+                    input_shape = ov::PartialShape{1, 1, m_max_token_len};
+                }
             } else {
-                auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
+                auto max_mask_size = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
                 input_shape =
-                    ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
+                    ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
             }
         } else {
             input_shape = ov::Shape{get_shape(src)};
@@ -208,6 +211,7 @@ void GgmlOvDecoder::set_max_token_len() {
 
 void GgmlOvDecoder::add_extra_inputs() {
     int64_t past_token_len;
+    // attention_size not used for NPU
    int64_t attention_size;
 
    for (const auto& node : m_nodes) {
@@ -231,8 +235,7 @@
     for (const auto& node : m_nodes) {
         if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
             int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
-            attention_size = (total_token_len + 31) / 32 * 32;
-
+            attention_size = GGML_PAD(total_token_len, 32);
             std::string name = "attention_size";
             auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
             param_node->set_friendly_name(name);
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index b372cc8040..2c89d06267 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -92,9 +92,12 @@ public:
     virtual bool is_static() const override {
         return m_is_static;
     }
-    virtual bool is_first_token() const {
+    virtual bool is_first_token() const override {
         return m_is_first_token;
     }
+    virtual int get_max_token_len() const override {
+        return m_max_token_len;
+    }
 
 private:
     void set_input_output(ggml_tensor* node);
@@ -106,7 +109,7 @@ private:
     static std::shared_ptr<ov::Node> create_weight_node(ggml_tensor* tensor);
 
     void set_max_token_len();
-    int64_t m_max_token_len;
+    int m_max_token_len;
 
     void add_weight_const_parallel(std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
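
Side note on the `attention_size` change in `add_extra_inputs()`: `GGML_PAD(total_token_len, 32)` is the round-up-to-a-multiple idiom that the removed `(total_token_len + 31) / 32 * 32` spelled out by hand. A minimal standalone sketch of the assumed rounding behaviour (the local `pad_up` helper is illustrative, not code from this patch):

```cpp
// Standalone sketch: the rounding behaviour assumed of GGML_PAD, i.e. a token
// count rounded up to the next multiple of the block size.
#include <cassert>
#include <cstdint>

static int64_t pad_up(int64_t x, int64_t n) { return (x + n - 1) / n * n; }

int main() {
    assert(pad_up(1, 32)  == 32);   // a single decoded token still reserves one block
    assert(pad_up(32, 32) == 32);   // exact multiples are unchanged
    assert(pad_up(33, 32) == 64);   // anything past a boundary rounds up
    // same value as the expression it replaces in add_extra_inputs()
    assert(pad_up(33, 32) == (33 + 31) / 32 * 32);
    return 0;
}
```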
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index a0b9509336..6212568399 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include
 #include
 #include
 #include
@@ -57,6 +58,8 @@ public:
     virtual const std::vector<std::string>& get_model_output_names() const = 0;
 
     virtual bool is_static() const = 0;
+    virtual bool is_first_token() const = 0;
+    virtual int get_max_token_len() const = 0;
 };
 
 }  // namespace ggml
diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp
index f5940585a6..f4e7c4e31f 100644
--- a/ggml/src/ggml-openvino/openvino/node_context.hpp
+++ b/ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -1,5 +1,6 @@
 #pragma once
 
+#include
 #include
 
 #include "decoder.hpp"
@@ -87,6 +88,12 @@ public:
     bool is_static() const {
         return m_decoder->is_static();
     }
+    bool is_first_token() const {
+        return m_decoder->is_first_token();
+    }
+    int get_max_token_len() const {
+        return m_decoder->get_max_token_len();
+    }
 
 private:
     std::shared_ptr<GgmlDecoder> m_decoder;
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index fe755a5f64..75dd0e7d83 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -8,7 +8,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -34,18 +34,26 @@ OutputVector translate_cpy(const NodeContext& context) {
 
     auto src0 = context.get_input(0);
     auto src1 = context.get_input(1);
-    auto past_token_len = context.get_input("past_token_len");
+    auto past_token_len_scalar = context.get_input("past_token_len");
+
+    src0 = std::make_shared<ov::op::v0::Convert>(src0, context.get_input_type(1));
 
     ov::Output<ov::Node> res;
 
+    if (context.is_static() && context.is_first_token()) {
+        res = src0;
+        return rename_outputs_with_suffix({res}, context.get_name());
+    }
+
     auto src0_shape = context.get_input_shape(0).to_shape();
     auto output_shape = context.get_output_shape(0).to_shape();
 
     std::vector<size_t> input0_strides = context.get_input_stride(0);
     std::vector<size_t> output_strides = context.get_output_stride(0);
 
-    auto one = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
+    auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
+    auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+    auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
 
-    src0 = std::make_shared<ov::op::v1::ConvertLike>(src0, src1);
     if (op_case == 1) {
         // Write K to cache_k
         int64_t head_size = src0_shape[2];
@@ -56,32 +64,29 @@
         auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(src1, reshaped_src1_shape, false);
 
         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {0});
-        token_len = std::make_shared<ov::op::v1::Reshape>(token_len,
-                                                          ov::op::v0::Constant::create(ov::element::i64, {0}, {}),
-                                                          false);
+        auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
 
+        std::shared_ptr<ov::Node> indices;
         if (context.is_static()) {
-            int32_t* op_params = context.get_input_op_params(1);
-            int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size;
-            past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
+            indices = past_token_len_scalar.get_node_shared_ptr();
+            indices = std::make_shared<ov::op::v0::Unsqueeze>(
+                indices,
+                ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{0, 1}));
+        } else {
+            auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
+            indices = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
+                                                          total_token_len_scalar,
+                                                          one_scalar,
+                                                          ov::element::i64);
+            indices = std::make_shared<ov::op::v0::Unsqueeze>(indices, one);
         }
-        auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
-        std::shared_ptr<ov::Node> indices =
-            std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len, one, ov::element::i64);
-        indices = std::make_shared<ov::op::v0::Unsqueeze>(
-            indices,
-            ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{1}));
-
         res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
     } else {
         // Write V to cache_v
-        auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
         auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
-        auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
-        auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1});
 
         int64_t total_head_size = src0_shape[1];
         auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
@@ -89,36 +94,6 @@
         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2});
         auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
 
-        if (context.is_static()) {
-            int32_t* op_params = context.get_input_op_params(1);
-            int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2];
-            past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
-        }
-        auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len, token_len_scalar);
-
-        // auto reshaped_src1 = std::make_shared(
-        //     src1,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}),
-        //     false);
-
-        // auto src1_left = std::make_shared(
-        //     reshaped_src1,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
-        //     std::make_shared(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
-
-        // auto src1_right = std::make_shared(
-        //     reshaped_src1,
-        //     std::make_shared(ov::OutputVector{zero, zero, total_token_len}, 0),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, INT_MAX}),
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
-
-        // auto reshaped_src0 = std::make_shared(
-        //     src0,
-        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{1, total_head_size, -1}),
-        //     false);
-
-        // auto res = std::make_shared(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
 
         // 1D tensor of shape [total_head_size], values starting from 0
         auto range_row =
@@ -131,8 +106,19 @@
             std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
 
         // 1D tensor of shape [token_len], values starting from past_token_len
-        auto range_col =
-            std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len_scalar, one_scalar, element::i64);
+        std::shared_ptr<ov::Node> range_col;
+        if (context.is_static()) {
+            range_col = past_token_len_scalar.get_node_shared_ptr();
+            range_col = std::make_shared<ov::op::v0::Unsqueeze>(
+                range_col,
+                ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{0}));
+        } else {
+            auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
+            range_col = std::make_shared<ov::op::v4::Range>(past_token_len_scalar,
+                                                            total_token_len_scalar,
+                                                            one_scalar,
+                                                            ov::element::i64);
+        }
 
         auto range_col_reshaped =
             std::make_shared<ov::op::v0::Unsqueeze>(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
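
For the dynamic K-cache path, the rewritten translator builds a `Range` over `[past_token_len, past_token_len + token_len)`, unsqueezes it into an index column, and scatters the new rows into the cache; in the static case the single index `past_token_len` is used directly. A plain-C++ sketch of the index arithmetic that graph expresses, with illustrative shapes (not code from this patch):

```cpp
// Standalone sketch of the row update the cpy translator expresses with
// Range + Unsqueeze + a scatter-style update: new K rows are written into
// the cache starting at row `past_token_len`. Shapes are illustrative.
#include <cstdio>
#include <vector>

int main() {
    const int max_tokens = 8, row_width = 4;   // cache capacity and per-token row size
    std::vector<float> cache(max_tokens * row_width, 0.0f);

    int past_token_len = 3;                    // rows already filled by previous steps
    int token_len = 2;                         // rows produced by the current step
    std::vector<float> new_rows(token_len * row_width, 1.0f);

    // indices = Range(past_token_len, past_token_len + token_len, 1)
    for (int i = 0; i < token_len; ++i) {
        int row = past_token_len + i;          // destination row in the cache
        for (int j = 0; j < row_width; ++j) {
            cache[row * row_width + j] = new_rows[i * row_width + j];
        }
    }

    printf("row %d, col 0 after update: %.1f\n", past_token_len, cache[past_token_len * row_width]);
    return 0;
}
```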
"ggml-impl.h" #include "ggml.h" @@ -52,7 +55,6 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() { enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) { static ov::Core core; - static bool is_first_token = true; static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : ""; if (device.empty()) { @@ -66,12 +68,16 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c bool is_static = device == "NPU" ? true : false; ov::AnyMap config; - if (is_static) { + if (device == "NPU") { config = { {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"}, {"NPU_USE_NPUW", "YES"}, {"NPUW_DEVICES", "NPU"}, {"NPUW_FOLD", "YES"}, + {"NPUW_DQ", "YES"}, + {"NPUW_FUNCALL_ASYNC", "YES"}, + {"NPUW_HOST_GATHER", "YES"}, + {"NPUW_WEIGHTS_BANK", "shared"}, // {"NPU_COMPILER_TYPE", "MLIR"}, }; } @@ -83,69 +89,128 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c core.set_property(ov::cache_dir(cache_dir)); } - // For CPU and GPU, there is only one compiled model, so only use the first element of the pair - // For NPU, there are prefill model and kvcache model (This is the ideal approach, but not implemented yet, - // currently recompile for every token) - using CachedItem = std::pair, std::pair>; - static std::unordered_map compiled_cache; + // CPU and GPU will only use cache_prefill + using CachedItem = std::pair, ov::CompiledModel>; + static std::unordered_map compiled_cache_prefill; + static std::unordered_map compiled_cache_kvcache; + std::shared_ptr ggml_decoder; std::shared_ptr model; - ov::CompiledModel compiled_model_prefill; - ov::CompiledModel compiled_model_kvcache; + ov::CompiledModel compiled_model; + int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; - auto ggml_decoder = get_ggml_decoder(cgraph, is_static, is_first_token); - decoder_end_time = ggml_time_us(); + auto it = compiled_cache_prefill.find(cgraph); + bool is_first_token = it == compiled_cache_prefill.end(); + if (!is_first_token) { + ggml_decoder = get_ggml_decoder(cgraph, is_static, false); + decoder_end_time = ggml_time_us(); - auto it = compiled_cache.find(cgraph); - if (it != compiled_cache.end() && !is_static) { - model = it->second.first; - conversion_end_time = ggml_time_us(); - - compiled_model_prefill = it->second.second.first; - compiled_model_kvcache = it->second.second.second; - compile_end_time = ggml_time_us(); - } else { - ov::frontend::InputModel::Ptr input_model = std::make_shared(ggml_decoder); - model = ov::frontend::ggml::FrontEnd::convert(input_model); - - conversion_end_time = ggml_time_us(); - - if (getenv("GGML_OPENVINO_DUMP_IR")) { - char timestamped_filename[64]; - auto timestamp = (long long)ggml_time_us(); - snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); - ov::serialize(model, timestamped_filename); + if (is_static) { + model = compiled_cache_kvcache[cgraph].first; + compiled_model = compiled_cache_kvcache[cgraph].second; + } else { + model = it->second.first; + compiled_model = it->second.second; } - - compiled_model_prefill = core.compile_model(model, device, config); - compile_end_time = ggml_time_us(); - - compiled_cache[cgraph] = std::make_pair(model, std::make_pair(compiled_model_prefill, compiled_model_kvcache)); - } - - ov::InferRequest infer_request; - if (!is_static) { - infer_request = compiled_model_prefill.create_infer_request(); + conversion_end_time = 
 
+    auto infer_request = compiled_model.create_infer_request();
     auto ov_params = model->get_parameters();
     for (size_t i = 0; i < ov_params.size(); i++) {
         auto param_name = ov_params[i]->get_friendly_name();
         ov::Tensor input_tensor;
+
         if (ggml_decoder->get_model_extra_inputs().find(param_name) != ggml_decoder->get_model_extra_inputs().end()) {
             input_tensor = *ggml_decoder->get_model_extra_input_values().at(param_name);
-        } else {
+
+        } else if (!is_static) {
             input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
+
+        } else {
+            if (param_name == "inp_tokens" || param_name == "inp_pos") {
+                if (is_first_token) {
+                    size_t max_token_len = ggml_decoder->get_max_token_len();
+                    const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
+                    std::vector<int32_t> padded_data = pad_input<int32_t>(input_tensor_ggml, 1, max_token_len, 0);
+                    input_tensor = ov::Tensor(ov::element::i32, ov::Shape{1, 1, max_token_len});
+                    auto* data_ptr = input_tensor.data<int32_t>();
+                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+                } else {
+                    input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
+                }
+
+            } else if (param_name == "KQ_mask") {
+                size_t max_token_len = ggml_decoder->get_max_token_len();
+                const auto* input_tensor_ggml = ggml_decoder->get_input_ggml_tensor(param_name);
+                if (is_first_token) {
+                    std::vector<float> padded_data =
+                        pad_input<float>(input_tensor_ggml, max_token_len, max_token_len, -INFINITY);
+                    set_zero_diagonal(padded_data, max_token_len);
+                    input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, max_token_len, max_token_len});
+                    auto* data_ptr = input_tensor.data<float>();
+                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+                } else {
+                    std::vector<float> padded_data = pad_input<float>(input_tensor_ggml, 1, max_token_len, -INFINITY);
+                    input_tensor = ov::Tensor(ov::element::f32, ov::Shape{1, 1, max_token_len});
+                    auto* data_ptr = input_tensor.data<float>();
+                    std::copy(padded_data.begin(), padded_data.end(), data_ptr);
+                }
+
+            } else {
+                input_tensor = get_ggml_graph_input_tensor(ggml_decoder, param_name);
+            }
         }
 
         infer_request.set_input_tensor(i, input_tensor);
@@ -234,3 +299,9 @@ void print_output_tensor_info(const std::string& name,
             break;
     }
 }
+
+void set_zero_diagonal(std::vector<float>& matrix, size_t dim) {
+    for (size_t i = 0; i < dim; ++i) {
+        matrix[i * dim + i] = 0.0f;
+    }
+}
diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h
index 88c182d9ed..000c2b87c1 100644
--- a/ggml/src/ggml-openvino/utils.h
+++ b/ggml/src/ggml-openvino/utils.h
@@ -1,12 +1,37 @@
+#include
+
 #include "ggml-backend-impl.h"
 #include "ggml-decoder.h"
 
 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph);
 
+std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
+
+ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, std::string& name);
+
+std::map get_ggml_graph_output_dst(std::shared_ptr<GgmlOvDecoder> ggml_decoder);
+
 size_t checksum(const void* data, size_t size);
 
 void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor);
 
 void print_output_tensor_info(const std::string& name,
                               const ov::Tensor& tensor,
-                              std::map& output_dst);
\ No newline at end of file
+                              std::map& output_dst);
+
+template <typename T>
+std::vector<T> pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
+    std::vector<T> padded_data(padded_rows * padded_cols, pad_value);
+    size_t rows = tensor->ne[1];
+    size_t cols = tensor->ne[0];
+    T* data = static_cast<T*>(tensor->data);
+
+    for (size_t i = 0; i < std::min(rows, padded_rows); ++i) {
+        for (size_t j = 0; j < std::min(cols, padded_cols); ++j) {
+            padded_data[i * padded_cols + j] = data[i * cols + j];
+        }
+    }
+    return padded_data;
+}
+
+void set_zero_diagonal(std::vector<float>& matrix, size_t dim);
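
For reference on the new helpers: `pad_input` copies the ggml tensor's rows into a larger buffer pre-filled with a pad value, and `set_zero_diagonal` re-opens the diagonal of a `KQ_mask` padded with `-INFINITY` so padded query rows still attend to themselves. A standalone sketch of the same logic using a stand-in struct instead of `ggml_tensor` (illustrative only):

```cpp
// Standalone sketch of what pad_input + set_zero_diagonal do to the KQ_mask on
// the first (prefill) call: the real helpers take a ggml_tensor, here a minimal
// stand-in struct is used so the snippet compiles on its own.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

struct FakeTensor {            // stand-in for the ggml_tensor fields pad_input reads
    int64_t ne[2];             // ne[0] = cols, ne[1] = rows
    void* data;
};

template <typename T>
std::vector<T> pad_input(const FakeTensor* tensor, size_t padded_rows, size_t padded_cols, T pad_value) {
    std::vector<T> padded(padded_rows * padded_cols, pad_value);
    size_t rows = tensor->ne[1];
    size_t cols = tensor->ne[0];
    const T* data = static_cast<const T*>(tensor->data);
    for (size_t i = 0; i < std::min(rows, padded_rows); ++i)
        for (size_t j = 0; j < std::min(cols, padded_cols); ++j)
            padded[i * padded_cols + j] = data[i * cols + j];
    return padded;
}

void set_zero_diagonal(std::vector<float>& m, size_t dim) {
    for (size_t i = 0; i < dim; ++i) m[i * dim + i] = 0.0f;
}

int main() {
    float mask[2] = {0.0f, -INFINITY};                     // a 1x2 causal mask row
    FakeTensor t{{2, 1}, mask};
    auto padded = pad_input<float>(&t, 4, 4, -INFINITY);   // pad to max_token_len x max_token_len
    set_zero_diagonal(padded, 4);                          // padded rows may still attend to themselves
    for (size_t i = 0; i < 4; ++i) {
        for (size_t j = 0; j < 4; ++j) printf("% .0f ", padded[i * 4 + j]);
        printf("\n");
    }
    return 0;
}
```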