diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index e6474d6def..7bb092a65c 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -25,14 +26,16 @@
 #include "ggml-backend-impl.h"
 #include "ggml-backend.h"
 
-GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph)
+GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token)
     : m_cgraph(cgraph),
       m_node(node),
-      m_op_name(m_node ? std::string(m_node->name) : "NONE_OP") {
+      m_op_name(m_node ? std::string(m_node->name) : "NONE_OP"),
+      m_is_static(is_static),
+      m_is_first_token(is_first_token) {
    static std::map<std::string, std::shared_ptr<ov::Node>> model_weights;
     if (m_node) {
-        set_input_output(m_node, model_weights);
+        set_input_output(m_node);
     } else {
         static bool printed = false;
         if (!printed && getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) {
@@ -47,7 +50,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
         set_max_token_len();
 
         static bool weight_created = false;
-        if (!getenv("GGML_OPENVINO_WEIGHT_AS_INPUT") && !weight_created) {
+        if (!weight_created) {
             add_weight_const_parallel(model_weights);
             weight_created = true;
         }
@@ -55,7 +58,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
         for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) {
             auto* cur_node = m_cgraph->nodes[node_n];
             m_nodes.push_back(cur_node);
-            set_input_output(cur_node, model_weights);
+            set_input_output(cur_node);
         }
         m_model_weights = model_weights;
@@ -65,8 +68,7 @@ GgmlOvDecoder::GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgrap
 // Called in GgmlOvDecoder constructor. Two cases: 1. constructing a decoder for the whole graph;
 // 2. constructing a decoder for a node.
-void GgmlOvDecoder::set_input_output(ggml_tensor* node,
-                                     std::map<std::string, std::shared_ptr<ov::Node>>& model_weights) {
+void GgmlOvDecoder::set_input_output(ggml_tensor* node) {
     std::string node_name;
     if (node->op == GGML_OP_CPY) {
         // CPY updates the input tensor in place. For later ov op that uses the
@@ -95,21 +97,7 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
         if (!m_node && !src->view_src) {
             ggml_backend_buffer* buffer = src->buffer;
-            if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-                bool weight_as_input = getenv("GGML_OPENVINO_WEIGHT_AS_INPUT");
-                auto& weights_map = weight_as_input ? m_model_inputs : model_weights;
-                if (weights_map.find(src_name) != weights_map.end()) {
-                    continue;
-                }
-
-                std::shared_ptr<ov::Node> weight_node =
-                    weight_as_input
-                        ? std::make_shared<ov::op::v0::Parameter>(get_ov_type(src), ov::Shape{get_shape(src)})
-                        : create_weight_node(src);
-                weight_node->set_friendly_name(src_name);
-                weights_map[src_name] = weight_node;
-
-            } else if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
+            if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || src->flags & GGML_TENSOR_FLAG_INPUT) {
                 // GGML_BACKEND_BUFFER_USAGE_ANY are kv caches
                 if (buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY) {
                     assert(src_name.find("cache_k") == 0 || src_name.find("cache_v") == 0);
@@ -119,10 +107,24 @@ void GgmlOvDecoder::set_input_output(ggml_tensor* node,
                 }
                 ov::PartialShape input_shape;
                 if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
-                    input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
+                    if (m_is_static) {
+                        input_shape = ov::PartialShape(get_shape(src));
+                        // if (m_is_first_token) {
+                        //     input_shape = ov::PartialShape{1, 1, m_max_token_len};
+                        // } else {
+                        //     input_shape = ov::PartialShape{1, 1, 1};
+                        // }
+                    } else {
+                        input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_max_token_len)};
+                    }
                 } else if (std::string(src->name).find("KQ_mask") == 0) {
-                    auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
-                    input_shape = ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
+                    if (m_is_static) {
+                        input_shape = ov::PartialShape(get_shape(src));
+                    } else {
+                        auto max_token_len = GGML_PAD(m_max_token_len, GGML_KQ_MASK_PAD);
+                        input_shape =
+                            ov::PartialShape{1, ov::Dimension(1, max_token_len), ov::Dimension(1, max_token_len)};
+                    }
                 } else {
                     input_shape = ov::Shape{get_shape(src)};
                 }
@@ -510,7 +512,7 @@ int32_t* GgmlOvDecoder::get_output_op_params(const std::string& name) const {
 
 void GgmlOvDecoder::visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>)> node_visitor) const {
     for (const auto& node : m_nodes) {
-        auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph);
+        auto decoder = std::make_shared<GgmlOvDecoder>(node, m_cgraph, m_is_static, m_is_first_token);
         node_visitor(decoder);
     }
 }
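Note on the shape handling above: for CPU/GPU the decoder keeps the token dimension dynamic as a bounded `ov::Dimension`, so one compiled model serves any prompt length, while the `m_is_static` (NPU) path pins every input to the concrete shape of the current graph, which is what currently forces a recompile when the token count changes. A minimal standalone sketch of the two declarations, assuming a toy `max_token_len` (illustration only, not the backend's code):

```cpp
#include <openvino/core/partial_shape.hpp>

#include <cstdint>
#include <iostream>

int main() {
    const int64_t max_token_len = 512;  // assumption: model context length

    // Dynamic path (CPU/GPU): the token dimension is a bounded interval, so a
    // single compiled model covers prefill (many tokens) and decode (one token).
    ov::PartialShape dynamic_shape{1, 1, ov::Dimension(1, max_token_len)};

    // Static path (NPU): the dimension is fixed for the current request,
    // e.g. 1 during decode, so a shape change requires recompilation.
    ov::PartialShape static_shape{1, 1, 1};

    std::cout << dynamic_shape << " vs " << static_shape << '\n';
}
```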
diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h
index 4d4a928121..b372cc8040 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.h
+++ b/ggml/src/ggml-openvino/ggml-decoder.h
@@ -12,7 +12,7 @@
 class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
 public:
     using ov::frontend::ggml::GgmlDecoder::GgmlDecoder;
 
-    GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph);
+    GgmlOvDecoder(struct ggml_tensor* node, struct ggml_cgraph* cgraph, bool is_static, bool is_first_token);
 
     virtual ov::Any get_attribute(const std::string& name) const override {
         return nullptr;
@@ -89,8 +89,15 @@ public:
         return m_model_output_names;
     }
 
+    virtual bool is_static() const override {
+        return m_is_static;
+    }
+
+    virtual bool is_first_token() const {
+        return m_is_first_token;
+    }
+
 private:
-    void set_input_output(ggml_tensor* node, std::map<std::string, std::shared_ptr<ov::Node>>& model_weights);
+    void set_input_output(ggml_tensor* node);
     void add_extra_inputs();
     static void dump_cgraph(const struct ggml_cgraph* cgraph);
     static std::vector<size_t> get_shape(const ggml_tensor* tensor);
@@ -119,6 +126,8 @@ private:
     std::map<std::string, std::shared_ptr<ov::Tensor>> m_model_extra_input_values;
     std::map<std::string, std::shared_ptr<ov::Node>> m_model_weights;
     std::vector<std::string> m_model_output_names;
+    bool m_is_static;
+    bool m_is_first_token;
 };
 
 void print_tensor_address_map(const struct ggml_cgraph* cgraph);
diff --git a/ggml/src/ggml-openvino/openvino/decoder.hpp b/ggml/src/ggml-openvino/openvino/decoder.hpp
index b3cf75817f..a0b9509336 100644
--- a/ggml/src/ggml-openvino/openvino/decoder.hpp
+++ b/ggml/src/ggml-openvino/openvino/decoder.hpp
@@ -55,6 +55,8 @@ public:
     virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_extra_inputs() const = 0;
     virtual const std::map<std::string, std::shared_ptr<ov::Node>>& get_model_weights() const = 0;
     virtual const std::vector<std::string>& get_model_output_names() const = 0;
+
+    virtual bool is_static() const = 0;
 };
 
 }  // namespace ggml
diff --git a/ggml/src/ggml-openvino/openvino/node_context.hpp b/ggml/src/ggml-openvino/openvino/node_context.hpp
index 44f55222e3..f5940585a6 100644
--- a/ggml/src/ggml-openvino/openvino/node_context.hpp
+++ b/ggml/src/ggml-openvino/openvino/node_context.hpp
@@ -84,6 +84,9 @@ public:
     int get_op_case() const {
         return m_decoder->get_op_case();
     }
+    bool is_static() const {
+        return m_decoder->is_static();
+    }
 
 private:
     std::shared_ptr<GgmlDecoder> m_decoder;
diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
index 7cdeddce38..fe755a5f64 100644
--- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -12,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -57,6 +59,13 @@ OutputVector translate_cpy(const NodeContext& context) {
         token_len = std::make_shared<ov::op::v1::Reshape>(token_len,
                                                           ov::op::v0::Constant::create(ov::element::i64, {0}, {}),
                                                           false);
+
+        if (context.is_static()) {
+            int32_t* op_params = context.get_input_op_params(1);
+            int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2] / num_heads / head_size;
+            past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
+        }
+
         auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
         std::shared_ptr<ov::Node> indices =
             std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len, one, ov::element::i64);
@@ -67,39 +76,88 @@ OutputVector translate_cpy(const NodeContext& context) {
         res = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices, src0);
     } else {
         // Write V to cache_v
-        int64_t total_head_size = src0_shape[1];
-        auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
-
         auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
         auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
+        auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
+
+        auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
+        auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {1});
+
+        int64_t total_head_size = src0_shape[1];
+        auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
+        auto total_head_size_scalar = std::make_shared<ov::op::v0::Squeeze>(total_head_size_node, zero);
 
         auto token_len = get_dimensions(src0.get_node_shared_ptr(), {2});
-        past_token_len = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
-        auto total_token_len = std::make_shared<ov::op::v1::Add>(past_token_len, token_len);
+        auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
+        if (context.is_static()) {
+            int32_t* op_params = context.get_input_op_params(1);
+            int64_t past_token_len_val = op_params[0] / context.get_input_stride(1)[2];
+            past_token_len = ov::op::v0::Constant::create(ov::element::i64, {}, {past_token_len_val});
+        }
+        auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len, token_len_scalar);
+
+        // auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
+        //     src1,
+        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
+        //     false);
+
+        // auto src1_left = std::make_shared<ov::op::v8::Slice>(
+        //     reshaped_src1,
+        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
+        //     std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
+        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
+
+        // auto src1_right = std::make_shared<ov::op::v8::Slice>(
+        //     reshaped_src1,
+        //     std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
+        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
+        //     ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
+
+        // auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
+        //     src0,
+        //     ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
+        //     false);
+
+        // auto res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
+
+        // 1D tensor of shape [total_head_size], values starting from 0
+        auto range_row =
+            std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64);
+        auto range_row_reshaped =
+            std::make_shared<ov::op::v0::Unsqueeze>(range_row,
+                                                    ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
+        auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
+            range_row_reshaped,
+            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
+
+        // 1D tensor of shape [token_len], values starting from past_token_len
+        auto range_col =
+            std::make_shared<ov::op::v4::Range>(past_token_len, total_token_len_scalar, one_scalar, element::i64);
+        auto range_col_reshaped =
+            std::make_shared<ov::op::v0::Unsqueeze>(range_col,
+                                                    ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
+        auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
+            range_col_reshaped,
+            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
+
+        // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
+        auto indices = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
+        auto indices_final = std::make_shared<ov::op::v1::Reshape>(
+            indices,
+            ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}),
+            false);
+
+        auto flattened_src0 =
+            std::make_shared<ov::op::v1::Reshape>(src0,
+                                                  ov::op::v0::Constant::create(element::i64, Shape{1}, {-1}),
+                                                  false);
 
         auto reshaped_src1 = std::make_shared<ov::op::v1::Reshape>(
             src1,
-            ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
+            ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{total_head_size, -1}),
             false);
 
-        auto src1_left = std::make_shared<ov::op::v8::Slice>(
-            reshaped_src1,
-            ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, 0}),
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{one, total_head_size_node, past_token_len}, 0),
-            ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
-
-        auto src1_right = std::make_shared<ov::op::v8::Slice>(
-            reshaped_src1,
-            std::make_shared<ov::op::v0::Concat>(ov::OutputVector{zero, zero, total_token_len}, 0),
-            ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, INT_MAX}),
-            ov::op::v0::Constant::create(ov::element::i64, {3}, {1, 1, 1}));
-
-        auto reshaped_src0 = std::make_shared<ov::op::v1::Reshape>(
-            src0,
-            ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector<int64_t>{1, total_head_size, -1}),
-            false);
-
-        res = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{src1_left, reshaped_src0, src1_right}, 2);
+        auto updated = std::make_shared<ov::op::v3::ScatterNDUpdate>(reshaped_src1, indices_final, flattened_src0);
+        res = std::make_shared<ov::op::v0::Unsqueeze>(updated, zero);
     }
 
     return rename_outputs_with_suffix({res}, context.get_name());
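The Range/Unsqueeze/Broadcast/Concat chain above materializes an explicit `[total_head_size * token_len, 2]` coordinate table so a single ScatterNDUpdate can write the new V values into the 2D cache view, replacing the old Slice-and-Concat approach. A host-side sketch of the same index arithmetic with toy sizes (illustration only, not backend code):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Toy sizes; in the translator these come from src0's shape and op params.
    const int64_t total_head_size = 4;  // rows of the 2D cache view
    const int64_t past_token_len = 2;   // first column slot to write
    const int64_t token_len = 3;        // number of new token slots

    // Each (row, col) pair addresses one element of the [total_head_size, ctx]
    // cache view; ScatterNDUpdate writes the flattened V values at these spots.
    for (int64_t r = 0; r < total_head_size; ++r) {
        for (int64_t t = 0; t < token_len; ++t) {
            std::printf("[%lld, %lld]\n", (long long)r, (long long)(past_token_len + t));
        }
    }
}
```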
diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
index 06e7d9ece0..20ad5683b8 100644
--- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -55,17 +55,21 @@ OutputVector translate_mulmat(const NodeContext& context) {
     ov::Output<ov::Node> A;
     ov::Output<ov::Node> B;
 
-    auto attention_size = context.get_input("attention_size");
-
     auto src0 = context.get_input(0);
     auto src0_shape = context.get_input_shape(0).to_shape();
     auto src0_stride = context.get_input_stride(0);
     auto permuted = is_permuted(src0_stride);
     auto token_dim = permuted ? 0 : 2;
 
+    auto attention_size = context.get_input("attention_size");
+
     auto src0_perm = argsort_descend(src0_stride);
     auto src0_original_shape_ = permute(src0_shape, src0_perm);
     std::vector<int64_t> src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end());
+
+    if (context.is_static()) {
+        attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {src0_original_shape[token_dim]});
+    }
     src0_original_shape[token_dim] = -1;
 
     auto src0_slice_shape = src0_original_shape;
diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
index 7b8b582dac..4b230ad630 100644
--- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp
@@ -1,8 +1,9 @@
+#include
 #include
 #include
 #include
 #include
-#include
+#include
 #include
 
 #include "../node_context.hpp"
@@ -16,28 +17,24 @@ namespace op {
 OutputVector translate_rms_norm(const NodeContext& context) {
     num_inputs_check(context, 1, 1);
 
-    ov::Shape input_shape = context.get_input_shape(0).to_shape();
     auto input_node = context.get_input(0);
 
     auto square = std::make_shared<ov::op::v1::Multiply>(input_node, input_node);
-    auto reduce_sum =
-        std::make_shared<ov::op::v1::ReduceSum>(square,
-                                                ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}),
-                                                true);
-
-    auto mean = std::make_shared<ov::op::v1::Divide>(
-        reduce_sum,
-        ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {static_cast<float>(input_shape[2])}));
+    auto mean =
+        std::make_shared<ov::op::v1::ReduceMean>(square,
+                                                 ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {2}),
+                                                 true);
 
     float eps;
     memcpy(&eps, context.get_output_op_params(0), sizeof(float));
+
     auto rms = std::make_shared<ov::op::v0::Sqrt>(
         std::make_shared<ov::op::v1::Add>(mean, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {eps})));
 
-    auto scale =
-        std::make_shared<ov::op::v1::Divide>(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{}, {1.0f}), rms);
+    auto reciprocal =
+        std::make_shared<ov::op::v1::Divide>(ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {1.0f}), rms);
 
-    auto res = std::make_shared<ov::op::v1::Multiply>(input_node, scale);
+    auto res = std::make_shared<ov::op::v1::Multiply>(input_node, reciprocal);
 
     return rename_outputs_with_suffix({res}, context.get_name());
 }
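For reference, the rewritten subgraph computes the standard RMS norm, y = x / sqrt(mean(x²) + eps), with the mean taken over the hidden dimension; folding the old ReduceSum-then-Divide pair into a single ReduceMean is also what the `compute-layers-with-higher-precision=ReduceMean` hint in the NPU config in utils.cpp below targets. A scalar sketch of the math with toy data (not backend code):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // Toy hidden vector; eps corresponds to the op param read via memcpy above.
    const float x[4] = {1.0f, -2.0f, 3.0f, -4.0f};
    const float eps = 1e-6f;

    float mean_sq = 0.0f;
    for (float v : x) mean_sq += v * v;
    mean_sq /= 4.0f;  // ReduceMean over the hidden dim

    const float scale = 1.0f / std::sqrt(mean_sq + eps);  // Divide(1, Sqrt(Add(mean, eps)))
    for (float v : x) std::printf("%f\n", v * scale);     // Multiply(input, reciprocal)
}
```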
diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp
index 94810e549d..b47b8a6a54 100644
--- a/ggml/src/ggml-openvino/openvino/op/rope.cpp
+++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp
@@ -1,4 +1,3 @@
-
 #include
 #include
@@ -23,6 +22,10 @@
 #include "../node_context.hpp"
 #include "../utils.hpp"
 
+#ifndef M_PI
+# define M_PI 3.14159265358979323846
+#endif
+
 #define GGML_ROPE_TYPE_NEOX 2
 
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
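Note on the `M_PI` guard added above: `M_PI` is not part of standard C++. `<cmath>`/`<math.h>` expose it only as an extension, and MSVC in particular requires `_USE_MATH_DEFINES` to be defined before inclusion, so supplying the literal as a fallback keeps rope.cpp building across toolchains.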
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 040ca1961e..65a609f1d7 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -4,11 +4,13 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
@@ -17,8 +19,8 @@
 #include "openvino/frontend.hpp"
 #include "openvino/input_model.hpp"
 
-std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph) {
-    return std::make_shared<GgmlOvDecoder>(nullptr, cgraph);
+std::shared_ptr<GgmlOvDecoder> get_ggml_decoder(struct ggml_cgraph* cgraph, bool is_static, bool is_first_token) {
+    return std::make_shared<GgmlOvDecoder>(nullptr, cgraph, is_static, is_first_token);
 }
 
 ov::Tensor get_ggml_graph_input_tensor(std::shared_ptr<GgmlOvDecoder> ggml_decoder, std::string& name) {
@@ -49,50 +51,63 @@ static ov::frontend::FrontEnd::Ptr get_ggml_frontend() {
 }
 
 enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph) {
+    static ov::Core core;
+    static bool is_first_token = true;
+
+    static std::string device = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "";
+    if (device.empty()) {
+        // Prefer GPU over CPU
+        for (const auto& dev : core.get_available_devices()) {
+            device = dev;
+            if (device == "GPU")
+                break;
+        }
+    }
+
+    bool is_static = device == "NPU" ? true : false;
+    ov::AnyMap config;
+    if (is_static) {
+        config = {
+            {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"},
+            {"NPU_USE_NPUW", "YES"},
+            {"NPUW_DEVICES", "NPU"},
+            {"NPUW_FOLD", "YES"},
+            // {"NPU_COMPILER_TYPE", "MLIR"},
+        };
+    }
+
     auto start_time = ggml_time_us();
 
-    static ov::Core core;
     auto* cache_dir = getenv("GGML_OPENVINO_CACHE_DIR");
-    if (cache_dir) {
+    if (cache_dir && !is_static) {
         core.set_property(ov::cache_dir(cache_dir));
     }
 
-    // auto devices = core.get_available_devices();
-
-    static auto front_end = get_ggml_frontend();
-    // if (!front_end) {
-    //     GGML_LOG_ERROR("GGML FrontEnd is not initialized \n");
-    //     return GGML_STATUS_FAILED;
-    // }
-
-    using CachedItem = std::pair<std::shared_ptr<ov::Model>, ov::CompiledModel>;
+    // For CPU and GPU there is only one compiled model, so only the first element of the pair is used.
+    // For NPU there are a prefill model and a kvcache model. (This is the intended approach, but it is not
+    // implemented yet; currently we recompile for every token.)
+    using CachedItem = std::pair<std::shared_ptr<ov::Model>, std::pair<ov::CompiledModel, ov::CompiledModel>>;
     static std::unordered_map<struct ggml_cgraph*, CachedItem> compiled_cache;
 
     std::shared_ptr<ov::Model> model;
-    ov::CompiledModel compiled_model;
+    ov::CompiledModel compiled_model_prefill;
+    ov::CompiledModel compiled_model_kvcache;
 
     int64_t decoder_end_time;
     int64_t conversion_end_time;
     int64_t compile_end_time;
 
-    auto ggml_decoder = get_ggml_decoder(cgraph);
+    auto ggml_decoder = get_ggml_decoder(cgraph, is_static, is_first_token);
     decoder_end_time = ggml_time_us();
 
     auto it = compiled_cache.find(cgraph);
-    if (it != compiled_cache.end()) {
+    if (it != compiled_cache.end() && !is_static) {
         model = it->second.first;
         conversion_end_time = ggml_time_us();
-        compiled_model = it->second.second;
+        compiled_model_prefill = it->second.second.first;
+        compiled_model_kvcache = it->second.second.second;
         compile_end_time = ggml_time_us();
     } else {
-        // std::shared_ptr<ov::frontend::ggml::GgmlDecoder> graph_decoder = ggml_decoder;
-        // ov::frontend::InputModel::Ptr input_model = front_end->load(graph_decoder);
-        // if (!input_model) {
-        //     GGML_LOG_ERROR("Input Model is not loaded \n");
-        //     return GGML_STATUS_FAILED;
-        // }
-
-        // model = front_end->convert(input_model);
         ov::frontend::InputModel::Ptr input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
         model = ov::frontend::ggml::FrontEnd::convert(input_model);
@@ -105,16 +120,23 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
             ov::serialize(model, timestamped_filename);
         }
 
-        if (!model) {
-            GGML_LOG_ERROR("Model is not converted \n");
-        }
-        compiled_model = core.compile_model(model, "CPU");
+        compiled_model_prefill = core.compile_model(model, device, config);
         compile_end_time = ggml_time_us();
-        compiled_cache[cgraph] = std::make_pair(model, compiled_model);
+        compiled_cache[cgraph] = std::make_pair(model, std::make_pair(compiled_model_prefill, compiled_model_kvcache));
     }
 
-    ov::InferRequest infer_request = compiled_model.create_infer_request();
+    ov::InferRequest infer_request;
+    if (!is_static) {
+        infer_request = compiled_model_prefill.create_infer_request();
+    } else {
+        infer_request = compiled_model_prefill.create_infer_request();
+        // if (is_first_token) {
+        //     infer_request = compiled_model_prefill.create_infer_request();
+        // } else {
+        //     infer_request = compiled_model_kvcache.create_infer_request();
+        // }
+    }
 
     auto ov_params = model->get_parameters();
     for (size_t i = 0; i < ov_params.size(); i++) {
@@ -148,6 +170,8 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c
     }
     auto end_time = ggml_time_us();
 
+    is_first_token = false;
+
     if (getenv("GGML_OPENVINO_PROFILING")) {
         GGML_LOG_INFO("GGML OpenVINO Backend: \n");
         GGML_LOG_INFO("  - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000);
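The cache comment above describes a prefill/kvcache split that this patch does not yet implement: on NPU it reuses `compiled_model_prefill` for every token. A hypothetical sketch of the intended request selection, with invented names, purely to illustrate the design the pair-of-compiled-models cache is preparing for:

```cpp
#include <openvino/runtime/compiled_model.hpp>
#include <openvino/runtime/infer_request.hpp>

// Hypothetical helper (not in this patch): pick the infer request by phase.
// Dynamic devices (CPU/GPU) have one model; static devices (NPU) would use a
// prefill model for the prompt and a kvcache model for single-token decode.
ov::InferRequest pick_request(bool is_static, bool is_first_token,
                              ov::CompiledModel& prefill, ov::CompiledModel& kvcache) {
    if (!is_static || is_first_token) {
        return prefill.create_infer_request();  // dynamic device or prompt phase
    }
    return kvcache.create_infer_request();      // single-token decode phase
}
```

Until that lands, `GGML_OPENVINO_DEVICE=NPU` selects the static-shape path and pays a recompilation per token, while leaving the variable unset picks GPU when available and otherwise falls back to the first device OpenVINO reports.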