diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 29be4dbae8..66f82773e3 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -222,11 +222,11 @@ void GgmlOvDecoder::add_extra_inputs() { past_token_len = (int64_t)(node->src[1]->op_params[0] / node->src[1]->nb[0] / head_size / num_heads); std::string name = "past_token_len"; - auto param_node = std::make_shared(ov::element::i64, ov::Shape{}); + auto param_node = std::make_shared(ov::element::i64, ov::Shape{1}); param_node->set_friendly_name(name); m_model_extra_inputs[name] = param_node; - auto tensor = std::make_shared(ov::element::i64, ov::Shape{}); + auto tensor = std::make_shared(ov::element::i64, ov::Shape{1}); *tensor->data() = past_token_len; m_model_extra_input_values[name] = tensor; break; diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 75dd0e7d83..4973645024 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -34,7 +34,7 @@ OutputVector translate_cpy(const NodeContext& context) { auto src0 = context.get_input(0); auto src1 = context.get_input(1); - auto past_token_len_scalar = context.get_input("past_token_len"); + auto past_token_len = context.get_input("past_token_len"); src0 = std::make_shared(src0, context.get_input_type(1)); ov::Output res; @@ -68,18 +68,16 @@ OutputVector translate_cpy(const NodeContext& context) { std::shared_ptr indices; if (context.is_static()) { - indices = past_token_len_scalar.get_node_shared_ptr(); - indices = std::make_shared( - indices, - ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector{0, 1})); + indices = past_token_len.get_node_shared_ptr(); } else { + auto past_token_len_scalar = std::make_shared(past_token_len, zero); auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); indices = std::make_shared(past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64); - indices = std::make_shared(indices, one); } + indices = std::make_shared(indices, one); res = std::make_shared(reshaped_src1, indices, src0); } else { @@ -108,11 +106,9 @@ OutputVector translate_cpy(const NodeContext& context) { // 1D tensor of shape [token_len], values starting from past_token_len std::shared_ptr range_col; if (context.is_static()) { - range_col = past_token_len_scalar.get_node_shared_ptr(); - range_col = std::make_shared( - range_col, - ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{0})); + range_col = past_token_len.get_node_shared_ptr(); } else { + auto past_token_len_scalar = std::make_shared(past_token_len, zero); auto total_token_len_scalar = std::make_shared(past_token_len_scalar, token_len_scalar); range_col = std::make_shared(past_token_len_scalar, total_token_len_scalar, diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 20ad5683b8..0d3190f6c1 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -68,7 +69,7 @@ OutputVector translate_mulmat(const NodeContext& context) { std::vector src0_original_shape(src0_original_shape_.begin(), src0_original_shape_.end()); if (context.is_static()) { - attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {src0_original_shape[token_dim]}); + attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {INT_MAX}); } src0_original_shape[token_dim] = -1; diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 3e49081515..fe46b8a794 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -1,6 +1,7 @@ #include "utils.h" #include +#include #include #include #include @@ -70,15 +71,17 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c ov::AnyMap config; if (device == "NPU") { config = { - {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean"}, - {"NPU_USE_NPUW", "YES"}, - {"NPUW_DEVICES", "NPU"}, - {"NPUW_FOLD", "YES"}, - {"NPUW_DQ", "YES"}, - {"NPUW_FUNCALL_ASYNC", "YES"}, - {"NPUW_HOST_GATHER", "YES"}, - {"NPUW_WEIGHTS_BANK", "shared"}, - // {"NPU_COMPILER_TYPE", "MLIR"}, + { "NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=ReduceMean" }, + { "NPU_USE_NPUW", "YES" }, + { "NPUW_DEVICES", "NPU" }, + { "NPUW_FOLD", "YES" }, + { "NPUW_HOST_GATHER", "YES" }, + { "NPUW_DQ", "YES" }, + { "NPUW_FUNCALL_ASYNC", "YES" }, + { "NPUW_WEIGHTS_BANK", "shared" }, + // Option 'CACHE_DIR' is not supported with MLIR compiler type + // {"NPUW_CACHE_DIR", getenv("GGML_OPENVINO_CACHE_DIR") ? getenv("GGML_OPENVINO_CACHE_DIR") : ""}, + { "NPU_COMPILER_TYPE", "MLIR" }, }; } @@ -102,15 +105,21 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c int64_t conversion_end_time; int64_t compile_end_time; + bool is_first_token = is_prefill(cgraph); + auto it = compiled_cache_prefill.find(cgraph); - bool is_first_token = it == compiled_cache_prefill.end(); - if (!is_first_token) { + if (it != compiled_cache_prefill.end()) { ggml_decoder = get_ggml_decoder(cgraph, is_static, false); decoder_end_time = ggml_time_us(); if (is_static) { - model = compiled_cache_kvcache[cgraph].first; - compiled_model = compiled_cache_kvcache[cgraph].second; + if (is_first_token) { + model = compiled_cache_prefill[cgraph].first; + compiled_model = compiled_cache_prefill[cgraph].second; + } else { + model = compiled_cache_kvcache[cgraph].first; + compiled_model = compiled_cache_kvcache[cgraph].second; + } } else { model = it->second.first; compiled_model = it->second.second; @@ -235,8 +244,6 @@ enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_c } auto end_time = ggml_time_us(); - is_first_token = false; - if (getenv("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("GGML OpenVINO Backend: \n"); GGML_LOG_INFO(" - Graph decoder Time: %ld ms \n", (decoder_end_time - start_time) / 1000); @@ -305,3 +312,20 @@ void set_zero_diagonal(std::vector& matrix, size_t dim) { matrix[i * dim + i] = 0.0f; } } + +bool is_prefill(struct ggml_cgraph * cgraph) { + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto * op = cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; ++j) { + auto* src = op->src[j]; + if (src == nullptr) { + break; + } + if (std::string(src->name) == "inp_tokens") { + return src->ne[0] != 1; + } + } + } + GGML_LOG_ERROR("is_prefill: inp_tokens not found in cgraph"); + throw std::runtime_error("is_prefill: inp_tokens not found in cgraph"); +} diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 000c2b87c1..2427b0b1ce 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -2,6 +2,7 @@ #include "ggml-backend-impl.h" #include "ggml-decoder.h" +#include "ggml-impl.h" enum ggml_status openvino_frontend_compute(ggml_backend_t backend, struct ggml_cgraph* cgraph); @@ -35,3 +36,5 @@ std::vector pad_input(const ggml_tensor* tensor, size_t padded_rows, size_t p } void set_zero_diagonal(std::vector& matrix, size_t dim); + +bool is_prefill(struct ggml_cgraph * cgraph);