diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index 3dc2a3eeac..b43f45dbbd 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -236,8 +236,9 @@ void GgmlOvDecoder::set_llm_params() {
 }
 
 ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
+    auto name = std::string(src->name);
     ov::PartialShape input_shape;
-    if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
+    if (name == "inp_tokens" || name == "inp_pos") {
         if (m_is_static) {
             if (m_is_first_token) {
                 input_shape = ov::PartialShape{1, 1, m_context_size};
@@ -247,7 +248,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
         } else {
             input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
         }
-    } else if (std::string(src->name) == "KQ_mask") {
+    } else if (name == "inp_out_ids" && !m_is_static) {
+        input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
+    } else if (name == "KQ_mask") {
         if (m_is_static) {
             if (m_is_first_token) {
                 input_shape = ov::PartialShape{1, m_context_size, m_context_size};
@@ -258,9 +261,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
             auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD);
             input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
         }
-    } else if (std::string(src->name).find("cache_k") == 0) {
+    } else if (name.find("cache_k") == 0) {
         input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
-    } else if (std::string(src->name).find("cache_v") == 0) {
+    } else if (name.find("cache_v") == 0) {
         input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
     } else if (src->op == GGML_OP_VIEW) {
         // This case is added to make test-backend-ops work
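Note on the shapes above: in the non-static path, the decoder declares bounded dynamic axes with `ov::Dimension(lower, upper)`, so one compiled model accepts any token count up to the context size. A minimal standalone sketch of that behavior (the `context_size` value is hypothetical):

```cpp
#include <iostream>
#include <openvino/openvino.hpp>

int main() {
    // Bounded dynamic axis, as used for inp_tokens / inp_pos / inp_out_ids above:
    // any length from 1 to context_size is accepted at inference time.
    const int64_t context_size = 4096;  // hypothetical value
    ov::PartialShape shape{1, 1, ov::Dimension(1, context_size)};

    std::cout << shape << std::endl;                          // e.g. [1,1,1..4096]
    std::cout << std::boolalpha << shape.is_dynamic() << std::endl;  // true
    return 0;
}
```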
@@ -273,18 +276,22 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
 
 void GgmlOvDecoder::add_extra_inputs() {
     // Extra inputs:
-    // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
-    //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
-    //    Not used for NPU
+    // 1. `past_token_len`, used to create indices for updating the kv cache. Usually equal to inp_pos[0], except
+    //    for llama-perplexity.
+    // 2. `attention_size`, used in the matmuls in the attention block. The shapes of those matmuls are 32-aligned,
+    //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
+    //    Not used for NPU.
+    int64_t past_token_len = -1;
     int64_t attention_size = -1;
-    int64_t past_token_len = -1;
+    int64_t token_len = -1;
     int64_t past_token_len_from_inp_pos = -1;
 
     for (const auto& node : m_nodes) {
         if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") {
             if (node->src[1]->type != GGML_TYPE_I32) {
                 throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32");
             }
+            token_len = node->src[1]->ne[0];
             past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0];
         }
         if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
@@ -294,29 +301,39 @@ void GgmlOvDecoder::add_extra_inputs() {
             break;
         }
     }
+
     if (past_token_len == -1) {
         throw std::runtime_error("Failed to find input \"cache_k\" in the graph");
     }
     if (past_token_len != past_token_len_from_inp_pos) {
-        throw std::runtime_error("Mismatch between past_token_len from cache_k and inp_pos: " +
-                                 std::to_string(past_token_len) + " vs " + std::to_string(past_token_len_from_inp_pos));
+        GGML_LOG_DEBUG("Mismatch between past_token_len from cache_k and inp_pos: %ld vs %ld\n",
+                       past_token_len,
+                       past_token_len_from_inp_pos);
     }
 
-    for (const auto& node : m_nodes) {
-        if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
-            int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
-            attention_size = GGML_PAD(total_token_len, 32);
-            std::string name = "attention_size";
-            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
-            param_node->set_friendly_name(name);
-            param_node->output(0).get_tensor().set_names({name});
-            m_model_extra_inputs[name] = param_node;
-
-            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
-            *tensor->data<int64_t>() = attention_size;
-            m_model_extra_input_values[name] = tensor;
-            break;
-        }
-    }
+    {
+        std::string name = "past_token_len";
+        auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
+        param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
+        m_model_extra_inputs[name] = param_node;
+
+        auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
+        *tensor->data<int64_t>() = past_token_len;
+        m_model_extra_input_values[name] = tensor;
+    }
+    {
+        int64_t total_token_len = token_len + past_token_len;
+        attention_size = GGML_PAD(total_token_len, 32);
+        std::string name = "attention_size";
+        auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
+        param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
+        m_model_extra_inputs[name] = param_node;
+
+        auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
+        *tensor->data<int64_t>() = attention_size;
+        m_model_extra_input_values[name] = tensor;
+    }
 }
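For intuition, `attention_size` is simply `past_token_len + token_len` rounded up to the next multiple of 32. A self-contained sketch of that padding arithmetic, using the same `GGML_PAD` definition as ggml.h and hypothetical token counts:

```cpp
#include <cstdint>
#include <cstdio>

// Same macro as in ggml.h: round x up to a multiple of n (n must be a power of two).
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

int main() {
    int64_t past_token_len = 45;  // hypothetical: tokens already in the kv cache
    int64_t token_len = 3;        // hypothetical: tokens in the current batch
    int64_t attention_size = GGML_PAD(past_token_len + token_len, 32);
    std::printf("%lld\n", (long long) attention_size);  // prints 64
    return 0;
}
```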
diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp
index 129c3592c9..83581ec5a8 100644
--- a/ggml/src/ggml-openvino/openvino/translate_session.cpp
+++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -78,11 +79,11 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
     // cache_k layout: [S, N, H] (seq, num_heads, head_size)
     // cache_v layout: [N, H, S] (num_heads, head_size, seq)
     // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened
-    auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
+    auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr();
     auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
 
-    std::shared_ptr<ov::Node> update_indices_k;
-    std::shared_ptr<ov::Node> update_indices_v;
+    Output<Node> update_indices_k;
+    Output<Node> update_indices_v;
 
     auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
     auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
@@ -90,11 +91,19 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
     auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
     auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
 
-    update_indices_k =
-        std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
-    update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices_k, one);
-    update_indices_k->set_friendly_name("update_indices_k");
-    tensor_map.insert({"update_indices_k", update_indices_k->output(0)});
+    auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
+    auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
+    auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
+
+    Output<Node> update_indices = std::make_shared<ov::op::v4::Range>(
+        past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64);
+    if (ggml_model_decoder.is_static()) {
+        update_indices = past_token_len;
+    }
+
+    update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices, one);
+    update_indices_k.get_node_shared_ptr()->set_friendly_name("update_indices_k");
+    tensor_map.insert({"update_indices_k", update_indices_k});
 
     auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size();
     auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
@@ -102,7 +111,7 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
 
     // 1D tensor of shape [total_head_size], values starting from 0
     auto range_row =
-        std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i32);
+        std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64);
     auto range_row_reshaped =
         std::make_shared<ov::op::v0::Unsqueeze>(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
     auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
@@ -110,8 +119,7 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
         std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
 
     // 1D tensor of shape [token_len], values starting from past_token_len
-    auto range_col =
-        std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+    auto range_col = update_indices;
     auto range_col_reshaped =
         std::make_shared<ov::op::v0::Unsqueeze>(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
     auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
@@ -119,26 +127,11 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
         std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
 
     // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
-    auto indices = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
+    update_indices_v = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
     update_indices_v = std::make_shared<ov::op::v1::Reshape>(
-        indices, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}), false);
-    update_indices_v->set_friendly_name("update_indices_v");
-    tensor_map.insert({"update_indices_v", update_indices_v->output(0)});
-}
-
-float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-#ifndef M_PI
-#    define M_PI 3.14159265358979323846
-#endif
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
-}
-
-void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
-                              float dims[2]) {
-    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
-    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
-    dims[0] = std::max(0.0f, start);
-    dims[1] = std::min(static_cast<float>(n_dims - 1), end);
+        update_indices_v, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}), false);
+    update_indices_v.get_node_shared_ptr()->set_friendly_name("update_indices_v");
+    tensor_map.insert({"update_indices_v", update_indices_v});
 }
 
 void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index be06c54e8b..45ed73499f 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -356,10 +356,13 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor)
             std::cout << *(tensor.data<float>()) << std::endl;
             break;
         case ov::element::f16:
-            std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) << std::endl;
+            std::cout << *(tensor.data<ov::float16>()) << std::endl;
             break;
         case ov::element::i32:
-            std::cout << *(tensor.data<int32_t>()) << std::endl;
+            for (size_t i = 0; i < tensor.get_size(); ++i) {
+                std::cout << tensor.data<int32_t>()[i] << " ";
+            }
+            std::cout << std::endl;
             break;
         case ov::element::i64:
             std::cout << *(tensor.data<int64_t>()) << std::endl;
@@ -379,7 +382,7 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor,
             std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
             break;
         case ov::element::f16:
-            std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) << std::endl;
+            std::cout << *(tensor.data<ov::float16>()) << std::endl;
             std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
             break;
         default:
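The i32 branch above now prints every element rather than just the first, which matters for inputs like inp_pos that carry one position per token. A minimal standalone sketch of the same pattern (tensor contents are hypothetical):

```cpp
#include <iostream>
#include <openvino/openvino.hpp>

// Print every element through a typed view, mirroring the i32 branch above;
// ov::Tensor::data<T>() throws if T does not match the tensor's element type.
template <typename T>
void print_all(const ov::Tensor& tensor) {
    const T* data = tensor.data<T>();
    for (size_t i = 0; i < tensor.get_size(); ++i) {
        std::cout << data[i] << " ";
    }
    std::cout << std::endl;
}

int main() {
    // Hypothetical inp_pos-style input: positions of a 3-token batch.
    ov::Tensor pos(ov::element::i32, ov::Shape{1, 1, 3});
    int32_t* p = pos.data<int32_t>();
    p[0] = 45; p[1] = 46; p[2] = 47;
    print_all<int32_t>(pos);  // prints: 45 46 47
    return 0;
}
```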