Fix llama-perplexity
parent 75eec6265f
commit 4e7f04a307

@@ -236,8 +236,9 @@ void GgmlOvDecoder::set_llm_params() {
 }
 
 ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
+    auto name = std::string(src->name);
     ov::PartialShape input_shape;
-    if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
+    if (name == "inp_tokens" || name == "inp_pos") {
         if (m_is_static) {
             if (m_is_first_token) {
                 input_shape = ov::PartialShape{1, 1, m_context_size};
@@ -247,7 +248,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
         } else {
             input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
         }
-    } else if (std::string(src->name) == "KQ_mask") {
+    } else if (name == "inp_out_ids" && !m_is_static) {
+        input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
+    } else if (name == "KQ_mask") {
         if (m_is_static) {
             if (m_is_first_token) {
                 input_shape = ov::PartialShape{1, m_context_size, m_context_size};
@@ -258,9 +261,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
             auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD);
             input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
         }
-    } else if (std::string(src->name).find("cache_k") == 0) {
+    } else if (name.find("cache_k") == 0) {
         input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
-    } else if (std::string(src->name).find("cache_v") == 0) {
+    } else if (name.find("cache_v") == 0) {
         input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
     } else if (src->op == GGML_OP_VIEW) {
         // This case is added to make test-backend-ops work
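For readers less familiar with OpenVINO shape notation, the branches above build bounded dynamic shapes with ov::Dimension(min, max). Below is a minimal standalone sketch of the same idea; the helper name and the fallback are illustrative, not part of the decoder:

    #include <openvino/core/partial_shape.hpp>

    #include <cstdint>
    #include <string>

    // Sketch: map a graph-input name to a bounded dynamic shape, in the spirit of
    // GgmlOvDecoder::get_graph_input_shape (dynamic, non-static path only).
    ov::PartialShape shape_for_input(const std::string& name, int64_t context_size) {
        if (name == "inp_tokens" || name == "inp_pos" || name == "inp_out_ids") {
            // the last dimension can hold anywhere from 1 to context_size tokens
            return ov::PartialShape{1, 1, ov::Dimension(1, context_size)};
        }
        if (name == "KQ_mask") {
            return ov::PartialShape{1, ov::Dimension(1, context_size), ov::Dimension(1, context_size)};
        }
        return ov::PartialShape::dynamic();  // fallback for inputs not covered in this sketch
    }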
@@ -273,18 +276,22 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
 
 void GgmlOvDecoder::add_extra_inputs() {
     // Extra inputs:
-    // 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
-    //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
-    //    Not used for NPU
+    // 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for
+    //    llama-perplexity.
+    // 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
+    //    see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
+    //    Not used for NPU
-    int64_t past_token_len = -1;
     int64_t attention_size = -1;
 
+    int64_t past_token_len = -1;
+    int64_t token_len = -1;
+    int64_t past_token_len_from_inp_pos = -1;
     for (const auto& node : m_nodes) {
         if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") {
             if (node->src[1]->type != GGML_TYPE_I32) {
                 throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32");
             }
             token_len = node->src[1]->ne[0];
             past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0];
         }
         if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
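As a concrete illustration of the loop above: for a regular decode step of 4 new tokens with 7 tokens already in the cache, inp_pos holds {7, 8, 9, 10}, so ne[0] gives token_len = 4 and the first element gives past_token_len_from_inp_pos = 7 (llama-perplexity is the case where that first-element shortcut does not hold). A minimal sketch of that read, assuming the I32 type check has already passed; the helper name is made up:

    #include <cstdint>

    #include "ggml.h"

    // Sketch: read the token count and the first position from the `inp_pos`
    // tensor, mirroring the GGML_OP_ROPE branch above.
    static void read_inp_pos(const ggml_tensor* inp_pos, int64_t& token_len, int64_t& first_pos) {
        token_len = inp_pos->ne[0];                       // number of new tokens this step
        first_pos = ((const int32_t*) inp_pos->data)[0];  // position of the first new token
    }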
@@ -294,29 +301,39 @@ void GgmlOvDecoder::add_extra_inputs() {
             break;
         }
     }
 
     if (past_token_len == -1) {
         throw std::runtime_error("Failed to find input \"cache_k\" in the graph");
     }
     if (past_token_len != past_token_len_from_inp_pos) {
-        throw std::runtime_error("Mismatch between past_token_len from cache_k and inp_pos: " +
-                                 std::to_string(past_token_len) + " vs " + std::to_string(past_token_len_from_inp_pos));
+        GGML_LOG_DEBUG("Mismatch between past_token_len from cache_k and inp_pos: %ld vs %ld\n",
+                       past_token_len,
+                       past_token_len_from_inp_pos);
     }
 
-    for (const auto& node : m_nodes) {
-        if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
-            int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
-            attention_size = GGML_PAD(total_token_len, 32);
-            std::string name = "attention_size";
-            auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
-            param_node->set_friendly_name(name);
-            param_node->output(0).get_tensor().set_names({name});
-            m_model_extra_inputs[name] = param_node;
+    {
+        std::string name = "past_token_len";
+        auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
+        param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
+        m_model_extra_inputs[name] = param_node;
 
-            auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
-            *tensor->data<int64_t>() = attention_size;
-            m_model_extra_input_values[name] = tensor;
-            break;
-        }
+        auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
+        *tensor->data<int64_t>() = past_token_len;
+        m_model_extra_input_values[name] = tensor;
     }
+    {
+        int64_t total_token_len = token_len + past_token_len;
+        attention_size = GGML_PAD(total_token_len, 32);
+        std::string name = "attention_size";
+        auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
+        param_node->set_friendly_name(name);
+        param_node->output(0).get_tensor().set_names({name});
+        m_model_extra_inputs[name] = param_node;
 
+        auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
+        *tensor->data<int64_t>() = attention_size;
+        m_model_extra_input_values[name] = tensor;
+    }
 }
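For context on how such extra inputs are typically consumed: each one is an ov::Parameter with a named output, so at inference time a one-element i64 tensor carrying the current value can be bound by name. The following is a hypothetical call site, not code from this backend:

    #include <openvino/openvino.hpp>

    #include <cstdint>
    #include <string>

    // Hypothetical sketch: bind a per-step scalar such as "past_token_len" or
    // "attention_size" to an infer request, matched by the tensor name that was
    // set on the Parameter in add_extra_inputs().
    void set_extra_scalar_input(ov::InferRequest& request, const std::string& name, int64_t value) {
        ov::Tensor t(ov::element::i64, ov::Shape{1});
        t.data<int64_t>()[0] = value;
        request.set_tensor(name, t);
    }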
@@ -5,6 +5,7 @@
 #include <map>
 #include <memory>
 #include <openvino/core/node.hpp>
+#include <openvino/op/add.hpp>
 #include <openvino/op/broadcast.hpp>
 #include <openvino/op/concat.hpp>
 #include <openvino/op/convert.hpp>
@@ -78,11 +79,11 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     // cache_k layout: [S, N, H] (seq, num_heads, head_size)
     // cache_v layout: [N, H, S] (num_heads, head_size, seq)
     // When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened
-    auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
+    auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr();
     auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
 
-    std::shared_ptr<ov::Node> update_indices_k;
-    std::shared_ptr<ov::Node> update_indices_v;
+    Output<Node> update_indices_k;
+    Output<Node> update_indices_v;
 
     auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
     auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
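Switching update_indices_k/v from std::shared_ptr<ov::Node> to ov::Output<ov::Node> lets the same variable hold either a freshly built op or an existing Parameter output, which the static (NPU) branch below relies on. A tiny illustration of that property; the helper and its names are illustrative only:

    #include <openvino/openvino.hpp>

    // Sketch: an ov::Output<ov::Node> can refer to any producer, so one variable
    // can be overridden with a Parameter's output on the static path.
    ov::Output<ov::Node> pick_indices(const ov::Output<ov::Node>& computed,
                                      const ov::Output<ov::Node>& parameter_output,
                                      bool is_static) {
        return is_static ? parameter_output : computed;
    }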
@@ -90,11 +91,19 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
     auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
     auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
 
-    update_indices_k =
-        std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
-    update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices_k, one);
-    update_indices_k->set_friendly_name("update_indices_k");
-    tensor_map.insert({"update_indices_k", update_indices_k->output(0)});
+    auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
+    auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
+    auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
+
+    Output<Node> update_indices = std::make_shared<ov::op::v4::Range>(
+        past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64);
+    if (ggml_model_decoder.is_static()) {
+        update_indices = past_token_len;
+    }
+
+    update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices, one);
+    update_indices_k.get_node_shared_ptr()->set_friendly_name("update_indices_k");
+    tensor_map.insert({"update_indices_k", update_indices_k});
 
     auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size();
     auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
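In plain terms, the Range subgraph above enumerates the K-cache rows to overwrite: the half-open interval [past_token_len, past_token_len + token_len). A plain C++ sketch of the same computation, with a worked example in the comment:

    #include <cstdint>
    #include <vector>

    // Sketch: the values produced by Range(past_token_len, past_token_len + token_len, 1).
    // With past_token_len = 7 and token_len = 4 this returns {7, 8, 9, 10}.
    std::vector<int64_t> k_update_rows(int64_t past_token_len, int64_t token_len) {
        std::vector<int64_t> rows;
        rows.reserve(static_cast<size_t>(token_len));
        for (int64_t i = past_token_len; i < past_token_len + token_len; ++i) {
            rows.push_back(i);
        }
        return rows;
    }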
@@ -102,7 +111,7 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
 
     // 1D tensor of shape [total_head_size], values starting from 0
     auto range_row =
-        std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i32);
+        std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64);
     auto range_row_reshaped =
         std::make_shared<ov::op::v0::Unsqueeze>(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
     auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
@@ -110,8 +119,7 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
         std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
 
     // 1D tensor of shape [token_len], values starting from past_token_len
-    auto range_col =
-        std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
+    auto range_col = update_indices;
     auto range_col_reshaped =
         std::make_shared<ov::op::v0::Unsqueeze>(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
     auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
@@ -119,26 +127,11 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
         std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
 
     // Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
-    auto indices = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
+    update_indices_v = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
     update_indices_v = std::make_shared<ov::op::v1::Reshape>(
-        indices, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}), false);
-    update_indices_v->set_friendly_name("update_indices_v");
-    tensor_map.insert({"update_indices_v", update_indices_v->output(0)});
-}
-
-float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
-#ifndef M_PI
-# define M_PI 3.14159265358979323846
-#endif
-    return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
-}
-
-void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
-                              float dims[2]) {
-    float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
-    float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
-    dims[0] = std::max(0.0f, start);
-    dims[1] = std::min(static_cast<float>(n_dims - 1), end);
+        update_indices_v, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}), false);
+    update_indices_v.get_node_shared_ptr()->set_friendly_name("update_indices_v");
+    tensor_map.insert({"update_indices_v", update_indices_v});
 }
 
 void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
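The V-cache path builds a [total_head_size, token_len, 2] grid of (row, col) pairs and flattens it to [-1, 2], i.e. one coordinate pair per element written into the [num_heads*head_size, seq] view of cache_v, with columns starting at past_token_len. A plain C++ sketch of the index set the Broadcast/Concat/Reshape subgraph produces; the helper is illustrative only:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Sketch: (row, col) pairs for the flattened V cache, in the same row-major
    // order as the [total_head_size, token_len, 2] tensor reshaped to [-1, 2].
    std::vector<std::pair<int64_t, int64_t>> v_update_indices(int64_t total_head_size,
                                                              int64_t past_token_len,
                                                              int64_t token_len) {
        std::vector<std::pair<int64_t, int64_t>> idx;
        idx.reserve(static_cast<size_t>(total_head_size * token_len));
        for (int64_t row = 0; row < total_head_size; ++row) {
            for (int64_t col = past_token_len; col < past_token_len + token_len; ++col) {
                idx.emplace_back(row, col);
            }
        }
        return idx;
    }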
@@ -356,10 +356,13 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor)
         std::cout << *(tensor.data<float>()) << std::endl;
         break;
     case ov::element::f16:
-        std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) << std::endl;
+        std::cout << *(tensor.data<ov::float16>()) << std::endl;
         break;
     case ov::element::i32:
-        std::cout << *(tensor.data<int32_t>()) << std::endl;
+        for (size_t i = 0; i < tensor.get_size(); ++i) {
+            std::cout << tensor.data<int32_t>()[i] << " ";
+        }
+        std::cout << std::endl;
         break;
     case ov::element::i64:
         std::cout << *(tensor.data<int64_t>()) << std::endl;
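The f16 change reads the element as ov::float16 (which converts to float for printing) instead of reinterpreting the raw uint16_t bits, and the i32 change prints every element rather than only the first. A small sketch of the f16 access pattern, assuming the tensor's element type is f16:

    #include <openvino/openvino.hpp>

    #include <iostream>

    // Sketch: print the first element of an f16 tensor via ov::float16,
    // which converts implicitly to float for streaming.
    void print_first_f16(const ov::Tensor& tensor) {
        std::cout << static_cast<float>(tensor.data<ov::float16>()[0]) << std::endl;
    }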
@@ -379,7 +382,7 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor,
         std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
         break;
     case ov::element::f16:
-        std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) << std::endl;
+        std::cout << *(tensor.data<ov::float16>()) << std::endl;
         std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
         break;
     default: