Fix llama-perplexity

Author: Yu, Zijun
Committed by: Mustafa Cavus
Date: 2025-07-24 17:44:32 +08:00
parent 75eec6265f
commit 4e7f04a307
3 changed files with 70 additions and 57 deletions

View File

@@ -236,8 +236,9 @@ void GgmlOvDecoder::set_llm_params() {
}
ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) const {
auto name = std::string(src->name);
ov::PartialShape input_shape;
if (std::string(src->name) == "inp_tokens" || std::string(src->name) == "inp_pos") {
if (name == "inp_tokens" || name == "inp_pos") {
if (m_is_static) {
if (m_is_first_token) {
input_shape = ov::PartialShape{1, 1, m_context_size};
@@ -247,7 +248,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
} else {
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
}
} else if (std::string(src->name) == "KQ_mask") {
} else if (name == "inp_out_ids" && !m_is_static) {
input_shape = ov::PartialShape{1, 1, ov::Dimension(1, m_context_size)};
} else if (name == "KQ_mask") {
if (m_is_static) {
if (m_is_first_token) {
input_shape = ov::PartialShape{1, m_context_size, m_context_size};
@@ -258,9 +261,9 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
auto max_mask_size = GGML_PAD(m_context_size, GGML_KQ_MASK_PAD);
input_shape = ov::PartialShape{1, ov::Dimension(1, max_mask_size), ov::Dimension(1, max_mask_size)};
}
} else if (std::string(src->name).find("cache_k") == 0) {
} else if (name.find("cache_k") == 0) {
input_shape = ov::PartialShape{m_context_size, m_num_heads_kv, m_head_size};
} else if (std::string(src->name).find("cache_v") == 0) {
} else if (name.find("cache_v") == 0) {
input_shape = ov::PartialShape{m_num_heads_kv, m_head_size, m_context_size};
} else if (src->op == GGML_OP_VIEW) {
// This case is added to make test-backend-ops work
@@ -273,18 +276,22 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor* src) co
void GgmlOvDecoder::add_extra_inputs() {
// Extra inputs:
// 1. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
// see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
// Not used for NPU
// 1. `past_token_len`, used to create indices for updating kv cache. Usually equal to inp_pos[0], except for
// llama-perplexity.
// 2. `attention_size`, used in matmul's in the attention block. The shape of those matmul's are 32 aligned,
// see llama_kv_cache_unified::get_n_kv and llama_kv_cache_unified::get_padding.
// Not used for NPU
int64_t past_token_len = -1;
int64_t attention_size = -1;
int64_t past_token_len = -1;
int64_t token_len = -1;
int64_t past_token_len_from_inp_pos = -1;
for (const auto& node : m_nodes) {
if (node->op == GGML_OP_ROPE && std::string(node->src[1]->name) == "inp_pos") {
if (node->src[1]->type != GGML_TYPE_I32) {
throw std::runtime_error("Expected cgraph input `inp_pos` to be of type GGML_TYPE_I32");
}
token_len = node->src[1]->ne[0];
past_token_len_from_inp_pos = ((int32_t*) (node->src[1]->data))[0];
}
if (node->op == GGML_OP_CPY && ggml_is_contiguous(node)) {
@@ -294,29 +301,39 @@ void GgmlOvDecoder::add_extra_inputs() {
break;
}
}
if (past_token_len == -1) {
throw std::runtime_error("Failed to find input \"cache_k\" in the graph");
}
if (past_token_len != past_token_len_from_inp_pos) {
throw std::runtime_error("Mismatch between past_token_len from cache_k and inp_pos: " +
std::to_string(past_token_len) + " vs " + std::to_string(past_token_len_from_inp_pos));
GGML_LOG_DEBUG("Mismatch between past_token_len from cache_k and inp_pos: %ld vs %ld\n",
past_token_len,
past_token_len_from_inp_pos);
}
for (const auto& node : m_nodes) {
if (node->src[1] && std::string(node->src[1]->name).find("inp_tokens") == 0) {
int64_t total_token_len = node->src[1]->ne[0] + past_token_len;
attention_size = GGML_PAD(total_token_len, 32);
std::string name = "attention_size";
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
param_node->set_friendly_name(name);
param_node->output(0).get_tensor().set_names({name});
m_model_extra_inputs[name] = param_node;
{
std::string name = "past_token_len";
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
param_node->set_friendly_name(name);
param_node->output(0).get_tensor().set_names({name});
m_model_extra_inputs[name] = param_node;
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
*tensor->data<int64_t>() = attention_size;
m_model_extra_input_values[name] = tensor;
break;
}
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
*tensor->data<int64_t>() = past_token_len;
m_model_extra_input_values[name] = tensor;
}
{
int64_t total_token_len = token_len + past_token_len;
attention_size = GGML_PAD(total_token_len, 32);
std::string name = "attention_size";
auto param_node = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::Shape{1});
param_node->set_friendly_name(name);
param_node->output(0).get_tensor().set_names({name});
m_model_extra_inputs[name] = param_node;
auto tensor = std::make_shared<ov::Tensor>(ov::element::i64, ov::Shape{1});
*tensor->data<int64_t>() = attention_size;
m_model_extra_input_values[name] = tensor;
}
}
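Note on the two extra inputs above: past_token_len is fed to the graph as a 1-element i64 tensor, and attention_size is the 32-aligned total sequence length. A minimal standalone sketch of that arithmetic (not part of the commit; pad_to here mirrors ggml's GGML_PAD round-up macro, and the token counts are made up):

    #include <cstdint>
    #include <cstdio>

    // Round x up to the next multiple of n (n a power of two), like ggml's GGML_PAD.
    static int64_t pad_to(int64_t x, int64_t n) { return (x + n - 1) & ~(n - 1); }

    int main() {
        int64_t past_token_len = 37;  // tokens already written to cache_k/cache_v (illustrative)
        int64_t token_len      = 5;   // tokens in the current batch, from inp_pos->ne[0] (illustrative)

        // `past_token_len` is passed through as-is; `attention_size` is the padded total.
        int64_t attention_size = pad_to(past_token_len + token_len, 32);  // 42 -> 64

        std::printf("past_token_len=%lld attention_size=%lld\n",
                    (long long) past_token_len, (long long) attention_size);
        return 0;
    }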

View File

@@ -5,6 +5,7 @@
#include <map>
#include <memory>
#include <openvino/core/node.hpp>
#include <openvino/op/add.hpp>
#include <openvino/op/broadcast.hpp>
#include <openvino/op/concat.hpp>
#include <openvino/op/convert.hpp>
@@ -78,11 +79,11 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
// cache_k layout: [S, N, H] (seq, num_heads, head_size)
// cache_v layout: [N, H, S] (num_heads, head_size, seq)
// When writing to cache_v, cache should be reshaped to [N*H, S] and v-curr should be flattened
auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr();
auto past_token_len = tensor_map.at("past_token_len").get_node_shared_ptr();
auto token_len = tensor_map.at("token_len").get_node_shared_ptr();
std::shared_ptr<ov::Node> update_indices_k;
std::shared_ptr<ov::Node> update_indices_v;
Output<Node> update_indices_k;
Output<Node> update_indices_v;
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto zero_scalar = ov::op::v0::Constant::create(ov::element::i64, {}, {0});
@@ -90,11 +91,19 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
auto one_scalar = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1});
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
update_indices_k =
std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices_k, one);
update_indices_k->set_friendly_name("update_indices_k");
tensor_map.insert({"update_indices_k", update_indices_k->output(0)});
auto past_token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(past_token_len, zero);
auto token_len_scalar = std::make_shared<ov::op::v0::Squeeze>(token_len, zero);
auto total_token_len_scalar = std::make_shared<ov::op::v1::Add>(past_token_len_scalar, token_len_scalar);
Output<Node> update_indices = std::make_shared<ov::op::v4::Range>(
past_token_len_scalar, total_token_len_scalar, one_scalar, ov::element::i64);
if (ggml_model_decoder.is_static()) {
update_indices = past_token_len;
}
update_indices_k = std::make_shared<ov::op::v0::Unsqueeze>(update_indices, one);
update_indices_k.get_node_shared_ptr()->set_friendly_name("update_indices_k");
tensor_map.insert({"update_indices_k", update_indices_k});
auto total_head_size = ggml_model_decoder.get_num_heads_kv() * ggml_model_decoder.get_head_size();
auto total_head_size_node = ov::op::v0::Constant::create(ov::element::i64, {1}, {total_head_size});
@@ -102,7 +111,7 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
// 1D tensor of shape [total_head_size], values starting from 0
auto range_row =
std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i32);
std::make_shared<ov::op::v4::Range>(zero_scalar, total_head_size_scalar, one_scalar, ov::element::i64);
auto range_row_reshaped =
std::make_shared<ov::op::v0::Unsqueeze>(range_row, ov::op::v0::Constant::create(ov::element::i64, {2}, {1, 2}));
auto row_indices = std::make_shared<ov::op::v3::Broadcast>(
@@ -110,8 +119,7 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
// 1D tensor of shape [token_len], values starting from past_token_len
auto range_col =
std::make_shared<ov::op::v0::Squeeze>(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 1}));
auto range_col = update_indices;
auto range_col_reshaped =
std::make_shared<ov::op::v0::Unsqueeze>(range_col, ov::op::v0::Constant::create(ov::element::i64, {2}, {0, 2}));
auto col_indices = std::make_shared<ov::op::v3::Broadcast>(
@@ -119,26 +127,11 @@ void add_kv_update_indices(TensorMap& tensor_map, GgmlDecoder& ggml_model_decode
std::make_shared<ov::op::v0::Concat>(ov::OutputVector{total_head_size_node, token_len, one}, 0));
// Stack row_indices and col_indices along last axis: [total_head_size, token_len, 2]
auto indices = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
update_indices_v = std::make_shared<ov::op::v0::Concat>(OutputVector{row_indices, col_indices}, 2);
update_indices_v = std::make_shared<ov::op::v1::Reshape>(
indices, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}), false);
update_indices_v->set_friendly_name("update_indices_v");
tensor_map.insert({"update_indices_v", update_indices_v->output(0)});
}
float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
#ifndef M_PI
# define M_PI 3.14159265358979323846
#endif
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float) M_PI)) / (2 * logf(base));
}
void ggml_rope_yarn_corr_dims(int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow,
float dims[2]) {
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
dims[0] = std::max(0.0f, start);
dims[1] = std::min(static_cast<float>(n_dims - 1), end);
update_indices_v, ov::op::v0::Constant::create(ov::element::i64, {2}, std::vector<int64_t>{-1, 2}), false);
update_indices_v.get_node_shared_ptr()->set_friendly_name("update_indices_v");
tensor_map.insert({"update_indices_v", update_indices_v});
}
void add_rope_sin_cos(TensorMap& tensor_map, GgmlDecoder& ggml_model_decoder) {
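For intuition about the Range-based indices introduced above, here is a small standalone sketch (plain C++ rather than the OpenVINO graph, sizes made up; the flattening order is my reading of the Broadcast/Concat/Reshape sequence): in the dynamic-shape path, update_indices_k covers [past_token_len, past_token_len + token_len) along the sequence axis, and update_indices_v is every (row, col) pair over the [num_heads_kv * head_size, seq] view of cache_v, reshaped to [-1, 2]:

    #include <array>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // Illustrative sizes: 2 KV heads of head_size 3, 2 new tokens after 4 cached ones.
        int64_t total_head_size = 2 * 3;  // num_heads_kv * head_size
        int64_t past_token_len  = 4;
        int64_t token_len       = 2;

        // update_indices_k: Range(past_token_len, past_token_len + token_len) -> {4, 5}
        std::vector<int64_t> update_indices_k;
        for (int64_t s = past_token_len; s < past_token_len + token_len; ++s) {
            update_indices_k.push_back(s);
        }

        // update_indices_v: (row, col) pairs over the [N*H, S] view of cache_v,
        // flattened to shape [-1, 2] as the final Reshape does.
        std::vector<std::array<int64_t, 2>> update_indices_v;
        for (int64_t row = 0; row < total_head_size; ++row) {
            for (int64_t col = past_token_len; col < past_token_len + token_len; ++col) {
                update_indices_v.push_back(std::array<int64_t, 2>{row, col});
            }
        }

        for (int64_t s : update_indices_k) std::printf("%lld ", (long long) s);
        std::printf("\n");
        for (const auto& rc : update_indices_v) {
            std::printf("(%lld,%lld) ", (long long) rc[0], (long long) rc[1]);
        }
        std::printf("\n");
        return 0;
    }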

View File

@@ -356,10 +356,13 @@ void print_input_tensor_info(const std::string& name, const ov::Tensor& tensor)
std::cout << *(tensor.data<float>()) << std::endl;
break;
case ov::element::f16:
std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) << std::endl;
std::cout << *(tensor.data<ov::float16>()) << std::endl;
break;
case ov::element::i32:
std::cout << *(tensor.data<int32_t>()) << std::endl;
for (size_t i = 0; i < tensor.get_size(); ++i) {
std::cout << tensor.data<int32_t>()[i] << " ";
}
std::cout << std::endl;
break;
case ov::element::i64:
std::cout << *(tensor.data<int64_t>()) << std::endl;
@@ -379,7 +382,7 @@ void print_output_tensor_info(const std::string& name, const ov::Tensor& tensor,
std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
break;
case ov::element::f16:
std::cout << ov::float16::from_bits(*(tensor.data<uint16_t>())) << std::endl;
std::cout << *(tensor.data<ov::float16>()) << std::endl;
std::cout << checksum(tensor.data(), tensor.get_byte_size()) << std::endl;
break;
default: