NPU: support llama-perplexity -b 512 --no-warmup

Author: Yu, Zijun (2025-12-03 17:10:07 +08:00), committed by Mustafa Cavus
parent 65348b5d20
commit 808619e274
2 changed files with 13 additions and 6 deletions
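
Context for the diff below: llama-perplexity requests logits for many positions per evaluated batch, so the inp_out_ids input carries one index per requested output row rather than a single index. A minimal sketch of that difference (a hypothetical helper, not code from this commit):

    #include <cstdint>
    #include <vector>

    // Hypothetical illustration: for ordinary decoding only the last token's
    // logits are needed (one id); llama-perplexity asks for every position in
    // the batch, so inp_out_ids has ne[0] == n_tokens instead of 1.
    std::vector<int32_t> make_inp_out_ids(int32_t n_tokens, bool all_logits) {
        std::vector<int32_t> ids;
        if (all_logits) {
            for (int32_t i = 0; i < n_tokens; i++) {
                ids.push_back(i);
            }
        } else {
            ids.push_back(n_tokens - 1);
        }
        return ids;
    }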

File 1 of 2:

@@ -344,6 +344,12 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
             } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) {
                 model_params.n_heads_kv = node->ne[1];
             }
+        } else if (node->op == GGML_OP_GET_ROWS && std::string(node->src[1]->name) == "inp_out_ids") {
+            // for the static case, output_len is always 1, except for llama-perplexity
+            compute_params.output_len = node->src[1]->ne[0];
+            if (is_static && compute_params.output_len == 0) {
+                compute_params.output_len = 1;
+            }
         }
     }
     model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
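
The 0 -> 1 fallback above matters because a static (NPU) graph needs a fixed, non-zero dimension even when the current batch requests no logits at all (ne[0] == 0). In isolation, the decision reduces to something like this sketch (function name is hypothetical):

    #include <cstdint>

    // Hypothetical helper mirroring the hunk above: the static graph's output
    // length comes from inp_out_ids' first dimension, clamped up to 1 so the
    // compiled shape is never zero-sized.
    int64_t resolve_output_len(int64_t inp_out_ids_ne0, bool is_static) {
        int64_t output_len = inp_out_ids_ne0;
        if (is_static && output_len == 0) {
            output_len = 1;
        }
        return output_len;
    }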
@@ -366,7 +372,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
         input_shape = ov::PartialShape{1, 1, 1, len};
     } else if (name == "inp_out_ids") {
-        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1};
+        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};
     } else if (name.find("KQ_mask") == 0) {
         if (m_is_static) {

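For reference, the change above follows OpenVINO's ov::PartialShape convention in which -1 denotes a dynamic dimension: the static NPU path now bakes the real output length into the graph input instead of hard-coding 1. A standalone sketch (a free function for illustration only):

    #include <openvino/core/partial_shape.hpp>

    // Sketch: static graphs get a concrete last dimension (output_len),
    // dynamic graphs keep it unbounded (-1).
    ov::PartialShape inp_out_ids_shape(bool is_static, int64_t output_len) {
        return is_static ? ov::PartialShape{1, 1, 1, output_len}
                         : ov::PartialShape{1, 1, 1, -1};
    }
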
File 2 of 2:

@@ -513,15 +513,16 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
     }
     if (param_name == "inp_out_ids") {
-        ov::Shape input_shape = {1, 1, 1, 1};
+        size_t output_len = ggml_decoder->get_compute_params().output_len;
+        ov::Shape input_shape = {1, 1, 1, output_len};
         ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
         if (ggml_tensor->ne[0] == 0) {
             *input_tensor.data<int32_t>() = 0;
-        } else if (ggml_tensor->ne[0] == 1) {
-            int32_t inp_out_id = *((int32_t *) ggml_tensor->data) % chunk_size;
-            *input_tensor.data<int32_t>() = inp_out_id;
         } else {
-            throw std::runtime_error("NPU does not support outputing logits for multiple tokens at once.");
+            auto * data_addr = input_tensor.data<int32_t>();
+            for (size_t i = 0; i < output_len; i++) {
+                data_addr[i] = ((int32_t *) ggml_tensor->data)[i] % chunk_size;
+            }
         }
         return input_tensor;
     }
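
The % chunk_size in the new loop rebases absolute output positions onto the current prefill chunk, generalizing what the removed single-token path already did. A tiny standalone illustration (the chunk size and positions are made-up values):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t chunk_size = 256;                 // hypothetical prefill chunk size
        const int32_t abs_pos[] = {254, 255, 256, 257}; // absolute token positions
        for (int32_t p : abs_pos) {
            // Index of the same token within its prefill chunk
            std::printf("absolute %d -> chunk-local %d\n", p, p % chunk_size);
        }
        return 0;
    }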