NPU support llama-perplexity -b 512 --no-warmup
This commit is contained in:
parent
65348b5d20
commit
808619e274
|
|
@ -344,6 +344,12 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
|
|||
} else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) {
|
||||
model_params.n_heads_kv = node->ne[1];
|
||||
}
|
||||
} else if (node->op == GGML_OP_GET_ROWS && std::string(node->src[1]->name) == "inp_out_ids") {
|
||||
// for static case, output_len is always 1 except for llama-perplexity
|
||||
compute_params.output_len = node->src[1]->ne[0];
|
||||
if (is_static && compute_params.output_len == 0) {
|
||||
compute_params.output_len = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
|
||||
|
|
@ -366,7 +372,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
|
|||
input_shape = ov::PartialShape{1, 1, 1, len};
|
||||
|
||||
} else if (name == "inp_out_ids") {
|
||||
input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1};
|
||||
input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};
|
||||
|
||||
} else if (name.find("KQ_mask") == 0) {
|
||||
if (m_is_static) {
|
||||
|
|
|
|||
|
|
@ -513,15 +513,16 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
|
|||
}
|
||||
|
||||
if (param_name == "inp_out_ids") {
|
||||
ov::Shape input_shape = {1, 1, 1, 1};
|
||||
size_t output_len = ggml_decoder->get_compute_params().output_len;
|
||||
ov::Shape input_shape = {1, 1, 1, output_len};
|
||||
ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
|
||||
if (ggml_tensor->ne[0] == 0) {
|
||||
*input_tensor.data<int32_t>() = 0;
|
||||
} else if (ggml_tensor->ne[0] == 1) {
|
||||
int32_t inp_out_id = *((int32_t *) ggml_tensor->data) % chunk_size;
|
||||
*input_tensor.data<int32_t>() = inp_out_id;
|
||||
} else {
|
||||
throw std::runtime_error("NPU does not support outputing logits for multiple tokens at once.");
|
||||
auto * data_addr = input_tensor.data<int32_t>();
|
||||
for (size_t i = 0; i < output_len; i++) {
|
||||
data_addr[i] = ((int32_t *) ggml_tensor->data)[i] % chunk_size;
|
||||
}
|
||||
}
|
||||
return input_tensor;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue