diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp
index ece1fc8a54..3d8fddc720 100644
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@@ -344,6 +344,12 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr
             } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) {
                 model_params.n_heads_kv = node->ne[1];
             }
+        } else if (node->op == GGML_OP_GET_ROWS && std::string(node->src[1]->name) == "inp_out_ids") {
+            // for the static case, output_len is always 1, except for llama-perplexity
+            compute_params.output_len = node->src[1]->ne[0];
+            if (is_static && compute_params.output_len == 0) {
+                compute_params.output_len = 1;
+            }
         }
     }
     model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
@@ -366,7 +372,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
         input_shape = ov::PartialShape{1, 1, 1, len};
 
     } else if (name == "inp_out_ids") {
-        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1};
+        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};
 
     } else if (name.find("KQ_mask") == 0) {
         if (m_is_static) {
diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp
index 92e8ce80b3..139bda1f8f 100644
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@@ -513,15 +513,16 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
     }
 
     if (param_name == "inp_out_ids") {
-        ov::Shape input_shape = {1, 1, 1, 1};
+        size_t output_len = ggml_decoder->get_compute_params().output_len;
+        ov::Shape input_shape = {1, 1, 1, output_len};
         ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
         if (ggml_tensor->ne[0] == 0) {
             *input_tensor.data<int32_t>() = 0;
-        } else if (ggml_tensor->ne[0] == 1) {
-            int32_t inp_out_id = *((int32_t *) ggml_tensor->data) % chunk_size;
-            *input_tensor.data<int32_t>() = inp_out_id;
         } else {
-            throw std::runtime_error("NPU does not support outputing logits for multiple tokens at once.");
+            auto * data_addr = input_tensor.data<int32_t>();
+            for (size_t i = 0; i < output_len; i++) {
+                data_addr[i] = ((int32_t *) ggml_tensor->data)[i] % chunk_size;
+            }
         }
         return input_tensor;
     }
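
For reviewers, here is a minimal standalone sketch of the remapping the utils.cpp hunk performs. It uses plain C++ with illustrative values (chunk_size = 4, requested ids {5, 6, 7}) instead of the real ggml/ov::Tensor types; the assumption, taken from the diff, is that the static prefill processes the prompt in fixed-size chunks, so each global inp_out_ids entry is reduced modulo chunk_size to a row index inside the current chunk, and output_len falls back to 1 when no ids are present.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // Illustrative values only; in the backend these come from the decoder
    // (compute_params.output_len) and the ggml "inp_out_ids" tensor.
    const int32_t chunk_size = 4;
    std::vector<int32_t> inp_out_ids = {5, 6, 7}; // global positions whose logits are requested

    // Mirrors compute_params.output_len: number of requested rows,
    // forced to at least 1 in the static case.
    size_t output_len = inp_out_ids.size();
    if (output_len == 0) {
        output_len = 1;
    }

    // Stands in for the ov::Tensor payload of shape {1, 1, 1, output_len}.
    std::vector<int32_t> tensor_data(output_len, 0);
    if (!inp_out_ids.empty()) {
        for (size_t i = 0; i < output_len; i++) {
            // In-chunk row index, as in get_ov_input_tensor_static_prefill.
            tensor_data[i] = inp_out_ids[i] % chunk_size;
        }
    }

    for (size_t i = 0; i < output_len; i++) {
        std::printf("row %zu: in-chunk id %d\n", i, tensor_data[i]);
    }
    return 0;
}

With these values the sketch prints in-chunk ids 1, 2, 3 for global positions 5, 6, 7. That is the llama-perplexity case the patch enables: logits are requested for every prompt position, not just the final token, so the old single-token path (and its runtime_error for multiple tokens) no longer suffices.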