NPU support llama-perplexity -b 512 --no-warmup

2025-12-03 17:10:07 +08:00 · 2025-12-03 17:10:07 +08:00 · 808619e274
parent 65348b5d20
commit 808619e274
2 changed files with 13 additions and 6 deletions
--- a/ggml/src/ggml-openvino/ggml-decoder.cpp
+++ b/ggml/src/ggml-openvino/ggml-decoder.cpp
@ -344,6 +344,12 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
            } else if (name.find("Kcur-0") == 0 || std::string(node->src[0]->name).find("Kcur-0") == 0) {
                model_params.n_heads_kv = node->ne[1];
            }
+        } else if (node->op == GGML_OP_GET_ROWS && std::string(node->src[1]->name) == "inp_out_ids") {
+            // for static case, output_len is always 1 except for llama-perplexity
+            compute_params.output_len = node->src[1]->ne[0];
+            if (is_static && compute_params.output_len == 0) {
+                compute_params.output_len = 1;
+            }
        }
    }
    model_params.ctx = model_params.ctx_per_seq * model_params.n_seq;
@ -366,7 +372,7 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co
        input_shape = ov::PartialShape{1, 1, 1, len};

    } else if (name == "inp_out_ids") {
-        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? 1 : -1};
+        input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};

    } else if (name.find("KQ_mask") == 0) {
        if (m_is_static) {
--- a/ggml/src/ggml-openvino/utils.cpp
+++ b/ggml/src/ggml-openvino/utils.cpp
@ -513,15 +513,16 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr<GgmlOvDecoder> ggm
    }

    if (param_name == "inp_out_ids") {
-        ov::Shape input_shape = {1, 1, 1, 1};
+        size_t output_len = ggml_decoder->get_compute_params().output_len;
+        ov::Shape input_shape = {1, 1, 1, output_len};
        ov::Tensor input_tensor(ggml_decoder->get_input_type(param_name), input_shape);
        if (ggml_tensor->ne[0] == 0) {
            *input_tensor.data<int32_t>() = 0;
-        } else if (ggml_tensor->ne[0] == 1) {
-            int32_t inp_out_id = *((int32_t *) ggml_tensor->data) % chunk_size;
-            *input_tensor.data<int32_t>() = inp_out_id;
        } else {
-            throw std::runtime_error("NPU does not support outputing logits for multiple tokens at once.");
+            auto * data_addr = input_tensor.data<int32_t>();
+            for (size_t i = 0; i < output_len; i++) {
+                data_addr[i] = ((int32_t *) ggml_tensor->data)[i] % chunk_size;
+            }
        }
        return input_tensor;
    }