diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp
index da0480df7f..30930be422 100644
--- a/ggml/src/ggml-qnn/backend-ops.cpp
+++ b/ggml/src/ggml-qnn/backend-ops.cpp
@@ -566,6 +566,10 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
 }
 
 bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
+    constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t {
+        return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
+    };
+
     auto *src0 = op->src[0];
     auto *src1 = op->src[1];
     switch (ctx->device) {
@@ -578,6 +582,11 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
                 QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
                               ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
                 return false;
+            } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >=
+                       (8192 * 2048 + 8192 * 512 + 2048 * 512)) {
+                QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d",
+                              ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
+                return false;
             }
             // fall through, from test here, the convert op is super slow on NPU:
             //   https://github.com/usefulsensors/qc_npu_benchmark
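For readers skimming the patch: the new `get_tensor_size` lambda returns an element count (the product of the four `ne` dimensions), not a byte size, and the cap `8192 * 2048 + 8192 * 512 + 2048 * 512` (= 22,020,096 elements) corresponds to an 8192x2048 by 8192x512 matmul plus its 2048x512 output. The sketch below is a minimal standalone illustration of that guard, not the actual ggml-qnn code path; `toy_tensor`, `element_count`, and `exceeds_npu_matmul_cap` are hypothetical names introduced only for this example.

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for ggml_tensor: only the ne[] dimension array matters here.
struct toy_tensor {
    int64_t ne[4];  // dimensions, ggml-style: ne[0] is the innermost dimension
};

// Same computation as the lambda added in the diff: the product of the four
// dimensions, i.e. the number of elements (not bytes) in the tensor.
static int64_t element_count(const toy_tensor &t) {
    return t.ne[0] * t.ne[1] * t.ne[2] * t.ne[3];
}

// Mirrors the new `else if` branch: reject the matmul once the combined element
// count of src0, src1 and the output reaches the threshold used by the patch.
static bool exceeds_npu_matmul_cap(const toy_tensor &src0, const toy_tensor &src1, const toy_tensor &dst) {
    constexpr int64_t cap = 8192LL * 2048 + 8192LL * 512 + 2048LL * 512;  // 22,020,096 elements
    return element_count(src0) + element_count(src1) + element_count(dst) >= cap;
}

int main() {
    // ggml_mul_mat(src0[k, n], src1[k, m]) -> dst[n, m]
    toy_tensor small_src0{{4096, 2048, 1, 1}}, small_src1{{4096, 512, 1, 1}}, small_dst{{2048, 512, 1, 1}};
    toy_tensor large_src0{{8192, 2048, 1, 1}}, large_src1{{8192, 512, 1, 1}}, large_dst{{2048, 512, 1, 1}};

    std::printf("small matmul rejected: %d\n", exceeds_npu_matmul_cap(small_src0, small_src1, small_dst));  // 0
    std::printf("large matmul rejected: %d\n", exceeds_npu_matmul_cap(large_src0, large_src1, large_dst));  // 1
    return 0;
}
```

Note that because the check sums element counts rather than byte sizes, the cap hits F32 and quantized tensors at the same shapes; whether that is the intended behavior is up to the patch author.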