diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp
index da0480df7f..30930be422 100644
--- a/ggml/src/ggml-qnn/backend-ops.cpp
+++ b/ggml/src/ggml-qnn/backend-ops.cpp
@@ -566,6 +566,10 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t
 }
 
 bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
+    constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t {
+        return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
+    };
+
     auto *src0 = op->src[0];
     auto *src1 = op->src[1];
     switch (ctx->device) {
@@ -578,6 +582,11 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm
                 QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d",
                               ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
                 return false;
+            } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >=
+                       (8192 * 2048 + 8192 * 512 + 2048 * 512)) {
+                QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d",
+                              ctx->support_op_count.load(), ++(ctx->unsupported_op_count));
+                return false;
             }
             // fall through, from test here, the convert op is super slow on NPU:
             //   https://github.com/usefulsensors/qc_npu_benchmark
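For readers skimming the patch: the new `get_tensor_size` lambda returns an element count (the product of the four `ne` dimensions), not a byte size, and the cap `8192 * 2048 + 8192 * 512 + 2048 * 512` (= 22,020,096 elements) corresponds to an 8192x2048 by 8192x512 matmul plus its 2048x512 output. The sketch below is a minimal standalone illustration of that guard, not the actual ggml-qnn code path; `toy_tensor`, `element_count`, and `exceeds_npu_matmul_cap` are hypothetical names introduced only for this example.

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for ggml_tensor: only the ne[] dimension array matters here.
struct toy_tensor {
    int64_t ne[4];  // dimensions, ggml-style: ne[0] is the innermost dimension
};

// Same computation as the lambda added in the diff: the product of the four
// dimensions, i.e. the number of elements (not bytes) in the tensor.
static int64_t element_count(const toy_tensor &t) {
    return t.ne[0] * t.ne[1] * t.ne[2] * t.ne[3];
}

// Mirrors the new `else if` branch: reject the matmul once the combined element
// count of src0, src1 and the output reaches the threshold used by the patch.
static bool exceeds_npu_matmul_cap(const toy_tensor &src0, const toy_tensor &src1, const toy_tensor &dst) {
    constexpr int64_t cap = 8192LL * 2048 + 8192LL * 512 + 2048LL * 512;  // 22,020,096 elements
    return element_count(src0) + element_count(src1) + element_count(dst) >= cap;
}

int main() {
    // ggml_mul_mat(src0[k, n], src1[k, m]) -> dst[n, m]
    toy_tensor small_src0{{4096, 2048, 1, 1}}, small_src1{{4096, 512, 1, 1}}, small_dst{{2048, 512, 1, 1}};
    toy_tensor large_src0{{8192, 2048, 1, 1}}, large_src1{{8192, 512, 1, 1}}, large_dst{{2048, 512, 1, 1}};

    std::printf("small matmul rejected: %d\n", exceeds_npu_matmul_cap(small_src0, small_src1, small_dst));  // 0
    std::printf("large matmul rejected: %d\n", exceeds_npu_matmul_cap(large_src0, large_src1, large_dst));  // 1
    return 0;
}
```

Note that because the check sums element counts rather than byte sizes, the cap hits F32 and quantized tensors at the same shapes; whether that is the intended behavior is up to the patch author.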