From 496c3599c6ee9bbc377d7fb8d25cc8fe9bbdc330 Mon Sep 17 00:00:00 2001 From: bssrdf Date: Sun, 9 Nov 2025 09:23:14 -0500 Subject: [PATCH] add loop unrolling --- ggml/src/ggml-cuda/conv2d-implicit.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index 00d96656ba..99fa1925d5 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -90,11 +90,13 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co __shared__ src_T tile[rs*blk_c]; +#pragma unroll for(int i = 0; i < CUDA_NCHW_2_NHWC_BLOCK_NM; ++i){ const unsigned int imat = by * CUDA_NCHW_2_NHWC_BLOCK_NM + i; if(imat >= nmat) break; +#pragma unroll for (unsigned int j = 0; j < rs; j++){ const unsigned int row = (j * blk_c + tx) % rs; const unsigned int col = (j * blk_c + tx) / rs; @@ -106,7 +108,7 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co } } __syncthreads(); - +#pragma unroll for (unsigned int j = 0; j < rs; j++){ const unsigned int dst_index = imat*n + j*ne00 + bx*blk_c + tx; if(dst_index < ne){