diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index 00d96656ba..99fa1925d5 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -90,11 +90,13 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co __shared__ src_T tile[rs*blk_c]; +#pragma unroll for(int i = 0; i < CUDA_NCHW_2_NHWC_BLOCK_NM; ++i){ const unsigned int imat = by * CUDA_NCHW_2_NHWC_BLOCK_NM + i; if(imat >= nmat) break; +#pragma unroll for (unsigned int j = 0; j < rs; j++){ const unsigned int row = (j * blk_c + tx) % rs; const unsigned int col = (j * blk_c + tx) / rs; @@ -106,7 +108,7 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co } } __syncthreads(); - +#pragma unroll for (unsigned int j = 0; j < rs; j++){ const unsigned int dst_index = imat*n + j*ne00 + bx*blk_c + tx; if(dst_index < ne){