diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index 1ffce7a9d7..8b9c83ba1a 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -14,7 +14,7 @@ constexpr uint WARPSIZE = 32;
 #define CUDA_NCHW_2_NHWC_TILE_DIM 32
 #define CUDA_NCHW_2_NHWC_BLOCK_NM 8
 #define CUDA_NCHW_2_NHWC_BLOCK_ROWS 8
-#define CUDA_NCHW_2_NHWC_BLOCK_C 64
+#define CUDA_NCHW_2_NHWC_BLOCK_C 32
 
 
 //currently not use; in future for split-k kernels
@@ -58,12 +58,13 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co
     int ty = blockIdx.x * CUDA_NCHW_2_NHWC_TILE_DIM + threadIdx.y;
 
     __shared__ src_T tile[CUDA_NCHW_2_NHWC_TILE_DIM][CUDA_NCHW_2_NHWC_TILE_DIM];
-
+#pragma unroll
     for(int i = 0; i < CUDA_NCHW_2_NHWC_BLOCK_NM; ++i){
 
         const unsigned int imat = blockIdx.z * CUDA_NCHW_2_NHWC_BLOCK_NM + i;
         if(imat >= nmat)
             break;
+#pragma unroll
         for (int j = 0; j < CUDA_NCHW_2_NHWC_TILE_DIM; j += CUDA_NCHW_2_NHWC_BLOCK_ROWS){
             if(x < ne01 && y + j < ne00){
                 const int row = threadIdx.y+j;
@@ -72,7 +73,7 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co
             }
         }
         __syncthreads();
-
+#pragma unroll
         for (int j = 0; j < CUDA_NCHW_2_NHWC_TILE_DIM; j += CUDA_NCHW_2_NHWC_BLOCK_ROWS){
             if(ty + j < ne01 && tx < ne00){
                 const int col = (threadIdx.y+j) ^ threadIdx.x;