add loop unrolling

This commit is contained in:
bssrdf 2025-11-09 09:23:14 -05:00
parent 5ed2c1b787
commit 496c3599c6
1 changed file with 3 additions and 1 deletion

View File

@@ -90,11 +90,13 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co
__shared__ src_T tile[rs*blk_c];
#pragma unroll
for(int i = 0; i < CUDA_NCHW_2_NHWC_BLOCK_NM; ++i){
const unsigned int imat = by * CUDA_NCHW_2_NHWC_BLOCK_NM + i;
if(imat >= nmat)
break;
#pragma unroll
for (unsigned int j = 0; j < rs; j++){
const unsigned int row = (j * blk_c + tx) % rs;
const unsigned int col = (j * blk_c + tx) / rs;
@@ -106,7 +108,7 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co
}
}
__syncthreads();
#pragma unroll
for (unsigned int j = 0; j < rs; j++){
const unsigned int dst_index = imat*n + j*ne00 + bx*blk_c + tx;
if(dst_index < ne){