diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index be993d99c1..8361c422ad 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -84,7 +84,7 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co } template -static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, const int ne00, const int ne01){ +static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, const int ne00, const int ne01, param_t P){ const int64_t n = ne00 * ne01; @@ -99,8 +99,9 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co #pragma unroll for (unsigned int j = 0; j < rs; j++){ - const unsigned int row = (j * blk + tx) % rs; - const unsigned int col = (j * blk + tx) / rs; + const int i = j * blk + tx; + const unsigned int row = fastmodulo(i, P.RS_fastdiv); + const unsigned int col = fastdiv(i, P.RS_fastdiv); const unsigned int src_index = by*n + bx * blk_c * rs + j * blk + tx; unsigned int idx = row * blk_c + col; idx = idx ^ ((idx & mask) >> 4); @@ -1224,34 +1225,34 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa 1) ; if (ne01 == 25) { constexpr unsigned int mask = filter_swizzle_mask(25, CUDA_NCHW_2_NHWC_BLOCK_C); - NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01, P); } else if (ne01 == 16) { constexpr unsigned int mask = filter_swizzle_mask(16, CUDA_NCHW_2_NHWC_BLOCK_C); - NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01, P); } else if (ne01 == 9) { constexpr unsigned int mask = filter_swizzle_mask(9, CUDA_NCHW_2_NHWC_BLOCK_C); - NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01, P); } else if (ne01 == 8) { constexpr unsigned int mask = filter_swizzle_mask(8, CUDA_NCHW_2_NHWC_BLOCK_C); - NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01, P); } else if (ne01 == 7) { constexpr unsigned int mask = filter_swizzle_mask(7, CUDA_NCHW_2_NHWC_BLOCK_C); - NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01, P); } else if (ne01 == 6) { constexpr unsigned int mask = filter_swizzle_mask(6, CUDA_NCHW_2_NHWC_BLOCK_C); - NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01, P); } else if (ne01 == 5) { constexpr unsigned int mask = filter_swizzle_mask(5, CUDA_NCHW_2_NHWC_BLOCK_C); - NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01, P); } else if (ne01 == 4) { constexpr unsigned int mask = filter_swizzle_mask(4, CUDA_NCHW_2_NHWC_BLOCK_C); - NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01, P); } else if (ne01 == 3) { constexpr unsigned int mask = filter_swizzle_mask(3, CUDA_NCHW_2_NHWC_BLOCK_C); - NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01, P); } else if (ne01 == 2) { constexpr unsigned int mask = filter_swizzle_mask(2, CUDA_NCHW_2_NHWC_BLOCK_C); - NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01, P); } else { dim3 dimGrid2((ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM, (ne00 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,