From febee580c89d2b2c0248095dd4702a19d404906f Mon Sep 17 00:00:00 2001 From: bssrdf Date: Sun, 16 Nov 2025 11:14:27 -0500 Subject: [PATCH] fixed anotehr bug in the special filter transpose NCHW2NHWC --- ggml/src/ggml-cuda/conv2d-implicit.cu | 79 +++++++++++++-------------- tests/test-conv2d.cpp | 23 ++++---- 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index 2bd8bcd281..5958c3f29e 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -82,7 +82,6 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co } } - //*** broken, has bugs *** template static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, const int ne00, const int ne01){ @@ -108,10 +107,9 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co const unsigned int row = (j * blk + tx) % rs; const unsigned int col = (j * blk + tx) / rs; const unsigned int src_index = imat*n + bx * blk_c * rs + j * blk + tx; - // const unsigned int src_index = imat*n + rs*ne00 + bx * blk_c + j * blk_c + tx; unsigned int idx = row * blk_c + col; idx = idx ^ ((idx & mask) >> 4); - if (src_index < ne) { + if (src_index < ne && tx < blk) { tile[idx] = src[src_index]; } } @@ -122,7 +120,7 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co if(dst_index < ne && tx < blk){ unsigned int idx = j*blk_c + tx; idx = idx ^ ((idx & mask) >> 4); - dst[dst_index] = ggml_cuda_cast(tile[idx]); + dst[dst_index] = ggml_cuda_cast(tile[idx]); } } } @@ -1340,46 +1338,47 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa // ggml_cuda_pool_alloc kernel_f16(ctx.pool(id), ne); ggml_cuda_pool_alloc kernel_f16(ctx.pool(id)); if (ne01 > 1){ - // kernel_f16.alloc(ne); - // dim3 dimGrid1((ne00 + CUDA_NCHW_2_NHWC_BLOCK_C - 1) / CUDA_NCHW_2_NHWC_BLOCK_C, - // (ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM, - // 1) ; - // if (ne01 == 25) { - // constexpr unsigned int mask = filter_swizzle_mask(25, CUDA_NCHW_2_NHWC_BLOCK_C); - // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } else if (ne01 == 16) { - // constexpr unsigned int mask = filter_swizzle_mask(16, CUDA_NCHW_2_NHWC_BLOCK_C); - // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } else if (ne01 == 9) { - // constexpr unsigned int mask = filter_swizzle_mask(9, CUDA_NCHW_2_NHWC_BLOCK_C); - // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } else if (ne01 == 8) { - // constexpr unsigned int mask = filter_swizzle_mask(8, CUDA_NCHW_2_NHWC_BLOCK_C); - // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } else if (ne01 == 7) { - // constexpr unsigned int mask = filter_swizzle_mask(7, CUDA_NCHW_2_NHWC_BLOCK_C); - // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } else if (ne01 == 6) { - // constexpr unsigned int mask = filter_swizzle_mask(6, CUDA_NCHW_2_NHWC_BLOCK_C); - // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } else if (ne01 == 5) { - // constexpr unsigned int mask = filter_swizzle_mask(5, CUDA_NCHW_2_NHWC_BLOCK_C); - // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } else if (ne01 == 4) { - // constexpr unsigned int mask = filter_swizzle_mask(4, CUDA_NCHW_2_NHWC_BLOCK_C); - // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } else if (ne01 == 3) { - // constexpr unsigned int mask = filter_swizzle_mask(3, CUDA_NCHW_2_NHWC_BLOCK_C); - // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } else if (ne01 == 2) { - // constexpr unsigned int mask = filter_swizzle_mask(2, CUDA_NCHW_2_NHWC_BLOCK_C); - // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } else { + kernel_f16.alloc(ne); + + dim3 dimGrid1((ne00 + CUDA_NCHW_2_NHWC_BLOCK_C - 1) / CUDA_NCHW_2_NHWC_BLOCK_C, + (ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM, + 1) ; + if (ne01 == 25) { + constexpr unsigned int mask = filter_swizzle_mask(25, CUDA_NCHW_2_NHWC_BLOCK_C); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + } else if (ne01 == 16) { + constexpr unsigned int mask = filter_swizzle_mask(16, CUDA_NCHW_2_NHWC_BLOCK_C); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + } else if (ne01 == 9) { + constexpr unsigned int mask = filter_swizzle_mask(9, CUDA_NCHW_2_NHWC_BLOCK_C); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + } else if (ne01 == 8) { + constexpr unsigned int mask = filter_swizzle_mask(8, CUDA_NCHW_2_NHWC_BLOCK_C); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + } else if (ne01 == 7) { + constexpr unsigned int mask = filter_swizzle_mask(7, CUDA_NCHW_2_NHWC_BLOCK_C); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + } else if (ne01 == 6) { + constexpr unsigned int mask = filter_swizzle_mask(6, CUDA_NCHW_2_NHWC_BLOCK_C); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + } else if (ne01 == 5) { + constexpr unsigned int mask = filter_swizzle_mask(5, CUDA_NCHW_2_NHWC_BLOCK_C); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + } else if (ne01 == 4) { + constexpr unsigned int mask = filter_swizzle_mask(4, CUDA_NCHW_2_NHWC_BLOCK_C); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + } else if (ne01 == 3) { + constexpr unsigned int mask = filter_swizzle_mask(3, CUDA_NCHW_2_NHWC_BLOCK_C); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + } else if (ne01 == 2) { + constexpr unsigned int mask = filter_swizzle_mask(2, CUDA_NCHW_2_NHWC_BLOCK_C); + NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); + } else { dim3 dimGrid2((ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM, (ne00 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM, (ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM) ; NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); - // } + } } const half *X_H = input_f16.get(); diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index 3005e17edc..11cf757bc4 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -319,7 +319,7 @@ static std::vector> configs = { // std::make_tuple(960,320,104,152,3,3), // std::make_tuple(1280,1280,26,38,3,3), // std::make_tuple(1920,640,32,32,3,3) - // std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), // std::make_tuple(32,12,141,133,3,3), // std::make_tuple(32,6,141,133,3,3), // std::make_tuple(32,12,141,121,3,3), @@ -329,7 +329,8 @@ static std::vector> configs = { // std::make_tuple(320,12,16,16,3,3), //working // std::make_tuple(256,12,16,16,3,3), //working // std::make_tuple(32,12,16,16,3,3), //not working - std::make_tuple(16,12,16,16,3,3), //not working + // std::make_tuple(16,12,16,16,3,3), //not working + // std::make_tuple(32,12,16,16,3,3), //not working // std::make_tuple(48,12,16,16,3,3), // not working // std::make_tuple(96,12,16,16,3,3), //not working // std::make_tuple(64,12,16,16,3,3), //working @@ -750,15 +751,15 @@ int main(void) // int i = 2048; // for(int i = 0; i < ggml_nelements(wino_res); i++) { // for(int i = 0; i < 26*38; i++) { - for(int i = 0; i < conv2d_data.size(); i++) { - float diff = fabs(im2col_data[i] - conv2d_data[i]); - // if(diff > 0.5) { - printf("(%7.3f, %7.3f, %.2f, %d) \n", - im2col_data[i], conv2d_data[i], - diff, i); - // break; - // } - } + // for(int i = 0; i < conv2d_data.size(); i++) { + // float diff = fabs(im2col_data[i] - conv2d_data[i]); + // // if(diff > 0.5) { + // printf("(%7.3f, %7.3f, %.2f, %d) \n", + // im2col_data[i], conv2d_data[i], + // diff, i); + // // break; + // // } + // } ggml_free(model.ctx); ggml_backend_buffer_free(model.buffer);