From 721fa41076e9a387d4ccd77387b6ea63cca7c228 Mon Sep 17 00:00:00 2001 From: bssrdf Date: Sat, 15 Nov 2025 23:59:38 -0500 Subject: [PATCH] restore split-k for small inputs --- ggml/src/ggml-cuda/conv2d-implicit.cu | 81 +++++++++++++-------------- tests/test-conv2d.cpp | 4 +- 2 files changed, 42 insertions(+), 43 deletions(-) diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index e923d77e0b..15cb613dc6 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -83,47 +83,48 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co } //*** broken, has bugs *** -// template -// static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, const int ne00, const int ne01){ +template +static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, const int ne00, const int ne01){ -// const int64_t nmat = ne / (ne00 * ne01); -// const int64_t n = ne00 * ne01; + const int64_t nmat = ne / (ne00 * ne01); + const int64_t n = ne00 * ne01; -// const unsigned int tx = threadIdx.x; -// const unsigned int bx = blockIdx.x; -// const unsigned int by = blockIdx.y; + const unsigned int tx = threadIdx.x; + const unsigned int bx = blockIdx.x; + const unsigned int by = blockIdx.y; -// __shared__ src_T tile[rs*blk_c]; + __shared__ src_T tile[rs*blk_c]; -// #pragma unroll -// for(int i = 0; i < CUDA_NCHW_2_NHWC_BLOCK_NM; ++i){ +#pragma unroll + for(int i = 0; i < CUDA_NCHW_2_NHWC_BLOCK_NM; ++i){ -// const unsigned int imat = by * CUDA_NCHW_2_NHWC_BLOCK_NM + i; -// if(imat >= nmat) -// break; -// #pragma unroll -// for (unsigned int j = 0; j < rs; j++){ -// const unsigned int row = (j * blk_c + tx) % rs; -// const unsigned int col = (j * blk_c + tx) / rs; -// const unsigned int src_index = imat*n + bx * blk_c * rs + j * blk_c + tx; -// unsigned int idx = row * blk_c + col; -// idx = idx ^ ((idx & mask) >> 4); -// if (src_index < ne) { -// tile[idx] = src[src_index]; -// } -// } -// __syncthreads(); -// #pragma unroll -// for (unsigned int j = 0; j < rs; j++){ -// const unsigned int dst_index = imat*n + j*ne00 + bx*blk_c + tx; -// if(dst_index < ne){ -// unsigned int idx = j*blk_c + tx; -// idx = idx ^ ((idx & mask) >> 4); -// dst[dst_index] = ggml_cuda_cast(tile[idx]); -// } -// } -// } -// } + const unsigned int imat = by * CUDA_NCHW_2_NHWC_BLOCK_NM + i; + if(imat >= nmat) + break; +#pragma unroll + for (unsigned int j = 0; j < rs; j++){ + const unsigned int row = (j * blk_c + tx) % rs; + const unsigned int col = (j * blk_c + tx) / rs; + const unsigned int src_index = imat*n + bx * blk_c * rs + j * blk_c + tx; + // const unsigned int src_index = imat*n + rs*ne00 + bx * blk_c + j * blk_c + tx; + unsigned int idx = row * blk_c + col; + // idx = idx ^ ((idx & mask) >> 4); + if (src_index < ne) { + tile[idx] = src[src_index]; + } + } + __syncthreads(); +#pragma unroll + for (unsigned int j = 0; j < rs; j++){ + const unsigned int dst_index = imat*n + j*ne00 + bx*blk_c + tx; + if(dst_index < ne){ + unsigned int idx = j*blk_c + tx; + // idx = idx ^ ((idx & mask) >> 4); + dst[dst_index] = ggml_cuda_cast(tile[idx]); + } + } + } +} @@ -1338,9 +1339,9 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa ggml_cuda_pool_alloc kernel_f16(ctx.pool(id)); if (ne01 > 1){ kernel_f16.alloc(ne); - dim3 dimGrid1((ne00 + CUDA_NCHW_2_NHWC_BLOCK_C - 1) / CUDA_NCHW_2_NHWC_BLOCK_C, - (ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM, - 1) ; + // dim3 dimGrid1((ne00 + CUDA_NCHW_2_NHWC_BLOCK_C - 1) / CUDA_NCHW_2_NHWC_BLOCK_C, + // (ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM, + // 1) ; // if (ne01 == 25) { // constexpr unsigned int mask = filter_swizzle_mask(25, CUDA_NCHW_2_NHWC_BLOCK_C); // NCHW2NHWC<<>>(K_D, kernel_f16.get(), ne, ne00, ne01); @@ -1424,10 +1425,8 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa } } } - candidate = -1; if(candidate != -1){ j = candidate; - printf("choosing %d \n", j); if (j == 2) { launch_conv2d_implicit_split_kernel(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st); diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index b7ff38b833..2ebf0e2c14 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -319,7 +319,7 @@ static std::vector> configs = { // std::make_tuple(960,320,104,152,3,3), // std::make_tuple(1280,1280,26,38,3,3), // std::make_tuple(1920,640,32,32,3,3) - // std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), // std::make_tuple(32,12,141,133,3,3), // std::make_tuple(32,6,141,133,3,3), // std::make_tuple(32,12,141,121,3,3), @@ -330,7 +330,7 @@ static std::vector> configs = { // std::make_tuple(256,12,16,16,3,3), //working // std::make_tuple(32,12,16,16,3,3), //not working // std::make_tuple(48,12,16,16,3,3), // not working - std::make_tuple(96,12,16,16,3,3), //not working + // std::make_tuple(96,12,16,16,3,3), //not working // std::make_tuple(64,12,16,16,3,3), //working // std::make_tuple(64,12,141,133,3,3), //working // std::make_tuple(32,12,141,133,3,3), //working