From 63c53fe1f1568f11ac69e07ffdce2f84968675ab Mon Sep 17 00:00:00 2001
From: bssrdf
Date: Thu, 13 Nov 2025 18:44:32 -0500
Subject: [PATCH] WIP: move r/s loop into the block-k loop, following CUTLASS

---
 ggml/src/ggml-cuda/conv2d-implicit.cu | 63 ++++++++++++++++++---------
 tests/test-conv2d.cpp                 | 24 +++++-----
 2 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index d2b5ee6d33..d04c379bb0 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -803,6 +803,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
     const unsigned int start_k = (ksplit > 0) ? z * ks : 0;
     const unsigned int end_k = min(start_k + ks, K);
     const unsigned int num_block_tiles_k = (ks + (BK-1)) / BK;
+    const unsigned int num_block_tiles_krs = num_block_tiles_k * param.r * param.s;
 
     constexpr unsigned int TILE_COLS_VECTORIZED = BK / 8;
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
@@ -867,26 +868,49 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
 
     prepareIteratorA(thread_idx, masks_a, element_offset_a, param);
 
-    // prefetch the first block tile of A,B into shared memory
     const half* A_block_gmem = input;
     const half* B_block_gmem = kernel + block_n * BN * weightKOffset;
-    int s = 0;
-    int r = 0;
-    while (r < param.r) {
-    // for (int r = 0; r < param.r; ++r) {
-        tileMemcpySwizzleA(A_block_gmem, A_block_smem, r, s, masks_a, element_offset_a, thread_idx, start_k, end_k, inChannelOffset, param);
-        tileMemcpySwizzleB(B_block_gmem, B_block_smem, r, s, start_k, end_k, weightKOffset, param);
+    tileMemcpySwizzleA(A_block_gmem, A_block_smem, 0, 0, masks_a, element_offset_a, thread_idx, start_k, end_k, inChannelOffset, param);
+    tileMemcpySwizzleB(B_block_gmem, B_block_smem, 0, 0, start_k, end_k, weightKOffset, param);
 
     int offset_direction = 1;
-
-    for (unsigned int block_k = 1; block_k <= num_block_tiles_k; block_k++){
+    unsigned int block_k = 0;
+    unsigned int block_krs = 1;
+    // for (unsigned int block_k = 1; block_k <= num_block_tiles_k; block_k++){
+    int s = 0;
+    int r = 0;
+    while (block_k < num_block_tiles_k){
 
         __syncthreads();
 
-        if (block_k != num_block_tiles_k){
+        // moves to the next tile
+        int next_idx = 0;
+        ++s;
+        if (s == param.s) {
+            s = 0;
+            ++r;
+            if (r < param.r) {
+                next_idx = 1;
+            } else {
+                r = 0;
+                next_idx = 2;
+            }
+        }
+        if (next_idx == 2) {
+            ++block_k;
+        }
+        // if(block_k == num_block_tiles_k)
+        //     break;
+
+        // if(thread_idx == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
+        //     printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d %d \n", s, r, block_k, next_idx, block_krs, num_block_tiles_k);
+        // }
+
+        // if (block_k != num_block_tiles_k){
+        if (block_krs != num_block_tiles_krs){
             tileMemcpyLoadA(A_block_gmem, A_gmem_cache_reg, r, s, block_k * BK, start_k, end_k, inChannelOffset, param);
             tileMemcpyLoadB(B_block_gmem, B_gmem_cache_reg, r, s, block_k * BK, start_k, end_k, weightKOffset, param);
         }
@@ -932,7 +956,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
             }
         }
 
-        if (block_k != num_block_tiles_k)
+        // if (block_k != num_block_tiles_k)
+        if (block_krs != num_block_tiles_krs)
         {
             // switch smem buffers each iteration
             A_block_smem = A_block_smem + BUFFER_SIZE * offset_direction;
@@ -942,16 +967,14 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
             tileMemcpySwizzleStore(A_gmem_cache_reg, A_block_smem);
             tileMemcpySwizzleStore(B_gmem_cache_reg, B_block_smem);
         }
-    } // iter block_k
-        s++;
-        if (s == param.s) {
-            s = 0;
-            r++;
-        }
-        A_block_smem = shmem;
-        B_block_smem = &shmem[BM * BK];
-    } // iter r
+        block_krs++;
+
+    }
+    // A_block_smem = shmem;
+    // B_block_smem = &shmem[BM * BK];
+
+    // } // iter block_k
 
     // reuse smem
     half *smemoutput = shmem;
diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp
index daac5c9605..e3968f28b8 100644
--- a/tests/test-conv2d.cpp
+++ b/tests/test-conv2d.cpp
@@ -301,9 +301,9 @@ static std::vector<std::tuple<int,int,int,int,int,int>> configs = {
     // std::make_tuple(960,320,104,152,3,3),
     // std::make_tuple(1280,1280,26,38,3,3),
     // std::make_tuple(1920,640,32,32,3,3)
-    // std::make_tuple(1280,1280,16,16,3,3),
+    std::make_tuple(1280,1280,16,16,3,3),
     // std::make_tuple(32,8,24,24,3,3),
-    std::make_tuple(640,640,64,64,3,3),
+    // std::make_tuple(640,640,64,64,3,3),
     // std::make_tuple(320,640,32,32,3,3),
     // std::make_tuple(4,320,96,128,3,3),
     // std::make_tuple(320,4,96,128,3,3),
@@ -673,7 +673,7 @@ int main(void)
 
     // fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
 
-    int iterations = 20;
+    int iterations = 0;
     double run_time0;
     std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
 
@@ -716,15 +716,15 @@ int main(void)
 
     // for(int i = 0; i < ggml_nelements(wino_res); i++) {
     // for(int i = 0; i < 26*38; i++) {
-    // for(int i = 0; i < conv2d_data.size(); i++) {
-    //     float diff = fabs(im2col_data[i] - conv2d_data[i]);
-    //     // if(diff > 0.5) {
-    //         printf("(%7.3f, %7.3f, %.2f, %d) \n",
-    //                 im2col_data[i], conv2d_data[i],
-    //                 diff, i);
-    //         // break;
-    //     // }
-    // }
+    for(int i = 0; i < conv2d_data.size(); i++) {
+        float diff = fabs(im2col_data[i] - conv2d_data[i]);
+        // if(diff > 0.5) {
+            printf("(%7.3f, %7.3f, %.2f, %d) \n",
+                    im2col_data[i], conv2d_data[i],
+                    diff, i);
+            // break;
+        // }
+    }
 
     ggml_free(model.ctx);
     ggml_backend_buffer_free(model.buffer);
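
Note on the new iteration scheme: the patch fuses the filter-tap loop (r, s) and the channel-tile loop (block_k) into a single software-pipelined main loop, in the style of CUTLASS's implicit-GEMM convolution iterators. Each iteration first advances the (s, r, block_k) coordinate innermost-first, then prefetches the tile at the new coordinate while the MMA stage consumes the tile loaded on the previous iteration; num_block_tiles_krs is the fused trip count, and the block_krs != num_block_tiles_krs guard suppresses the prefetch (and the smem buffer swap) on the final iteration. The host-side C++ sketch below replays just that state machine so the visit order can be checked in isolation; R, S, and the tile counts are made-up stand-ins for param.r, param.s, and the kernel's per-split tile count, and the printf stands in for tileMemcpyLoadA/tileMemcpyLoadB:

// Standalone sketch of the fused (s, r, block_k) tile iterator from the patch.
// Hypothetical stand-ins: R/S for param.r/param.s, num_block_tiles_k for the
// kernel's per-split tile count; printf replaces the actual prefetch calls.
#include <cstdio>

int main() {
    const int R = 3;                           // filter rows (param.r)
    const int S = 3;                           // filter cols (param.s)
    const unsigned int num_block_tiles_k   = 4;
    const unsigned int num_block_tiles_krs = num_block_tiles_k * R * S;

    int r = 0, s = 0;
    unsigned int block_k   = 0;
    unsigned int block_krs = 1;                // tile (r=0, s=0, block_k=0) is prefetched before the loop

    while (block_k < num_block_tiles_k) {
        // Advance innermost-first: s, then r, then block_k, exactly as at the
        // top of the kernel's main loop. next_idx records which coordinate wrapped.
        int next_idx = 0;
        ++s;
        if (s == S) {
            s = 0;
            ++r;
            if (r < R) {
                next_idx = 1;                  // s wrapped: move to the next filter row
            } else {
                r = 0;
                next_idx = 2;                  // r wrapped too: move to the next k tile
            }
        }
        if (next_idx == 2) {
            ++block_k;
        }

        // Prefetch the next tile unless this is the final fused iteration,
        // mirroring the block_krs != num_block_tiles_krs guard in the kernel.
        if (block_krs != num_block_tiles_krs) {
            printf("prefetch s=%d r=%d block_k=%u (fused iter %u of %u)\n",
                   s, r, block_k, block_krs, num_block_tiles_krs);
        }
        block_krs++;
    }
    return 0;
}

For a 3x3 filter with 4 k-tiles this prints 35 prefetches across 36 fused iterations: the (0, 0) tile is loaded by the prologue (the tileMemcpySwizzleA/B calls with r = s = 0), and the last iteration computes without prefetching. Folding r and s into the main loop keeps a single prefetch guard and one double-buffer swap per iteration, instead of restarting the pipeline for every filter tap as the old outer while (r < param.r) structure did.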