From be25be8ed3fc7f5ab41a7157d91d388e6363c729 Mon Sep 17 00:00:00 2001
From: bssrdf
Date: Fri, 24 Oct 2025 14:24:26 -0400
Subject: [PATCH] WIP: debugging tensor core kernel

---
 ggml/src/ggml-cuda/conv2d-implicit.cu  | 66 +++++++++++++++-----------
 ggml/src/ggml-cuda/conv2d-implicit.cuh | 28 +++++------
 tests/test-conv2d-implicit.cpp         | 66 +++++++++++++-------------
 3 files changed, 86 insertions(+), 74 deletions(-)

diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index 482270e2c7..f08e19e9fb 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -259,6 +259,10 @@ static __global__ void conv2d_implicit_kernel(const float * __restrict__ input,

     __syncthreads();

+    if(tx == 0 && bx == 0 && by == 0 && z == 0){
+        printf("non tensor \n");
+    }
+
     // if(tx == 0 && bx == 0 && by == 0 && z == 0){
     //     for(int i=0; i < 128; ++i)
     //         printf("%.2f,", smeminput[i]);
@@ -738,7 +742,7 @@ __device__ __forceinline__ void ldmatrix_a(
     half (&reg)[mma_tiles_per_warp_m][mma_tiles_per_warp_k][4]
 )
 {
-#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     static_assert(mma_tiles_per_warp_m == 8, "mma_tiles_per_warp_m must be 4");
     static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4");
@@ -881,11 +885,11 @@ __device__ __forceinline__ void ldmatrix_a(
         : "=r"(reg_[6][3][0]), "=r"(reg_[6][3][1]), "=r"(reg_[7][3][0]), "=r"(reg_[7][3][1])
         : "r"(src_addr + 96 * smem_stride_)
     );
-#else
-    GGML_UNUSED(src);
-    GGML_UNUSED(reg);
-    NO_DEVICE_CODE;
-#endif
+// #else
+//     GGML_UNUSED(src);
+//     GGML_UNUSED(reg);
+//     NO_DEVICE_CODE;
+// #endif
 }

 template
@@ -894,7 +898,7 @@ __device__ __forceinline__ void ldmatrix_b(
     half (&reg)[mma_tiles_per_warp_k][mma_tiles_per_warp_n][2]
 )
 {
-#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4");
     static_assert(mma_tiles_per_warp_n == 8, "mma_tiles_per_warp_n must be 8");
@@ -985,23 +989,26 @@ __device__ __forceinline__ void ldmatrix_b(
         // : "r"(src_addr ^ 0b1000000)
         : "r"(src_addr + 32 * smem_stride_)
     );
-#else
-    GGML_UNUSED(src);
-    GGML_UNUSED(reg);
-    NO_DEVICE_CODE;
-#endif
+// #else
+//     GGML_UNUSED(src);
+//     GGML_UNUSED(reg);
+//     NO_DEVICE_CODE;
+// #endif
 }

 template
-static __global__ void conv2d_implicit_kernel_tc(const half * __restrict__ input,
+static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
                                                  const half * __restrict__ kernel,
                                                  half * __restrict__ output,
                                                  const param_t param) {
-#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     constexpr unsigned int MMA_M = 16;
     constexpr unsigned int MMA_N = 8;

+// if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y ==0)
+//     printf("conv2d_implicit_kernel launch BM:%d, BN:%d, BK:%d, WM:%d, WN:%d, WK:%d, NUM_THREADS:%d \n", BM, BN, BK, WM, WN, WK, NUM_THREADS);
+
     const unsigned int K = param.c * param.r * param.s;
     const uint PQ = param.Oh * param.Ow;
     const uint inChannelOffset = param.c * param.w;
@@ -1180,13 +1187,13 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
             }
         }
     }
-#else
-    GGML_UNUSED(input);
-    GGML_UNUSED(kernel);
-    GGML_UNUSED(output);
-    GGML_UNUSED(param);
-    NO_DEVICE_CODE;
-#endif
+// #else
+//     GGML_UNUSED(input);
+//     GGML_UNUSED(kernel);
+//     GGML_UNUSED(output);
+//     GGML_UNUSED(param);
+//     NO_DEVICE_CODE;
+// #endif
 }
@@ -1248,8 +1255,8 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,
 }

 static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) {
-#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     if (GGML_CUDA_CC_IS_NVIDIA(cc) && ampere_mma_available(cc) && P.layout == 0 && P.c % 8 == 0) {
+// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
         constexpr unsigned int BM_dim = 256;
         constexpr unsigned int BN_dim = 256;
         constexpr unsigned int BK_dim = 32;
@@ -1267,6 +1274,9 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
         constexpr unsigned int ThreadsN = WARPSIZE * WARPS_PER_BLOCK_N;
         constexpr unsigned int NumThreads = ThreadsM * ThreadsN;
         const unsigned int shmem_bytes = (BM_dim * BK_dim + BK_dim * BN_dim) * 2 * sizeof(half);
+
+        cudaFuncSetAttribute(conv2d_implicit_kernel,
+                cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75

         dim3 gridDim(BlocksN, BlocksM);
         dim3 blockDim(ThreadsN, ThreadsM);
@@ -1280,17 +1290,19 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
             to_fp16_cuda(X_D, x_f16.get(), ne, st);
             const half *X_H = x_f16.get();
             ggml_cuda_pool_alloc Y_H(ctx.pool(id), P.k * P.Oh * P.Ow * P.n);
-            conv2d_implicit_kernel_tc <<>>(X_H, K_D, Y_H.get(), P);
+            conv2d_implicit_kernel <<>>(X_H, K_D, Y_H.get(), P);
             const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
             to_fp32_cuda(Y_H.get(), Y_D, P.k * P.Oh * P.Ow * P.n, st);
-    }else{
+// #else
+//     printf("non tensor path called\n");
+//     conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
+// #endif
+    } else{
         conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
     }
-#else
-    conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
-#endif
+
 }

 static void conv2d_implicit_cuda_f32(ggml_backend_cuda_context & ctx, const float * X_D, const float * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) {
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cuh b/ggml/src/ggml-cuda/conv2d-implicit.cuh
index b7d2c8ff2e..7d966705b8 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh
@@ -26,7 +26,7 @@ typedef struct{
     uint3 OHOW_fastdiv;
 } param_t;

-#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

 // same as above, but writes are swizzled to avoid bank conflicts when shared memory is read later in the kernel
 template
@@ -98,9 +98,9 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
     unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
     const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
     // TODO: next block_k loop
-    const uint curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset
-    const uint curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    const uint curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); //
+    const unsigned int curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset
+    const unsigned int curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
+    const unsigned int curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); //

 #pragma unroll
     for (unsigned int i = 0; i < NUM_ITERS; i++)
@@ -166,9 +166,9 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
     int posw_ori = fastmodulo(npq_res, param.OW_fastdiv) * param.v - param.q;
     unsigned int inOffset = n * param.c * param.h * param.w;
     // TODO: next block_k loop
-    const uint curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset
-    const uint curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    const uint curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
+    const unsigned int curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset
+    const unsigned int curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
+    const unsigned int curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
     int curH = posh_ori + curR * param.d_h; // input h
     int curW = posw_ori + curS * param.d_w; // input w
     // apply swizzle to the dst index
@@ -234,9 +234,9 @@ __device__ __forceinline__ void tileMemcpyLoadA(
     int posw_ori = fastmodulo(npq_res, param.OW_fastdiv) * param.v - param.q;
     unsigned int inOffset = n * param.c * param.h * param.w;
     // TODO: next block_k loop
-    const uint curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset
-    const uint curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    const uint curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
+    const unsigned int curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset
+    const unsigned int curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
+    const unsigned int curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
     int curH = posh_ori + curR * param.d_h; // input h
     int curW = posw_ori + curS * param.d_w; // input w
     if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h &&
@@ -285,9 +285,9 @@ __device__ __forceinline__ void tileMemcpyLoadB(

     // compile time check that we provided the right amount of registers for storage
     static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
-    const uint curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset
-    const uint curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    const uint curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); //
+    const unsigned int curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset
+    const unsigned int curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
+    const unsigned int curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); //

 #pragma unroll
     for (unsigned int i = 0; i < NUM_ITERS; i++)
@@ -448,7 +448,7 @@ __device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
     return address;
 }

-#endif
+// #endif

 // constexpr unsigned int int_log2(unsigned int x)
 // {
diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp
index 4d416e748c..3685a10d72 100644
--- a/tests/test-conv2d-implicit.cpp
+++ b/tests/test-conv2d-implicit.cpp
@@ -63,8 +63,8 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu

     size_t buffer_size = 0;
     {
-        buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
-        // buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
+        // buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
+        buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
         buffer_size += IW * IH * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b
         buffer_size += 1024; // overhead
     }
@@ -112,8 +112,8 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
     model.ctx = ggml_init(params);

     // create tensors
-    // model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, IC, OC);
-    model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, IC, OC);
+    model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, IC, OC);
+    // model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, IC, OC);
     model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);

     // create a allocator
@@ -124,11 +124,11 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu

     // load data to buffer
     if(ggml_backend_is_cpu(model.backend)) {
-        // memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
-        memcpy(model.a->data, adata.data(), ggml_nbytes(model.a));
+        memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
+        // memcpy(model.a->data, adata.data(), ggml_nbytes(model.a));
     } else {
-        // ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
-        ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a));
+        ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
+        // ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a));
     }

     // alloc memory
@@ -262,7 +262,7 @@ struct ggml_cgraph * build_graph_2(const test_model& model) {

     // printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);

-    struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, 1);
+    struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, 0);
     // struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
     ggml_set_name(wino_res, "wino_res");
     ggml_build_forward_expand(gf, wino_res);
@@ -339,20 +339,20 @@ int main(void)
 {
     ggml_time_init();

     std::vector<std::tuple<int, int, int, int>> configs = {
-        std::make_tuple(64,64,48,64),
-        std::make_tuple(320,320,104,152),
-        std::make_tuple(640,640,52,76),
-        std::make_tuple(640,640,104,152),
-        std::make_tuple(960,320,104,152),
-        std::make_tuple(1280,1280,26,38),
-        std::make_tuple(1280,640,52,76),
-        std::make_tuple(1920,1280,26,38),
-        std::make_tuple(2560,1280,26,38),
-        std::make_tuple(512,512,104,152),
-        std::make_tuple(512,512,208,304),
-        std::make_tuple(512,256,416,608),
-        std::make_tuple(256,128,832,1216),
-        std::make_tuple(256,256,832,1216),
+        // std::make_tuple(64,64,48,64),
+        // std::make_tuple(320,320,104,152),
+        // std::make_tuple(640,640,52,76),
+        // std::make_tuple(640,640,104,152),
+        // std::make_tuple(960,320,104,152),
+        std::make_tuple(160,1280,26,38),
+        // std::make_tuple(1280,640,52,76),
+        // std::make_tuple(1920,1280,26,38),
+        // std::make_tuple(2560,1280,26,38),
+        // std::make_tuple(512,512,104,152),
+        // std::make_tuple(512,512,208,304),
+        // std::make_tuple(512,256,416,608),
+        // std::make_tuple(256,128,832,1216),
+        // std::make_tuple(256,256,832,1216),
         // std::make_tuple(320,256,1024,1920)
     };
@@ -375,7 +375,7 @@ int main(void)

     struct ggml_cgraph * gf_res_0 = NULL;

-    int iterations = 20;
+    int iterations = 0;

     double run_time0;
     std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
@@ -436,15 +436,15 @@ int main(void)

     // for(int i = 0; i < ggml_nelements(wino_res); i++) {
-    // for(int i = 0; i < 3*28; i++) {
-    //     float diff = fabs(conv2d_data[i] - wino_data[i]);
-    //     // if(diff > 1.e-4) {
-    //     printf("(%f, %f, %f, %d) \n",
-    //             conv2d_data[i],
-    //             wino_data[i], diff, i);
-    //     // break;
-    //     // }
-    // }
+    for(int i = 0; i < 26*38; i++) {
+        float diff = fabs(conv2d_data[i] - wino_data[i]);
+        // if(diff > 1.e-4) {
+        printf("(%f, %f, %f, %d) \n",
+                conv2d_data[i],
+                wino_data[i], diff, i);
+        // break;
+        // }
+    }

     ggml_free(model.ctx);
     ggml_backend_buffer_free(model.buffer);
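A note for reviewers, not part of the patch: the cudaFuncSetAttribute call added above is what makes the 256x256x32 tiling launchable at all. With BM_dim = BN_dim = 256 and BK_dim = 32, the double-buffered A and B tiles need (BM_dim * BK_dim + BK_dim * BN_dim) * 2 * sizeof(half) = (8192 + 8192) * 2 * 2 = 65536 bytes of dynamic shared memory, which is above the 48 KiB a kernel may use without opting in. Below is a minimal, self-contained sketch of that opt-in pattern; dummy_kernel, the hard-coded 64 KiB request, and the launch shape are illustrative stand-ins rather than the patch's kernel, and the sketch queries cudaDevAttrMaxSharedMemoryPerBlockOptin instead of assuming the device can grant the request.

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative kernel: it only touches the dynamically sized shared buffer,
// so the launch fails visibly if the opt-in was not applied.
__global__ void dummy_kernel(float * out) {
    extern __shared__ unsigned char smem[];
    smem[threadIdx.x] = (unsigned char) threadIdx.x;
    __syncthreads();
    if (threadIdx.x == 0) {
        out[blockIdx.x] = (float) smem[0];
    }
}

int main() {
    int device = 0;
    cudaGetDevice(&device);

    // How much dynamic shared memory can a block opt into on this device?
    int max_optin = 0;
    cudaDeviceGetAttribute(&max_optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);

    const size_t shmem_bytes = 64 * 1024; // mirrors (BM_dim*BK_dim + BK_dim*BN_dim) * 2 * sizeof(half) in the patch
    if ((size_t) max_optin < shmem_bytes) {
        fprintf(stderr, "device only allows %d bytes of opt-in shared memory per block\n", max_optin);
        return 1;
    }

    // Raise the per-kernel dynamic shared-memory limit above the 48 KiB default.
    cudaFuncSetAttribute(dummy_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, (int) shmem_bytes);

    float * out = nullptr;
    cudaMalloc(&out, sizeof(float));
    dummy_kernel<<<1, 32, shmem_bytes>>>(out);
    cudaError_t err = cudaDeviceSynchronize();
    printf("launch with %zu bytes of dynamic smem: %s\n", shmem_bytes, cudaGetErrorString(err));
    cudaFree(out);
    return 0;
}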