WIP: debugging tensor core kernel

This commit is contained in:
bssrdf 2025-10-24 14:24:26 -04:00
parent 80a996cfc0
commit be25be8ed3
3 changed files with 86 additions and 74 deletions

View File

@ -259,6 +259,10 @@ static __global__ void conv2d_implicit_kernel(const float * __restrict__ input,
__syncthreads(); __syncthreads();
if(tx == 0 && bx == 0 && by == 0 && z == 0){
printf("non tensor \n");
}
// if(tx == 0 && bx == 0 && by == 0 && z == 0){ // if(tx == 0 && bx == 0 && by == 0 && z == 0){
// for(int i=0; i < 128; ++i) // for(int i=0; i < 128; ++i)
// printf("%.2f,", smeminput[i]); // printf("%.2f,", smeminput[i]);
@ -738,7 +742,7 @@ __device__ __forceinline__ void ldmatrix_a(
half (&reg)[mma_tiles_per_warp_m][mma_tiles_per_warp_k][4] half (&reg)[mma_tiles_per_warp_m][mma_tiles_per_warp_k][4]
) )
{ {
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE // #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
static_assert(mma_tiles_per_warp_m == 8, "mma_tiles_per_warp_m must be 4"); static_assert(mma_tiles_per_warp_m == 8, "mma_tiles_per_warp_m must be 4");
static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4"); static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4");
@ -881,11 +885,11 @@ __device__ __forceinline__ void ldmatrix_a(
: "=r"(reg_[6][3][0]), "=r"(reg_[6][3][1]), "=r"(reg_[7][3][0]), "=r"(reg_[7][3][1]) : "=r"(reg_[6][3][0]), "=r"(reg_[6][3][1]), "=r"(reg_[7][3][0]), "=r"(reg_[7][3][1])
: "r"(src_addr + 96 * smem_stride_) : "r"(src_addr + 96 * smem_stride_)
); );
#else // #else
GGML_UNUSED(src); // GGML_UNUSED(src);
GGML_UNUSED(reg); // GGML_UNUSED(reg);
NO_DEVICE_CODE; // NO_DEVICE_CODE;
#endif // #endif
} }
template <unsigned int mma_tiles_per_warp_k, unsigned int mma_tiles_per_warp_n, unsigned int smem_stride> template <unsigned int mma_tiles_per_warp_k, unsigned int mma_tiles_per_warp_n, unsigned int smem_stride>
@ -894,7 +898,7 @@ __device__ __forceinline__ void ldmatrix_b(
half (&reg)[mma_tiles_per_warp_k][mma_tiles_per_warp_n][2] half (&reg)[mma_tiles_per_warp_k][mma_tiles_per_warp_n][2]
) )
{ {
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE // #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4"); static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4");
static_assert(mma_tiles_per_warp_n == 8, "mma_tiles_per_warp_n must be 8"); static_assert(mma_tiles_per_warp_n == 8, "mma_tiles_per_warp_n must be 8");
@ -985,23 +989,26 @@ __device__ __forceinline__ void ldmatrix_b(
// : "r"(src_addr ^ 0b1000000) // : "r"(src_addr ^ 0b1000000)
: "r"(src_addr + 32 * smem_stride_) : "r"(src_addr + 32 * smem_stride_)
); );
#else // #else
GGML_UNUSED(src); // GGML_UNUSED(src);
GGML_UNUSED(reg); // GGML_UNUSED(reg);
NO_DEVICE_CODE; // NO_DEVICE_CODE;
#endif // #endif
} }
template<const int BM, const int BN, const int BK, const int WM, const int WN, template<const int BM, const int BN, const int BK, const int WM, const int WN,
const int WK, const int NUM_THREADS> const int WK, const int NUM_THREADS>
static __global__ void conv2d_implicit_kernel_tc(const half * __restrict__ input, static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
const half * __restrict__ kernel, const half * __restrict__ kernel,
half * __restrict__ output, half * __restrict__ output,
const param_t param) { const param_t param) {
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE // #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
constexpr unsigned int MMA_M = 16; constexpr unsigned int MMA_M = 16;
constexpr unsigned int MMA_N = 8; constexpr unsigned int MMA_N = 8;
// if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y ==0)
// printf("conv2d_implicit_kernel launch BM:%d, BN:%d, BK:%d, WM:%d, WN:%d, WK:%d, NUM_THREADS:%d \n", BM, BN, BK, WM, WN, WK, NUM_THREADS);
const unsigned int K = param.c * param.r * param.s; const unsigned int K = param.c * param.r * param.s;
const uint PQ = param.Oh * param.Ow; const uint PQ = param.Oh * param.Ow;
const uint inChannelOffset = param.c * param.w; const uint inChannelOffset = param.c * param.w;
@ -1180,13 +1187,13 @@ static __global__ void conv2d_implicit_kernel_tc(const half * __restrict__ input
} }
} }
} }
#else // #else
GGML_UNUSED(input); // GGML_UNUSED(input);
GGML_UNUSED(kernel); // GGML_UNUSED(kernel);
GGML_UNUSED(output); // GGML_UNUSED(output);
GGML_UNUSED(param); // GGML_UNUSED(param);
NO_DEVICE_CODE; // NO_DEVICE_CODE;
#endif // #endif
} }
@ -1248,8 +1255,8 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,
} }
static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) { static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) {
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
if (GGML_CUDA_CC_IS_NVIDIA(cc) && ampere_mma_available(cc) && P.layout == 0 && P.c % 8 == 0) { if (GGML_CUDA_CC_IS_NVIDIA(cc) && ampere_mma_available(cc) && P.layout == 0 && P.c % 8 == 0) {
// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
constexpr unsigned int BM_dim = 256; constexpr unsigned int BM_dim = 256;
constexpr unsigned int BN_dim = 256; constexpr unsigned int BN_dim = 256;
constexpr unsigned int BK_dim = 32; constexpr unsigned int BK_dim = 32;
@ -1267,6 +1274,9 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
constexpr unsigned int ThreadsN = WARPSIZE * WARPS_PER_BLOCK_N; constexpr unsigned int ThreadsN = WARPSIZE * WARPS_PER_BLOCK_N;
constexpr unsigned int NumThreads = ThreadsM * ThreadsN; constexpr unsigned int NumThreads = ThreadsM * ThreadsN;
const unsigned int shmem_bytes = (BM_dim * BK_dim + BK_dim * BN_dim) * 2 * sizeof(half); const unsigned int shmem_bytes = (BM_dim * BK_dim + BK_dim * BN_dim) * 2 * sizeof(half);
cudaFuncSetAttribute(conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, NumThreads>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
dim3 gridDim(BlocksN, BlocksM); dim3 gridDim(BlocksN, BlocksM);
dim3 blockDim(ThreadsN, ThreadsM); dim3 blockDim(ThreadsN, ThreadsM);
@ -1280,17 +1290,19 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
to_fp16_cuda(X_D, x_f16.get(), ne, st); to_fp16_cuda(X_D, x_f16.get(), ne, st);
const half *X_H = x_f16.get(); const half *X_H = x_f16.get();
ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), P.k * P.Oh * P.Ow * P.n); ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), P.k * P.Oh * P.Ow * P.n);
conv2d_implicit_kernel_tc<BM_dim, BN_dim, BK_dim, conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim,
WM_dim, WN_dim, WK_dim, NumThreads> WM_dim, WN_dim, WK_dim, NumThreads>
<<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_D, Y_H.get(), P); <<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_D, Y_H.get(), P);
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(Y_H.get(), Y_D, P.k * P.Oh * P.Ow * P.n, st); to_fp32_cuda(Y_H.get(), Y_D, P.k * P.Oh * P.Ow * P.n, st);
}else{ // #else
// printf("non tensor path called\n");
// conv2d_implicit_cuda<half, 1>(X_D, K_D, Y_D, P, st);
// #endif
} else{
conv2d_implicit_cuda<half, 1>(X_D, K_D, Y_D, P, st); conv2d_implicit_cuda<half, 1>(X_D, K_D, Y_D, P, st);
} }
#else
conv2d_implicit_cuda<half, 1>(X_D, K_D, Y_D, P, st);
#endif
} }
static void conv2d_implicit_cuda_f32(ggml_backend_cuda_context & ctx, const float * X_D, const float * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) { static void conv2d_implicit_cuda_f32(ggml_backend_cuda_context & ctx, const float * X_D, const float * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) {

View File

@ -26,7 +26,7 @@ typedef struct{
uint3 OHOW_fastdiv; uint3 OHOW_fastdiv;
} param_t; } param_t;
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE // #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
// same as above, but writes are swizzled to avoid bank conflicts when shared memory is read later in the kernel // same as above, but writes are swizzled to avoid bank conflicts when shared memory is read later in the kernel
template<unsigned int TILE_ROWS, template<unsigned int TILE_ROWS,
unsigned int NUM_THREADS> unsigned int NUM_THREADS>
@ -98,9 +98,9 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED; unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED; const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
// TODO: next block_k loop // TODO: next block_k loop
const uint curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset const unsigned int curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset
const uint curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset const unsigned int curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
const uint curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // const unsigned int curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); //
#pragma unroll #pragma unroll
for (unsigned int i = 0; i < NUM_ITERS; i++) for (unsigned int i = 0; i < NUM_ITERS; i++)
@ -166,9 +166,9 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
int posw_ori = fastmodulo(npq_res, param.OW_fastdiv) * param.v - param.q; int posw_ori = fastmodulo(npq_res, param.OW_fastdiv) * param.v - param.q;
unsigned int inOffset = n * param.c * param.h * param.w; unsigned int inOffset = n * param.c * param.h * param.w;
// TODO: next block_k loop // TODO: next block_k loop
const uint curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset const unsigned int curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset
const uint curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset const unsigned int curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
const uint curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset const unsigned int curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
int curH = posh_ori + curR * param.d_h; // input h int curH = posh_ori + curR * param.d_h; // input h
int curW = posw_ori + curS * param.d_w; // input w int curW = posw_ori + curS * param.d_w; // input w
// apply swizzle to the dst index // apply swizzle to the dst index
@ -234,9 +234,9 @@ __device__ __forceinline__ void tileMemcpyLoadA(
int posw_ori = fastmodulo(npq_res, param.OW_fastdiv) * param.v - param.q; int posw_ori = fastmodulo(npq_res, param.OW_fastdiv) * param.v - param.q;
unsigned int inOffset = n * param.c * param.h * param.w; unsigned int inOffset = n * param.c * param.h * param.w;
// TODO: next block_k loop // TODO: next block_k loop
const uint curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset const unsigned int curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset
const uint curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset const unsigned int curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
const uint curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset const unsigned int curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
int curH = posh_ori + curR * param.d_h; // input h int curH = posh_ori + curR * param.d_h; // input h
int curW = posw_ori + curS * param.d_w; // input w int curW = posw_ori + curS * param.d_w; // input w
if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h && if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h &&
@ -285,9 +285,9 @@ __device__ __forceinline__ void tileMemcpyLoadB(
// compile time check that we provided the right amount of registers for storage // compile time check that we provided the right amount of registers for storage
static_assert(ELEMENTS_PER_THREAD == NUM_ITERS); static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
const uint curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset const unsigned int curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset
const uint curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset const unsigned int curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
const uint curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // const unsigned int curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); //
#pragma unroll #pragma unroll
for (unsigned int i = 0; i < NUM_ITERS; i++) for (unsigned int i = 0; i < NUM_ITERS; i++)
@ -448,7 +448,7 @@ __device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
return address; return address;
} }
#endif // #endif
// constexpr unsigned int int_log2(unsigned int x) // constexpr unsigned int int_log2(unsigned int x)
// { // {

View File

@ -63,8 +63,8 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
size_t buffer_size = 0; size_t buffer_size = 0;
{ {
buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a // buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
// buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
buffer_size += IW * IH * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b buffer_size += IW * IH * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b
buffer_size += 1024; // overhead buffer_size += 1024; // overhead
} }
@ -112,8 +112,8 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
model.ctx = ggml_init(params); model.ctx = ggml_init(params);
// create tensors // create tensors
// model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, IC, OC); model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, IC, OC);
model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, IC, OC); // model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, IC, OC);
model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N); model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);
// create a allocator // create a allocator
@ -124,11 +124,11 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
// load data to buffer // load data to buffer
if(ggml_backend_is_cpu(model.backend)) { if(ggml_backend_is_cpu(model.backend)) {
// memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a)); memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
memcpy(model.a->data, adata.data(), ggml_nbytes(model.a)); // memcpy(model.a->data, adata.data(), ggml_nbytes(model.a));
} else { } else {
// ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a)); ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a)); // ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a));
} }
// alloc memory // alloc memory
@ -262,7 +262,7 @@ struct ggml_cgraph * build_graph_2(const test_model& model) {
// printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]); // printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, 1); struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, 0);
// struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1); // struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
ggml_set_name(wino_res, "wino_res"); ggml_set_name(wino_res, "wino_res");
ggml_build_forward_expand(gf, wino_res); ggml_build_forward_expand(gf, wino_res);
@ -339,20 +339,20 @@ int main(void)
{ {
ggml_time_init(); ggml_time_init();
std::vector<std::tuple<int, int, int, int>> configs = { std::vector<std::tuple<int, int, int, int>> configs = {
std::make_tuple(64,64,48,64), // std::make_tuple(64,64,48,64),
std::make_tuple(320,320,104,152), // std::make_tuple(320,320,104,152),
std::make_tuple(640,640,52,76), // std::make_tuple(640,640,52,76),
std::make_tuple(640,640,104,152), // std::make_tuple(640,640,104,152),
std::make_tuple(960,320,104,152), // std::make_tuple(960,320,104,152),
std::make_tuple(1280,1280,26,38), std::make_tuple(160,1280,26,38),
std::make_tuple(1280,640,52,76), // std::make_tuple(1280,640,52,76),
std::make_tuple(1920,1280,26,38), // std::make_tuple(1920,1280,26,38),
std::make_tuple(2560,1280,26,38), // std::make_tuple(2560,1280,26,38),
std::make_tuple(512,512,104,152), // std::make_tuple(512,512,104,152),
std::make_tuple(512,512,208,304), // std::make_tuple(512,512,208,304),
std::make_tuple(512,256,416,608), // std::make_tuple(512,256,416,608),
std::make_tuple(256,128,832,1216), // std::make_tuple(256,128,832,1216),
std::make_tuple(256,256,832,1216), // std::make_tuple(256,256,832,1216),
// std::make_tuple(320,256,1024,1920) // std::make_tuple(320,256,1024,1920)
}; };
@ -375,7 +375,7 @@ int main(void)
struct ggml_cgraph * gf_res_0 = NULL; struct ggml_cgraph * gf_res_0 = NULL;
int iterations = 20; int iterations = 0;
double run_time0; double run_time0;
std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0); std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
@ -436,15 +436,15 @@ int main(void)
// for(int i = 0; i < ggml_nelements(wino_res); i++) { // for(int i = 0; i < ggml_nelements(wino_res); i++) {
// for(int i = 0; i < 3*28; i++) { for(int i = 0; i < 26*38; i++) {
// float diff = fabs(conv2d_data[i] - wino_data[i]); float diff = fabs(conv2d_data[i] - wino_data[i]);
// // if(diff > 1.e-4) { // if(diff > 1.e-4) {
// printf("(%f, %f, %f, %d) \n", printf("(%f, %f, %f, %d) \n",
// conv2d_data[i], conv2d_data[i],
// wino_data[i], diff, i); wino_data[i], diff, i);
// // break; // break;
// // } // }
// } }
ggml_free(model.ctx); ggml_free(model.ctx);
ggml_backend_buffer_free(model.buffer); ggml_backend_buffer_free(model.buffer);