diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index f6059fc3ae..000fd89e20 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -742,7 +742,7 @@ __device__ __forceinline__ void ldmatrix_a(
     half (&reg)[mma_tiles_per_warp_m][mma_tiles_per_warp_k][4]
 )
 {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     static_assert(mma_tiles_per_warp_m == 8, "mma_tiles_per_warp_m must be 8");
     static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4");
@@ -885,11 +885,11 @@ __device__ __forceinline__ void ldmatrix_a(
         : "=r"(reg_[6][3][0]), "=r"(reg_[6][3][1]), "=r"(reg_[7][3][0]), "=r"(reg_[7][3][1])
         : "r"(src_addr + 96 * smem_stride_)
     );
-// #else
-//     GGML_UNUSED(src);
-//     GGML_UNUSED(reg);
-//     NO_DEVICE_CODE;
-// #endif
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(reg);
+    NO_DEVICE_CODE;
+#endif
 }
 
 template <unsigned int mma_tiles_per_warp_k, unsigned int mma_tiles_per_warp_n>
@@ -898,7 +898,7 @@ __device__ __forceinline__ void ldmatrix_b(
     half (&reg)[mma_tiles_per_warp_k][mma_tiles_per_warp_n][2]
 )
 {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4");
     static_assert(mma_tiles_per_warp_n == 8, "mma_tiles_per_warp_n must be 8");
@@ -989,11 +989,11 @@ __device__ __forceinline__ void ldmatrix_b(
         // : "r"(src_addr ^ 0b1000000)
         : "r"(src_addr + 32 * smem_stride_)
     );
-// #else
-//     GGML_UNUSED(src);
-//     GGML_UNUSED(reg);
-//     NO_DEVICE_CODE;
-// #endif
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(reg);
+    NO_DEVICE_CODE;
+#endif
 }
 
 template <unsigned int BM, unsigned int BN, unsigned int BK, unsigned int WM, unsigned int WN, unsigned int WK, unsigned int NUM_THREADS>
@@ ... @@
 static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
         const half * __restrict__ kernel, half * __restrict__ output, const param_t param)
 {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 
     constexpr unsigned int MMA_M = 16;
     constexpr unsigned int MMA_N = 8;
@@ -1010,7 +1010,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
 //    printf("conv2d_implicit_kernel launch BM:%d, BN:%d, BK:%d, WM:%d, WN:%d, WK:%d, NUM_THREADS:%d \n", BM, BN, BK, WM, WN, WK, NUM_THREADS);
 
     const unsigned int K = param.c * param.r * param.s;
-    const uint PQ = param.Oh * param.Ow;
+//    const uint PQ = param.Oh * param.Ow;
     const uint inChannelOffset = param.c * param.w;
     const uint weightKOffset = param.c * param.r * param.s;
@@ -1153,7 +1153,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
             }
         }
-
+
+    // reuse smem
     half *smemoutput = shmem;
 
     const uint lane_id = threadIdx.x % WARPSIZE;
@@ -1212,21 +1213,22 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
                     // param.interm[outOffset] = smemoutput[output_lds_addr + subk * 32];
                     const uint outOffset = n * param.k * param.Oh * param.Ow + row * param.Oh * param.Ow + col;
                     output[outOffset] = smemoutput[output_lds_addr + subk + j*32*BN/2];
-                    if(outOffset == 32){
-                        printf("(%u, %u, %u, %u), output[%d,%d,%d]=%f \n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
-                        n, row, col, __half2float(output[outOffset]));
-                    }
+                    // if(outOffset == 32){
+                    //     printf("(%u, %u, %u, %u), output[%d,%d,%d]=%f \n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
+                    //     n, row, col, __half2float(output[outOffset]));
+                    // }
                 }
             }
         }
     }
-// #else
-//     GGML_UNUSED(input);
-//     GGML_UNUSED(kernel);
-//     GGML_UNUSED(output);
-//     GGML_UNUSED(param);
-//     NO_DEVICE_CODE;
-// #endif
+
+#else
+    GGML_UNUSED(input);
+    GGML_UNUSED(kernel);
+    GGML_UNUSED(output);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }
 
@@ -1289,7 +1291,8 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,
 static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) {
 
     if (GGML_CUDA_CC_IS_NVIDIA(cc) && ampere_mma_available(cc) && P.layout == 0 && P.c % 8 == 0) {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+// #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+        // printf("tensor core path called\n");
         constexpr unsigned int BM_dim = 256;
         constexpr unsigned int BN_dim = 256;
         constexpr unsigned int BK_dim = 32;
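The hunks above turn the previously commented-out guards into live preprocessor gates: each tensor-core helper now compiles its body only for __CUDA_ARCH__ >= GGML_CUDA_CC_TURING and otherwise falls back to a stub. Below is a minimal standalone sketch of that pattern. The macro definitions are simplified stand-ins added for illustration (ggml's real ones live in ggml-cuda/common.cuh, where NO_DEVICE_CODE also reports the source location before trapping), and example_mma_stage is a hypothetical name, not a function from this patch:

    #include <cuda_fp16.h>

    // Simplified stand-ins so the sketch compiles on its own.
    #define GGML_CUDA_CC_TURING 750         // __CUDA_ARCH__ value for sm_75
    #define GGML_UNUSED(x) (void)(x)
    #ifdef __CUDA_ARCH__
    #    define NO_DEVICE_CODE __trap()     // device pass: make the stub non-callable
    #else
    #    define NO_DEVICE_CODE              // host pass: expands to nothing
    #endif

    // Hypothetical helper showing the guard pattern used throughout the patch:
    // the Turing+ body is compiled only where the instructions exist; older
    // architectures keep the parameters "used" and trap if the stub is reached.
    __device__ __forceinline__ void example_mma_stage(const half * src, half * dst) {
    #if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
        *dst = *src;                        // stands in for the ldmatrix/mma path
    #else
        GGML_UNUSED(src);
        GGML_UNUSED(dst);
        NO_DEVICE_CODE;
    #endif
    }

The host side remains responsible for never launching the kernel on unsupported hardware; that is the job of the ampere_mma_available(cc) check in conv2d_implicit_cuda_f16 above.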
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cuh b/ggml/src/ggml-cuda/conv2d-implicit.cuh
index 3ea0461218..69942bffac 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh
@@ -26,7 +26,7 @@ typedef struct{
     uint3 OHOW_fastdiv;
 } param_t;
 
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+
 // same as above, but writes are swizzled to avoid bank conflicts when shared memory is read later in the kernel
 template <unsigned int TILE_ROWS, unsigned int TILE_COLS, unsigned int NUM_THREADS>
@@ -37,6 +37,7 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
     param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 //    constexpr unsigned int SWIZZLE_MASK = 0b111 << SWIZZLE_BITS;
 
 //    // reinterpret input/output as float4
@@ -117,6 +118,13 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
         }
         thread_row += ROW_STEP;
     }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src_stride);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }
 
@@ -131,6 +139,7 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
     param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     constexpr unsigned int SWIZZLE_MASK_1 = 0b10000;
     constexpr unsigned int SWIZZLE_BITS_1 = 4;
     constexpr unsigned int SWIZZLE_MASK_2 = 0b1100;
@@ -186,6 +195,13 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
         }
         thread_row += ROW_STEP;
     }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(inChannelOffset);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }
 
 template <unsigned int TILE_ROWS, unsigned int TILE_COLS, unsigned int NUM_THREADS, unsigned int ELEMENTS_PER_THREAD>
@@ ... @@ __device__ __forceinline__ void tileMemcpyLoadA(
     param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     // reinterpret input/output as float4
     // const float4* src_float4 = reinterpret_cast<const float4*>(src);
     // const unsigned int src_stride_vectorized = src_stride / 8;
@@ -251,6 +268,14 @@ __device__ __forceinline__ void tileMemcpyLoadA(
         }
         thread_row += ROW_STEP;
     }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst_reg);
+    GGML_UNUSED(block_k);
+    GGML_UNUSED(inChannelOffset);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }
 
@@ -266,6 +291,7 @@ __device__ __forceinline__ void tileMemcpyLoadB(
     param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     // reinterpret input/output as float4
     // const float4* src_float4 = reinterpret_cast<const float4*>(src);
     // const unsigned int src_stride_vectorized = src_stride / 8;
@@ -305,91 +331,18 @@ __device__ __forceinline__ void tileMemcpyLoadB(
         }
         thread_row += ROW_STEP;
     }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst_reg);
+    GGML_UNUSED(block_k);
+    GGML_UNUSED(src_stride);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }
 
-// template <unsigned int TILE_ROWS, unsigned int TILE_COLS, unsigned int NUM_THREADS, unsigned int ELEMENTS_PER_THREAD, unsigned int SWIZZLE_BITS>
-// __device__ __forceinline__ void tileMemcpySwizzleStoreB(
-//     float4 src_reg[ELEMENTS_PER_THREAD],
-//     half* dst
-// )
-// {
-//     constexpr unsigned int SWIZZLE_MASK = 0b111 << SWIZZLE_BITS;
-
-//     // reinterpret input/output as float4
-//     float4* dst_float4 = reinterpret_cast<float4*>(dst);
-
-//     // # of threads is multiple of # of columns in the tile
-//     constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;
-//     static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
-
-//     // flatten out 2d grid of threads into in order of increasing threadIdx.x
-//     const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
-
-//     // assign each thread a row/column in the tile, calculate how many iterations we need
-//     // to cover the whole tile
-//     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
-//     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-//     unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-//     const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
-
-//     // compile time check that we provided the right amount of registers for storage
-//     static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
-
-//     #pragma unroll
-//     for (unsigned int i = 0; i < NUM_ITERS; i++)
-//     {
-//         // apply swizzle to the dst index
-//         unsigned int dst_index = thread_row * TILE_COLS_VECTORIZED + thread_col;
-//         dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK) >> SWIZZLE_BITS);
-//         dst_float4[dst_index] = src_reg[i];
-//         thread_row += ROW_STEP;
-//     }
-// }
 
 // same as above but without the swizzle
-template <unsigned int TILE_ROWS, unsigned int TILE_COLS, unsigned int NUM_THREADS, unsigned int ELEMENTS_PER_THREAD>
-__device__ __forceinline__ void tileMemcpyStore(
-    float4 src_reg[ELEMENTS_PER_THREAD],
-    half* dst,
-    unsigned int dst_stride_float4
-)
-{
-    // reinterpret input/output as float4
-    float4* dst_float4 = reinterpret_cast<float4*>(dst);
-
-    // # of threads is multiple of # of columns in the tile
-    constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;
-    static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
-
-    // flatten out 2d grid of threads into in order of increasing threadIdx.x
-    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
-
-    // assign each thread a row/column in the tile, calculate how many iterations we need
-    // to cover the whole tile
-    constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
-    constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
-
-    // compile time check that we provided the right amount of registers for storage
-    static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
-
-    #pragma unroll
-    for (unsigned int i = 0; i < NUM_ITERS; i++)
-    {
-        // apply swizzle to the dst index
-        unsigned int dst_index = thread_row * dst_stride_float4 + thread_col;
-        dst_float4[dst_index] = src_reg[i];
-        thread_row += ROW_STEP;
-    }
-}
 
 // this is a special case of the above for when TILE_COLS == 32
 template <unsigned int TILE_ROWS, unsigned int NUM_THREADS, unsigned int ELEMENTS_PER_THREAD>
 __device__ __forceinline__ void tileMemcpySwizzleStore(
     float4 src_reg[ELEMENTS_PER_THREAD],
     half* dst
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     constexpr unsigned int SWIZZLE_MASK_1 = 0b10000;
     constexpr unsigned int SWIZZLE_BITS_1 = 4;
     constexpr unsigned int SWIZZLE_MASK_2 = 0b1100;
@@ -436,6 +390,11 @@ __device__ __forceinline__ void tileMemcpySwizzleStore(
         dst_float4[dst_index] = src_reg[i];
         thread_row += ROW_STEP;
     }
+#else
+    GGML_UNUSED(src_reg);
+    GGML_UNUSED(dst);
+    NO_DEVICE_CODE;
+#endif
 }
 
 __device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
@@ -450,8 +409,6 @@ __device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
     return address;
 }
 
-// #endif
-
 // constexpr unsigned int int_log2(unsigned int x)
 // {
 //     unsigned int result = 0;
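Both swizzle helpers in this header avoid shared-memory bank conflicts the same way: a few row bits are XORed into the float4 column index, so the columns of successive tile rows are rotated relative to each other. The single-level form kept in the comments of tileMemcpySwizzleB maps index i to i ^ ((i & SWIZZLE_MASK) >> SWIZZLE_BITS). A small host-only demonstration of that permutation follows; the constants assume a tile row of eight float4s (128 bytes) and are illustrative rather than the kernel's values:

    #include <cstdio>

    // One-level XOR swizzle: fold the row bits (selected by SWIZZLE_MASK)
    // into the low bits of the index.
    constexpr unsigned int SWIZZLE_BITS = 3;  // log2(8 float4s per row)
    constexpr unsigned int SWIZZLE_MASK = 0b111u << SWIZZLE_BITS;

    constexpr unsigned int swizzle(unsigned int i) {
        return i ^ ((i & SWIZZLE_MASK) >> SWIZZLE_BITS);
    }

    int main() {
        // Print the physical column each (row, col) lands in for an 8x8 tile.
        for (unsigned int row = 0; row < 8; ++row) {
            for (unsigned int col = 0; col < 8; ++col) {
                printf("%u ", swizzle(row * 8 + col) % 8);
            }
            printf("\n");
        }
        return 0;
    }

The printed table is a Latin square: every physical column appears exactly once per row and once per column, so reading one logical tile column touches eight different banks instead of replaying a single one. The two-level masks in tileMemcpySwizzleA and tileMemcpySwizzleStore (SWIZZLE_MASK_1 and SWIZZLE_MASK_2) apply the same idea twice at different bit positions.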
diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp
index 19d2826240..836bb10637 100644
--- a/tests/test-conv2d-implicit.cpp
+++ b/tests/test-conv2d-implicit.cpp
@@ -339,20 +339,20 @@ int main(void)
 {
     ggml_time_init();
 
     std::vector<std::tuple<int, int, int, int>> configs = {
-        // std::make_tuple(64,64,48,64),
-        // std::make_tuple(320,320,104,152),
-        // std::make_tuple(640,640,52,76),
-        // std::make_tuple(640,640,104,152),
-        // std::make_tuple(960,320,104,152),
-        std::make_tuple(128,1280,26,38),
-        // std::make_tuple(1280,640,52,76),
-        // std::make_tuple(1920,1280,26,38),
-        // std::make_tuple(2560,1280,26,38),
-        // std::make_tuple(512,512,104,152),
-        // std::make_tuple(512,512,208,304),
-        // std::make_tuple(512,256,416,608),
-        // std::make_tuple(256,128,832,1216),
-        // std::make_tuple(256,256,832,1216),
+        std::make_tuple(64,64,48,64),
+        std::make_tuple(320,320,104,152),
+        std::make_tuple(640,640,52,76),
+        std::make_tuple(640,640,104,152),
+        std::make_tuple(960,320,104,152),
+        std::make_tuple(1280,1280,26,38),
+        std::make_tuple(1280,640,52,76),
+        std::make_tuple(1920,1280,26,38),
+        std::make_tuple(2560,1280,26,38),
+        std::make_tuple(512,512,104,152),
+        std::make_tuple(512,512,208,304),
+        std::make_tuple(512,256,416,608),
+        std::make_tuple(256,128,832,1216),
+        std::make_tuple(256,256,832,1216),
         // std::make_tuple(320,256,1024,1920)
     };
@@ -375,7 +375,7 @@ int main(void)
 
     struct ggml_cgraph * gf_res_0 = NULL;
 
-    int iterations = 0;
+    int iterations = 20;
 
     double run_time0;
     std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
@@ -437,15 +437,15 @@ int main(void)
 
     // for(int i = 0; i < ggml_nelements(wino_res); i++) {
-    for(int i = 0; i < 26*38; i++) {
-        float diff = fabs(conv2d_data[i] - wino_data[i]);
-        // if(diff > 1.e-4) {
-        printf("(%f, %f, %f, %d) \n",
-               conv2d_data[i],
-               wino_data[i], diff, i);
-        //     break;
-        // }
-    }
+    // for(int i = 0; i < 26*38; i++) {
+    //     float diff = fabs(conv2d_data[i] - wino_data[i]);
+    //     // if(diff > 1.e-4) {
+    //     printf("(%f, %f, %f, %d) \n",
+    //            conv2d_data[i],
+    //            wino_data[i], diff, i);
+    //     //     break;
+    //     // }
+    // }
 
     ggml_free(model.ctx);
     ggml_backend_buffer_free(model.buffer);
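For reference, this is how one test tuple becomes an implicit GEMM inside the kernel. Reading the tuple as (C_in, C_out, H, W) and assuming a 3x3 filter with stride 1 and padding 1 (so Oh == H and Ow == W) are assumptions made for this sketch; only the K = c*r*s reduction size and the NCHW output offset are taken directly from conv2d_implicit_kernel above:

    #include <cstdio>

    int main() {
        // std::make_tuple(1280,1280,26,38), read as (C_in, C_out, H, W)
        // (an assumption about the test harness, not something the diff states).
        const unsigned c = 1280, k = 1280, h = 26, w = 38;
        const unsigned r = 3, s = 3;      // assumed 3x3 filter
        const unsigned Oh = h, Ow = w;    // assumed stride 1, padding 1

        // Per image, the kernel computes a k x (Oh*Ow) output matrix while
        // reducing over K = c*r*s, matching "const unsigned int K" in the kernel.
        printf("implicit GEMM: M=%u N=%u K=%u\n", k, Oh * Ow, c * r * s);

        // NCHW linearization used by the store loop:
        // outOffset = n*k*Oh*Ow + row*Oh*Ow + col
        const unsigned n = 0, row = 5, col = 7;
        printf("outOffset = %u\n", n * k * Oh * Ow + row * Oh * Ow + col);
        return 0;
    }

Under these assumptions the GEMM is 1280 x 988 with a reduction depth of 11520, the kind of shape the 256x256 block tile (BM_dim, BN_dim) in conv2d_implicit_cuda_f16 is sized for.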