From ecbbdb6608b6c9fbd107fbf0153e16be3c0b5176 Mon Sep 17 00:00:00 2001
From: bssrdf
Date: Fri, 14 Nov 2025 13:05:31 -0500
Subject: [PATCH] reducing integer ops

---
 ggml/src/ggml-cuda/conv2d-implicit.cu  | 20 +++++----
 ggml/src/ggml-cuda/conv2d-implicit.cuh | 62 +++++++++++++++-----------
 2 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index 57cd116d73..d204807a2f 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -822,6 +822,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
     const unsigned int warp_m = threadIdx.y;
     const unsigned int warp_n = threadIdx.x / 32;
     const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
+    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // double buffering
     extern __shared__ half shmem[];
@@ -871,7 +873,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
 
 
 
-    prepareIteratorA(thread_idx, masks_a, element_offset_a, param);
+    prepareIteratorA(thread_row, masks_a, element_offset_a, param);
 
     // for(int kk =0; kk < A_K_STRID; kk++){
     //     if(element_offset_a[kk] >= 327680)
@@ -894,8 +896,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
     const half* B_block_gmem = kernel + block_n * BN * param.weightKOffset;
 
     unsigned int curC = tileMemcpySwizzleA(A_block_gmem, A_block_smem, 0, 0, masks_a, element_offset_a,
-                                           thread_idx, start_k, end_k, param);
-    tileMemcpySwizzleB(B_block_gmem, B_block_smem, 0, 0, start_k, end_k, param);
+                                           thread_row, thread_col, start_k, end_k, param);
+    tileMemcpySwizzleB(B_block_gmem, B_block_smem, 0, 0, start_k, end_k, thread_row, thread_col, param);
 
     int offset_direction = 1;
     unsigned int block_k = 0;
@@ -947,9 +949,10 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
         // if (block_k != num_block_tiles_k){
         if (block_krs != num_block_tiles_krs){
             curC = tileMemcpyLoadA(A_block_gmem, A_gmem_cache_reg, r, s,
-                                   masks_a, element_offset_a, thread_idx, block_k * BK,
+                                   masks_a, element_offset_a, thread_row, thread_col, block_k * BK,
                                    start_k, end_k, curC, param);
-            tileMemcpyLoadB(B_block_gmem, B_gmem_cache_reg, r, s, block_k * BK, start_k, end_k, param);
+            tileMemcpyLoadB(B_block_gmem, B_gmem_cache_reg, r, s, block_k * BK,
+                            start_k, end_k, thread_row, thread_col, param);
         }
 
         half* A_warp_tile = A_block_smem + A_warp_tile_offset;
@@ -1002,8 +1005,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
             B_block_smem = B_block_smem + BUFFER_SIZE * offset_direction;
             offset_direction = -1 * offset_direction;
 
-            tileMemcpySwizzleStore(A_gmem_cache_reg, A_block_smem);
-            tileMemcpySwizzleStore(B_gmem_cache_reg, B_block_smem);
+            tileMemcpySwizzleStore(A_gmem_cache_reg, A_block_smem, thread_row, thread_col);
+            tileMemcpySwizzleStore(B_gmem_cache_reg, B_block_smem, thread_row, thread_col);
         }
 
         block_krs++;
@@ -1413,7 +1416,8 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
                         IC*KW*KH,
                         OW*OH,
                         OC*OW*OH,
-                        B*OC*OW*OH};
+                        B*OC*OW*OH,
+                        IC*IW*IH};
 
     if (kernel->type == GGML_TYPE_F16) {
         conv2d_implicit_cuda_f16(ctx, X_D, (half *) K_D, Y_D, cc, params, st);
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cuh b/ggml/src/ggml-cuda/conv2d-implicit.cuh
index 40b1c7babe..9f817a0078 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh
@@ -30,7 +30,7 @@ typedef struct{
     unsigned int PQ;
     unsigned int KPQ;
     unsigned int NKPQ;
-
+    unsigned int CHW;
 } param_t;
 
 
@@ -58,7 +58,7 @@
 
 
 template
-__device__ void prepareIteratorA(const int thread_idx,
+__device__ void prepareIteratorA(unsigned int thread_row,
                                  unsigned int masks[][2],
                                  int64_t element_offset[],
                                  const param_t param){
@@ -67,8 +67,8 @@ __device__ void prepareIteratorA(const int thread_idx,
     int offset_q[A_K_STRID];
 
     constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int chw = param.c * param.h * param.w;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int chw = param.c * param.h * param.w;
 
 #pragma unroll
     for (int s = 0; s < A_K_STRID; ++s) {
@@ -91,7 +91,7 @@ __device__ void prepareIteratorA(const int thread_idx,
         //          offset_npq, offset_n[s], offset_p[s], offset_q[s], AccessType::kElements,
         //          ThreadMap::Iterations::kContiguous);
 
-        element_offset[s] = offset_n[s] * (int64_t)chw + h * (int64_t)(param.c * param.w) + w * (int64_t)param.c;
+        element_offset[s] = offset_n[s] * (int64_t)param.CHW + h * (int64_t)(param.inChannelOffset) + w * (int64_t)param.c;
 
         // if(element_offset[s] >= 327680)
         //    printf("(%d, %d, %d, %d, %d), %d, %lld, %d, %d, %d, %d, %d, %u, %u, %u \n",
@@ -126,12 +126,14 @@ __device__ void prepareIteratorA(const int thread_idx,
 
 template
 __device__ __forceinline__ void tileMemcpySwizzleB(
-    const half* src,
-    half* dst,
+    const half* __restrict__ src,
+    half* __restrict__ dst,
     const unsigned int curR,
     const unsigned int curS,
     const unsigned int start_k,
     const unsigned int end_k,
+    unsigned int thread_row,
+    const unsigned int thread_col,
     // const unsigned int src_stride,
     param_t param
 ){
@@ -149,14 +151,14 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
     constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;
     static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
     // flatten out 2d grid of threads into in order of increasing threadIdx.x
-    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
+    // const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
 
     // assign each thread a row/column in the tile, calculate how many iterations we need
     // to cover the whole tile
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // const unsigned int ki = (curR*param.s+curS)*param.c + start_k+thread_col*8;
     // const unsigned int curR = fastdiv(ki, param.SC_fastdiv);  // channel offset
@@ -193,13 +195,14 @@
 
 template
 __device__ __forceinline__ unsigned int tileMemcpySwizzleA(
-    const half* src,
-    half* dst,
+    const half* __restrict__ src,
+    half* __restrict__ dst,
     const unsigned int curR,
     const unsigned int curS,
     unsigned int masks[][2],
     const int64_t element_offset[],
-    const unsigned int thread_idx,
+    unsigned int thread_row,
+    const unsigned int thread_col,
     const unsigned int start_k,
     const unsigned int end_k,
     param_t param
@@ -225,8 +228,8 @@ __device__ __forceinline__ unsigned int tileMemcpySwizzleA(
     // to cover the whole tile
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // const unsigned int ki = start_k+thread_col*8;
     // const unsigned int chw = param.c * param.h * param.w;
@@ -295,13 +298,14 @@ unsigned int TILE_COLS,
 unsigned int NUM_THREADS,
 unsigned int ELEMENTS_PER_THREAD>
 __device__ __forceinline__ unsigned int tileMemcpyLoadA(
-    const half* src,
+    const half* __restrict__ src,
     float4 (&dst_reg)[ELEMENTS_PER_THREAD],
     const unsigned int curR,
     const unsigned int curS,
     unsigned int masks[][2],
     const int64_t element_offset[],
-    const unsigned int thread_idx,
+    unsigned int thread_row,
+    const unsigned int thread_col,
     const unsigned int block_k,
     const unsigned int start_k,
     const unsigned int end_k,
@@ -320,8 +324,8 @@ __device__ __forceinline__ unsigned int tileMemcpyLoadA(
     // to cover the whole tile
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // compile time check that we provided the right amount of registers for storage
     static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
@@ -395,13 +399,15 @@ unsigned int TILE_COLS,
 unsigned int NUM_THREADS,
 unsigned int ELEMENTS_PER_THREAD>
 __device__ __forceinline__ void tileMemcpyLoadB(
-    const half* src,
+    const half* __restrict__ src,
     float4 (&dst_reg)[ELEMENTS_PER_THREAD],
     const unsigned int curR,
     const unsigned int curS,
     const unsigned int block_k,
     const unsigned int start_k,
     const unsigned int end_k,
+    unsigned int thread_row,
+    const unsigned int thread_col,
     // const unsigned int src_stride,
     param_t param
 ){
@@ -412,14 +418,14 @@ __device__ __forceinline__ void tileMemcpyLoadB(
     static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
 
     // flatten out 2d grid of threads into in order of increasing threadIdx.x
-    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
+    // const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
 
     // assign each thread a row/column in the tile, calculate how many iterations we need
     // to cover the whole tile
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // compile time check that we provided the right amount of registers for storage
     static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
@@ -459,7 +465,9 @@ unsigned int NUM_THREADS,
 unsigned int ELEMENTS_PER_THREAD>
 __device__ __forceinline__ void tileMemcpySwizzleStore(
     const float4 (&src_reg)[ELEMENTS_PER_THREAD],
-    half* dst
+    half* __restrict__ dst,
+    unsigned int thread_row,
+    const unsigned int thread_col
 ) {
 
 #if __CUDA_ARCH__ >= GGML_CUDA_TURING
@@ -478,14 +486,14 @@ __device__ __forceinline__ void tileMemcpySwizzleStore(
     static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
 
     // flatten out 2d grid of threads into in order of increasing threadIdx.x
-    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
+    // const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
 
     // assign each thread a row/column in the tile, calculate how many iterations we need
     // to cover the whole tile
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // compile time check that we provided the right amount of registers for storage
     static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
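
Note for reviewers (not part of the patch): the change is mechanical. conv2d_implicit_kernel now computes thread_row = thread_idx / TILE_COLS_VECTORIZED and thread_col = thread_idx % TILE_COLS_VECTORIZED once per thread and passes them into the tile-copy helpers (tileMemcpySwizzleA/B, tileMemcpyLoadA/B, tileMemcpySwizzleStore), instead of each helper re-deriving them from the flat thread index inside the main loop; prepareIteratorA likewise reads the precomputed param.CHW (set to IC*IW*IH on the host) and param.inChannelOffset rather than multiplying param.c * param.h * param.w per element. The sketch below is only an illustration of this hoisting pattern under assumed names (copy_tile_old, copy_tile_new, demo_kernel) and an assumed tile width; it is not the ggml code.

    #include <cuda_fp16.h>

    // Illustrative sketch only -- not the ggml kernels.
    constexpr unsigned int TILE_COLS_VECTORIZED = 16;   // assumed tile width (in float4-sized columns)

    // Before: each helper re-derives its row/column from the flat thread index,
    // paying one integer division and one modulo per call, and the helpers are
    // invoked repeatedly from the main loop of the kernel.
    __device__ __forceinline__ void copy_tile_old(half * dst, const half * src,
                                                  unsigned int thread_idx) {
        const unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
        const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
        dst[thread_row * TILE_COLS_VECTORIZED + thread_col] =
            src[thread_row * TILE_COLS_VECTORIZED + thread_col];
    }

    // After: the caller passes the precomputed row/column, so the helper body
    // contains no integer division or modulo at all.
    __device__ __forceinline__ void copy_tile_new(half * dst, const half * src,
                                                  unsigned int thread_row,
                                                  unsigned int thread_col) {
        dst[thread_row * TILE_COLS_VECTORIZED + thread_col] =
            src[thread_row * TILE_COLS_VECTORIZED + thread_col];
    }

    __global__ void demo_kernel(half * dst, const half * src, int num_tiles) {
        const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
        // hoisted: one division and one modulo per thread for the whole kernel
        const unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
        const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;

        for (int k = 0; k < num_tiles; ++k) {   // stands in for the k/r/s loop
            copy_tile_new(dst + k * 256, src + k * 256, thread_row, thread_col);
        }
    }

Caching IC*IW*IH in the new param_t::CHW field follows the same idea on the host side: a loop-invariant product is computed once when the parameters are filled in, instead of per offset on the device.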