reducing integer ops
This commit is contained in:
parent b4530b4f8b
commit ecbbdb6608
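The changes below do two things to the implicit-GEMM conv2d path. First, the per-thread tile coordinates thread_row and thread_col are now computed once in conv2d_implicit_kernel and passed as arguments to prepareIteratorA, tileMemcpySwizzleA/B, tileMemcpyLoadA/B and tileMemcpySwizzleStore, instead of each helper re-deriving them from thread_idx with a division and a modulo on every call inside the main loop. Second, param_t gains a cached CHW field (filled from IC*IW*IH on the host) so the iterator's address math reads a precomputed stride rather than multiplying param.c * param.h * param.w per element. A minimal sketch of the hoisting pattern, assuming a 64-half tile width and a placeholder copy_tile helper that stands in for the real tileMemcpy* functions:

// Sketch only: copy_tile_*, TILE_COLS and the one-float4-per-thread copy are illustrative,
// not the actual tileMemcpySwizzle*/tileMemcpyLoad* implementations.
#include <cuda_fp16.h>

constexpr unsigned int TILE_COLS            = 64;             // assumed tile width in halves
constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;  // one float4 = 8 halves

// Before: each helper re-derives its row/col (one integer div + one mod per call).
__device__ void copy_tile_before(const half* src, half* dst) {
    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
    const unsigned int row = thread_idx / TILE_COLS_VECTORIZED;
    const unsigned int col = thread_idx % TILE_COLS_VECTORIZED;
    reinterpret_cast<float4*>(dst)[row * TILE_COLS_VECTORIZED + col] =
        reinterpret_cast<const float4*>(src)[row * TILE_COLS_VECTORIZED + col];
}

// After: the kernel computes row/col once and passes them down.
__device__ void copy_tile_after(const half* __restrict__ src, half* __restrict__ dst,
                                unsigned int row, unsigned int col) {
    reinterpret_cast<float4*>(dst)[row * TILE_COLS_VECTORIZED + col] =
        reinterpret_cast<const float4*>(src)[row * TILE_COLS_VECTORIZED + col];
}

__global__ void demo_kernel(const half* __restrict__ src, half* __restrict__ dst, int iters) {
    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
    const unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;  // once per thread
    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
    for (int k = 0; k < iters; ++k) {   // the main loop calls the copy helpers repeatedly
        copy_tile_after(src, dst, thread_row, thread_col);
    }
}

The saved div/mod pair is small per call, but the helpers run for every BK-wide slice of the K loop, so the savings scale with the number of tile copies.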
@@ -822,6 +822,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
     const unsigned int warp_m = threadIdx.y;
     const unsigned int warp_n = threadIdx.x / 32;
     const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
+    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // double buffering
     extern __shared__ half shmem[];
@@ -871,7 +873,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
 
 
-    prepareIteratorA<BM, BK, A_K_STRID, ROW_STEP>(thread_idx, masks_a, element_offset_a, param);
+    prepareIteratorA<BM, BK, A_K_STRID, ROW_STEP>(thread_row, masks_a, element_offset_a, param);
 
     // for(int kk =0; kk < A_K_STRID; kk++){
     //     if(element_offset_a[kk] >= 327680)
@@ -894,8 +896,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
     const half* B_block_gmem = kernel + block_n * BN * param.weightKOffset;
 
     unsigned int curC = tileMemcpySwizzleA<BM, NUM_THREADS>(A_block_gmem, A_block_smem, 0, 0, masks_a, element_offset_a,
-                                                            thread_idx, start_k, end_k, param);
-    tileMemcpySwizzleB<BN, NUM_THREADS>(B_block_gmem, B_block_smem, 0, 0, start_k, end_k, param);
+                                                            thread_row, thread_col, start_k, end_k, param);
+    tileMemcpySwizzleB<BN, NUM_THREADS>(B_block_gmem, B_block_smem, 0, 0, start_k, end_k, thread_row, thread_col, param);
 
     int offset_direction = 1;
     unsigned int block_k = 0;
@@ -947,9 +949,10 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
         // if (block_k != num_block_tiles_k){
         if (block_krs != num_block_tiles_krs){
             curC = tileMemcpyLoadA<BM, BK, NUM_THREADS, 4>(A_block_gmem, A_gmem_cache_reg, r, s,
-                                                           masks_a, element_offset_a, thread_idx, block_k * BK,
+                                                           masks_a, element_offset_a, thread_row, thread_col, block_k * BK,
                                                            start_k, end_k, curC, param);
-            tileMemcpyLoadB<BN, BK, NUM_THREADS, 4>(B_block_gmem, B_gmem_cache_reg, r, s, block_k * BK, start_k, end_k, param);
+            tileMemcpyLoadB<BN, BK, NUM_THREADS, 4>(B_block_gmem, B_gmem_cache_reg, r, s, block_k * BK,
+                                                    start_k, end_k, thread_row, thread_col, param);
         }
 
         half* A_warp_tile = A_block_smem + A_warp_tile_offset;
@@ -1002,8 +1005,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
             B_block_smem = B_block_smem + BUFFER_SIZE * offset_direction;
             offset_direction = -1 * offset_direction;
 
-            tileMemcpySwizzleStore<BM, NUM_THREADS, 4>(A_gmem_cache_reg, A_block_smem);
-            tileMemcpySwizzleStore<BN, NUM_THREADS, 4>(B_gmem_cache_reg, B_block_smem);
+            tileMemcpySwizzleStore<BM, NUM_THREADS, 4>(A_gmem_cache_reg, A_block_smem, thread_row, thread_col);
+            tileMemcpySwizzleStore<BN, NUM_THREADS, 4>(B_gmem_cache_reg, B_block_smem, thread_row, thread_col);
         }
 
         block_krs++;
@@ -1413,7 +1416,8 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
                       IC*KW*KH,
                       OW*OH,
                       OC*OW*OH,
-                      B*OC*OW*OH};
+                      B*OC*OW*OH,
+                      IC*IW*IH};
 
     if (kernel->type == GGML_TYPE_F16) {
         conv2d_implicit_cuda_f16(ctx, X_D, (half *) K_D, Y_D, cc, params, st);
@@ -30,7 +30,7 @@ typedef struct{
     unsigned int PQ;
     unsigned int KPQ;
     unsigned int NKPQ;
-
+    unsigned int CHW;
 } param_t;
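The new CHW field caches the per-image input stride (C·H·W in elements; the launch code above now appends IC*IW*IH to the params initializer to fill it). Together with the existing inChannelOffset field, which the diff substitutes for param.c * param.w, the element-offset computation in prepareIteratorA becomes two multiplies by cached strides instead of recomputing the products per element. A hedged sketch of the idea, using a trimmed-down stand-in struct rather than the real param_t, and assuming the NHWC-style addressing visible in the element_offset line (n*CHW + h*(C*W) + w*C):

// Sketch only: demo_param_t is a stand-in; the real param_t carries many more fields.
#include <cstdint>

typedef struct {
    unsigned int c, h, w;            // input channels / height / width
    unsigned int inChannelOffset;    // c * w, precomputed on the host
    unsigned int CHW;                // c * h * w, precomputed on the host
} demo_param_t;

// Host side: fill the cached products once per launch instead of per thread.
static demo_param_t make_params(unsigned int c, unsigned int h, unsigned int w) {
    demo_param_t p;
    p.c = c; p.h = h; p.w = w;
    p.inChannelOffset = c * w;
    p.CHW = c * h * w;
    return p;
}

// Device side: the offset math reads cached strides, no per-element c*h*w multiply.
__device__ int64_t element_offset(const demo_param_t p, int n, int hh, int ww) {
    return n * (int64_t)p.CHW + hh * (int64_t)p.inChannelOffset + ww * (int64_t)p.c;
}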
@@ -58,7 +58,7 @@ template<const unsigned int TILE_ROWS,
          const unsigned int TILE_COLS,
          const unsigned int A_K_STRID,
          const unsigned int ROW_STEP>
-__device__ void prepareIteratorA(const int thread_idx,
+__device__ void prepareIteratorA(unsigned int thread_row,
                                  unsigned int masks[][2],
                                  int64_t element_offset[],
                                  const param_t param){
@@ -67,8 +67,8 @@ __device__ void prepareIteratorA(const int thread_idx,
     int offset_q[A_K_STRID];
 
     constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int chw = param.c * param.h * param.w;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int chw = param.c * param.h * param.w;
 
 #pragma unroll
     for (int s = 0; s < A_K_STRID; ++s) {
@@ -91,7 +91,7 @@ __device__ void prepareIteratorA(const int thread_idx,
         //           offset_npq, offset_n[s], offset_p[s], offset_q[s], AccessType::kElements,
         //           ThreadMap::Iterations::kContiguous);
 
-        element_offset[s] = offset_n[s] * (int64_t)chw + h * (int64_t)(param.c * param.w) + w * (int64_t)param.c;
+        element_offset[s] = offset_n[s] * (int64_t)param.CHW + h * (int64_t)(param.inChannelOffset) + w * (int64_t)param.c;
 
         // if(element_offset[s] >= 327680)
         //     printf("(%d, %d, %d, %d, %d), %d, %lld, %d, %d, %d, %d, %d, %u, %u, %u \n",
@@ -126,12 +126,14 @@ __device__ void prepareIteratorA(const int thread_idx,
 template<unsigned int TILE_ROWS,
          unsigned int NUM_THREADS>
 __device__ __forceinline__ void tileMemcpySwizzleB(
-    const half* src,
-    half* dst,
+    const half* __restrict__ src,
+    half* __restrict__ dst,
     const unsigned int curR,
     const unsigned int curS,
     const unsigned int start_k,
     const unsigned int end_k,
+    unsigned int thread_row,
+    const unsigned int thread_col,
     // const unsigned int src_stride,
     param_t param
 ){
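Alongside the new thread_row/thread_col parameters, the pointer arguments of the copy helpers gain __restrict__. That is a no-aliasing promise to the compiler: the global-memory source and the shared-memory destination are guaranteed not to overlap, which gives nvcc more freedom to reorder and batch the loads and stores. A one-function illustration (the 8-half float4 copy is an assumption, not code taken from the helpers):

// Sketch: with __restrict__, the compiler may assume src and dst never alias, so the
// read and the write can be scheduled independently; pointers assumed 16-byte aligned.
#include <cuda_fp16.h>

__device__ void copy8(const half* __restrict__ src, half* __restrict__ dst) {
    *reinterpret_cast<float4*>(dst) = *reinterpret_cast<const float4*>(src);
}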
@@ -149,14 +151,14 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
     constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;
     static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
     // flatten out 2d grid of threads into in order of increasing threadIdx.x
-    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
+    // const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
 
     // assign each thread a row/column in the tile, calculate how many iterations we need
     // to cover the whole tile
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // const unsigned int ki = (curR*param.s+curS)*param.c + start_k+thread_col*8;
     // const unsigned int curR = fastdiv(ki, param.SC_fastdiv); // channel offset
@@ -193,13 +195,14 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
 template<unsigned int TILE_ROWS,
          unsigned int NUM_THREADS>
 __device__ __forceinline__ unsigned int tileMemcpySwizzleA(
-    const half* src,
-    half* dst,
+    const half* __restrict__ src,
+    half* __restrict__ dst,
     const unsigned int curR,
     const unsigned int curS,
     unsigned int masks[][2],
     const int64_t element_offset[],
-    const unsigned int thread_idx,
+    unsigned int thread_row,
+    const unsigned int thread_col,
     const unsigned int start_k,
     const unsigned int end_k,
     param_t param
@@ -225,8 +228,8 @@ __device__ __forceinline__ unsigned int tileMemcpySwizzleA(
     // to cover the whole tile
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // const unsigned int ki = start_k+thread_col*8;
     // const unsigned int chw = param.c * param.h * param.w;
@@ -295,13 +298,14 @@ unsigned int TILE_COLS,
 unsigned int NUM_THREADS,
 unsigned int ELEMENTS_PER_THREAD>
 __device__ __forceinline__ unsigned int tileMemcpyLoadA(
-    const half* src,
+    const half* __restrict__ src,
     float4 (&dst_reg)[ELEMENTS_PER_THREAD],
     const unsigned int curR,
     const unsigned int curS,
     unsigned int masks[][2],
     const int64_t element_offset[],
-    const unsigned int thread_idx,
+    unsigned int thread_row,
+    const unsigned int thread_col,
     const unsigned int block_k,
     const unsigned int start_k,
     const unsigned int end_k,
@@ -320,8 +324,8 @@ __device__ __forceinline__ unsigned int tileMemcpyLoadA(
     // to cover the whole tile
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // compile time check that we provided the right amount of registers for storage
     static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
@@ -395,13 +399,15 @@ unsigned int TILE_COLS,
 unsigned int NUM_THREADS,
 unsigned int ELEMENTS_PER_THREAD>
 __device__ __forceinline__ void tileMemcpyLoadB(
-    const half* src,
+    const half* __restrict__ src,
     float4 (&dst_reg)[ELEMENTS_PER_THREAD],
     const unsigned int curR,
     const unsigned int curS,
     const unsigned int block_k,
     const unsigned int start_k,
     const unsigned int end_k,
+    unsigned int thread_row,
+    const unsigned int thread_col,
     // const unsigned int src_stride,
     param_t param
 ){
@@ -412,14 +418,14 @@ __device__ __forceinline__ void tileMemcpyLoadB(
     static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
 
     // flatten out 2d grid of threads into in order of increasing threadIdx.x
-    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
+    // const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
 
     // assign each thread a row/column in the tile, calculate how many iterations we need
     // to cover the whole tile
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // compile time check that we provided the right amount of registers for storage
     static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
@@ -459,7 +465,9 @@ unsigned int NUM_THREADS,
 unsigned int ELEMENTS_PER_THREAD>
 __device__ __forceinline__ void tileMemcpySwizzleStore(
     const float4 (&src_reg)[ELEMENTS_PER_THREAD],
-    half* dst
+    half* __restrict__ dst,
+    unsigned int thread_row,
+    const unsigned int thread_col
 )
 {
 #if __CUDA_ARCH__ >= GGML_CUDA_TURING
@@ -478,14 +486,14 @@ __device__ __forceinline__ void tileMemcpySwizzleStore(
     static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
 
     // flatten out 2d grid of threads into in order of increasing threadIdx.x
-    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
+    // const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
 
     // assign each thread a row/column in the tile, calculate how many iterations we need
     // to cover the whole tile
     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
+    // unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
+    // const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
 
     // compile time check that we provided the right amount of registers for storage
     static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);