diff --git a/ggml/src/ggml-cuda/conv3d-implicit.cu b/ggml/src/ggml-cuda/conv3d-implicit.cu
index 76f887972a..df5ed4578a 100644
--- a/ggml/src/ggml-cuda/conv3d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv3d-implicit.cu
@@ -163,7 +163,7 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
         const uint inKOffset = start_k + innerColA * 4;
 #pragma unroll
         for (uint offset = 0; offset + rowStrideA <= BM; offset += rowStrideA) {
-            const unsigned int gemm_i = bx * BM + innerRowA + offset; 
+            const unsigned int gemm_i = bx * BM + innerRowA + offset;
             // int n = (ksplit > 0) ? (bx * BM + innerRowA + offset) / PQZ : z;
             int n = (ksplit > 0) ? fastdiv(gemm_i, param.PQZ_fastdiv) : z;
             const unsigned int npqz_res = fastmodulo(gemm_i, param.PQZ_fastdiv);
@@ -173,26 +173,7 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
             const int posw_ori = fastmodulo(ohow_res, param.OW_fastdiv) * param.stride0 - param.padding0;
             int inOffset = n * inNOffset;
             if(vec_load){
-                // const uint cur0 = fastdiv(inKOffset,
-                //                     layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
-                // const uint cur0_res = fastmodulo(inKOffset,
-                //                     layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
-                // const uint cur1 = fastdiv(cur0_res,
-                //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
-                // const uint cur1_res = fastmodulo(cur0_res,
-                //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
-                // const uint cur2 = fastdiv(cur1_res,
-                //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                // const uint cur3 = fastmodulo(cur1_res,
-                //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                // const uint curC = layout == 0 ? cur3 : cur0;
-                // const uint curT = layout == 0 ? cur0 : cur1;
-                // const uint curR = layout == 0 ? cur1 : cur2;
-                // const uint curS = layout == 0 ? cur2 : cur3;
                 const int4 curIdx = inputIndices<layout>(inKOffset, param);
-                // const int curD = posd_ori + curT * param.dilation2; // input w
-                // const int curH = posh_ori + curR * param.dilation1; // input h
-                // const int curW = posw_ori + curS * param.dilation0; // input w
                 const int curD = posd_ori + curIdx.y * param.dilation2; // input w
                 const int curH = posh_ori + curIdx.z * param.dilation1; // input h
                 const int curW = posw_ori + curIdx.w * param.dilation0; // input w
@@ -214,43 +195,11 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
             } else {
 #pragma unroll
                 for (int i = 0; i < 4; ++i){
-                    // const uint cur0 = fastdiv(inKOffset + i,
-                    //                     layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
-                    // const uint cur0_res = fastmodulo(inKOffset + i,
-                    //                     layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
-                    // const uint cur1 = fastdiv(cur0_res,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
-                    // const uint cur1_res = fastmodulo(cur0_res,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
-                    // const uint cur2 = fastdiv(cur1_res,
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint cur3 = fastmodulo(cur1_res,
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint curC = layout == 0 ? cur3 : cur0;
-                    // const uint curT = layout == 0 ? cur0 : cur1;
-                    // const uint curR = layout == 0 ? cur1 : cur2;
-                    // const uint curS = layout == 0 ? cur2 : cur3;
                     const int4 curIdx = inputIndices<layout>(inKOffset + i, param);
-                    // const int curD = posd_ori + curT * param.dilation2; // input w
-                    // const int curH = posh_ori + curR * param.dilation1; // input h
-                    // const int curW = posw_ori + curS * param.dilation0; // input w
                     const int curD = posd_ori + curIdx.y * param.dilation2; // input w
                     const int curH = posh_ori + curIdx.z * param.dilation1; // input h
                     const int curW = posw_ori + curIdx.w * param.dilation0; // input w
                     const int curC = curIdx.x;
-                    // const uint cur0 = fastdiv(start_k + innerColA * 4 + i,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // channel offset
-                    // const uint cur1 = fastdiv(fastmodulo(start_k + innerColA * 4 + i,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint cur2 = fastmodulo(fastmodulo(start_k + innerColA * 4 + i,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint curC = layout == 0 ? cur2 : cur0;
-                    // const uint curR = layout == 0 ? cur0 : cur1;
-                    // const uint curS = layout == 0 ? cur1 : cur2;
-                    // const int curH = posh_ori + curR * param.d_h; // input h
-                    // const int curW = posw_ori + curS * param.d_w; // input w
                     if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && inKOffset + i < end_k){
                         int inOffsetTmp = layout == 0 ?
                                 curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC:
                                 curC * inDepthOffset + curD * inChannelOffset + curH * param.w + curW;
@@ -360,12 +309,7 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
             const uint inKkOffset = innerColA * 4 + crs + BK;
 #pragma unroll
             for (uint offset = 0; offset + rowStrideA <= BM; offset += rowStrideA) {
-                // int n = (ksplit > 0) ? (bx * BM + innerRowA + offset) / PQ : z;
-                // const unsigned int npq_res = (bx * BM + innerRowA + offset) % PQ;
-                // const int posh_ori = fastdiv((ksplit > 0) ? npq_res: bx * BM + innerRowA + offset, param.OW_fastdiv) * param.u - param.p;
-                // const int posw_ori = fastmodulo((ksplit > 0) ? npq_res: bx * BM + innerRowA + offset, param.OW_fastdiv) * param.v - param.q;
-                // int inOffset = n * param.c * param.h * param.w ;
-                const unsigned int gemm_i = bx * BM + innerRowA + offset; 
+                const unsigned int gemm_i = bx * BM + innerRowA + offset;
                 int n = (ksplit > 0) ? fastdiv(gemm_i, param.PQZ_fastdiv) : z;
                 const unsigned int npqz_res = fastmodulo(gemm_i, param.PQZ_fastdiv);
                 const int posd_ori = fastdiv((ksplit > 0) ? npqz_res: gemm_i, param.OHOW_fastdiv) * param.stride2 - param.padding2;
@@ -379,28 +323,10 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
                     const int curH = posh_ori + curIdx.z * param.dilation1; // input h
                     const int curW = posw_ori + curIdx.w * param.dilation0; // input w
                     const int curC = curIdx.x;
-                    // const uint cur0 = fastdiv(innerColA * 4 + crs + BK,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // channel offset
-                    // const uint cur1 = fastdiv(fastmodulo(innerColA * 4 + crs + BK,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint cur2 = fastmodulo(fastmodulo(innerColA * 4 + crs + BK,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint curC = layout == 0 ? cur2 : cur0;
-                    // const uint curR = layout == 0 ? cur0 : cur1;
-                    // const uint curS = layout == 0 ? cur1 : cur2;
-
-                    // const int curH = posh_ori + curR * param.d_h; // input h
-                    // const int curW = posw_ori + curS * param.d_w; // input w
                     if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && inKkOffset < end_k){
                         int inOffsetTmp = layout == 0 ?
                                 curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC:
                                 curC * inDepthOffset + curD * inChannelOffset + curH * param.w + curW;
-                        // if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h && inKkOffset < end_k){
-                        // int inOffsetTmp = layout == 0 ?
-                        //         curH * inChannelOffset + curW * param.c + curC:
-                        //         curC * inChannelOffset + curH * param.w + curW;
                         float4 tmp = reinterpret_cast<const float4 *>(&input[inOffset + inOffsetTmp])[0];
                         smeminput[write_flag * (BM+PAD) * BK + input_sts_addr + offset + 0] = tmp.x;
                         smeminput[write_flag * (BM+PAD) * BK + input_sts_addr + offset + BM+PAD] = tmp.y;
@@ -414,29 +340,11 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
                 } else {
 #pragma unroll
                     for (int i = 0; i < 4; ++i){
-                        // const uint cur0 = fastdiv(innerColA * 4 + crs + BK + i,
-                        //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // channel offset
-                        // const uint cur1 = fastdiv(fastmodulo(innerColA * 4 + crs + BK + i,
-                        //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                        //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                        // const uint cur2 = fastmodulo(fastmodulo(innerColA * 4 + crs + BK + i,
-                        //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                        //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                        // const uint curC = layout == 0 ? cur2 : cur0;
-                        // const uint curR = layout == 0 ? cur0 : cur1;
-                        // const uint curS = layout == 0 ? cur1 : cur2;
-
-                        // const int curH = posh_ori + curR * param.d_h; // input h
-                        // const int curW = posw_ori + curS * param.d_w; // input w
                         const int4 curIdx = inputIndices<layout>(inKkOffset + i, param);
                         const int curD = posd_ori + curIdx.y * param.dilation2; // input w
                         const int curH = posh_ori + curIdx.z * param.dilation1; // input h
                         const int curW = posw_ori + curIdx.w * param.dilation0; // input w
                         const int curC = curIdx.x;
-                        // if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h && innerColA * 4 + crs + BK + i < end_k){
-                        // int inOffsetTmp = layout == 0 ?
-                        //         curH * inChannelOffset + curW * param.c + curC:
-                        //         curC * inChannelOffset + curH * param.w + curW;
                         if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && inKkOffset + i < end_k){
                             int inOffsetTmp = layout == 0 ?
                                     curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC:
@@ -521,7 +429,6 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
             const int col = (ksplit > 0) ? fastmodulo(gemm_i, param.PQZ_fastdiv) : gemm_i;
             if (n < param.n && row < param.k && col < PQZ){
                 const uint outOffset = ksplit > 0 ?
-                        // z * param.n * param.k * PQZ + n * param.k * PQZ + row * PQZ + col :
                         ((z * param.n + n) * param.k + row) * PQZ + col :
                         (z * param.k + row) * PQZ + col;
                 output[outOffset] = smemoutput[output_lds_addr + subk * WARPSIZE];
@@ -790,7 +697,7 @@ static __global__ void conv3d_implicit_kernel(const half * __restrict__ input,
     const unsigned int K = param.c * param.r * param.s * param.t;
     const uint weightKOffset = K; //param.c * param.r * param.s * param.t;
     const uint inChannelOffset = param.c * param.w;
-    const uint inDepthOffset = param.h * param.c * param.w; 
+    const uint inDepthOffset = param.h * param.c * param.w;
     const uint inNOffset = param.c * param.w * param.h * param.d;
 
     // loop bounds, constexpr where possible allows for loop unrolling
@@ -854,7 +761,7 @@ static __global__ void conv3d_implicit_kernel(const half * __restrict__ input,
         if (block_k != num_block_tiles_k){
             const half* A_block_gmem = input;
             const half* B_block_gmem = kernel + (block_n * BN * weightKOffset);
-            tileMemcpyLoadA(A_block_gmem, A_gmem_cache_reg, block_k * BK, 
+            tileMemcpyLoadA(A_block_gmem, A_gmem_cache_reg, block_k * BK,
                             inNOffset, inDepthOffset, inChannelOffset, param);
             tileMemcpyLoadB(B_block_gmem, B_gmem_cache_reg, block_k * BK, weightKOffset, param);
         }
@@ -935,12 +842,9 @@ static __global__ void conv3d_implicit_kernel(const half * __restrict__ input,
             for (int j = 0; j < 4; ++j){
                 const uint row = m_idx + subk + i * WN / 2;
                 const uint gemm_i = n_idx + j*32;
-                // const int n = fastdiv(gemm_i, param.OHOW_fastdiv);
-                // const int col = fastmodulo(gemm_i, param.OHOW_fastdiv);
                 const int n = fastdiv(gemm_i, param.PQZ_fastdiv);
                 const int col = fastmodulo(gemm_i, param.PQZ_fastdiv);
                 if(n < param.n && row < param.k && col < PQZ){
-                    // const uint outOffset = n * param.k * param.Oh * param.Ow + row * param.Oh * param.Ow + col;
                     const uint outOffset = (n * param.k + row) * PQZ + col;
                     uint idx = output_lds_addr + subk + j*32*BN/2;
                     idx = idx ^ ((idx & 0b1110000000) >> 4);
@@ -1109,19 +1013,15 @@ void ggml_cuda_op_conv3d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
     const uint KW = kernel->ne[0]; // kernel_w
     const uint KH = kernel->ne[1]; // kernel_h
     const uint KD = kernel->ne[2]; // kernel_h
-    // const uint IC = input->ne[2]; // input_channels
-    // const uint OC = kernel->ne[3]; // ouptut_chanles
-    // const uint B = input->ne[3]; // n_batches
-
-    param_t params = { B, 
-                       IC, 
+    param_t params = { B,
+                       IC,
                        IH, IW, ID,
-                       OC, 
+                       OC,
                        KH, KW, KD,
-                       ST_Y, ST_X, ST_Z,
-                       PD_Y, PD_X, PD_Z,
-                       DL_Y, DL_X, DL_Z,
+                       ST_X, ST_Y, ST_Z,
+                       PD_X, PD_Y, PD_Z,
+                       DL_X, DL_Y, DL_Z,
                        OH, OW, OD,
                        init_fastdiv_values(KW*IC),
                        init_fastdiv_values(OW),
diff --git a/ggml/src/ggml-cuda/conv3d-implicit.cuh b/ggml/src/ggml-cuda/conv3d-implicit.cuh
index 04fc9109ed..9cd7fe4e9b 100644
--- a/ggml/src/ggml-cuda/conv3d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv3d-implicit.cuh
@@ -11,15 +11,15 @@ typedef struct{
     unsigned int r; //filter height
     unsigned int s; //filter width
     unsigned int t; //filter depth
-    unsigned int stride0; //stride width 
-    unsigned int stride1; //stride height 
+    unsigned int stride0; //stride width
+    unsigned int stride1; //stride height
     unsigned int stride2; //stride depth
-    unsigned int padding0; //padding width 
+    unsigned int padding0; //padding width
     unsigned int padding1; //padding height
-    unsigned int padding2; //padding depth 
-    unsigned int dilation0; //dilation width 
-    unsigned int dilation1; //dilation height 
-    unsigned int dilation2; //dilation depth 
+    unsigned int padding2; //padding depth
+    unsigned int dilation0; //dilation width
+    unsigned int dilation1; //dilation height
+    unsigned int dilation2; //dilation depth
     unsigned int Oh; //output height
     unsigned int Ow; //output width
     unsigned int Od; //output depth
@@ -39,17 +39,17 @@ template
 __device__ __forceinline__ int4 inputIndices(const unsigned int kidx, param_t param) {
     const unsigned int cur0 = fastdiv(kidx,
-                        layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
+                        layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv);
     const unsigned int cur0_res = fastmodulo(kidx,
-                        layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
+                        layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv);
     const unsigned int cur1 = fastdiv(cur0_res,
-                        layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
+                        layout == 0 ? param.SC_fastdiv : param.RS_fastdiv);
     const unsigned int cur1_res = fastmodulo(cur0_res,
-                        layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
+                        layout == 0 ? param.SC_fastdiv : param.RS_fastdiv);
     const unsigned int cur2 = fastdiv(cur1_res,
-                        layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
+                        layout == 0 ? param.C_fastdiv : param.S_fastdiv);
     const unsigned int cur3 = fastmodulo(cur1_res,
-                        layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
+                        layout == 0 ? param.C_fastdiv : param.S_fastdiv);
     const unsigned int curC = layout == 0 ? cur3 : cur0;
     const unsigned int curT = layout == 0 ? cur0 : cur1;
     const unsigned int curR = layout == 0 ? cur1 : cur2;
@@ -90,9 +90,6 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
     unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
     const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
-    // const unsigned int curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset
-    // const unsigned int curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    // const unsigned int curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv);
     const unsigned int kidx = thread_col*8;
     const int4 curIdx = inputIndices<0>(kidx, param);
     const int curC = curIdx.x;
@@ -172,17 +169,6 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
     const int ohow_res = fastmodulo(npqz_res, param.OHOW_fastdiv);
     const int posh_ori = fastdiv(ohow_res, param.OW_fastdiv) * param.stride1 - param.padding1;
     const int posw_ori = fastmodulo(ohow_res, param.OW_fastdiv) * param.stride0 - param.padding0;
-    // unsigned int n = fastdiv(gemm_i, param.OHOW_fastdiv);
-    // unsigned int npq_res = fastmodulo(gemm_i, param.OHOW_fastdiv);
-    // int posh_ori = fastdiv(npq_res, param.OW_fastdiv) * param.stride1 - param.padding1;
-    // int posw_ori = fastmodulo(npq_res, param.OW_fastdiv) * param.stride0 - param.padding0;
-    // unsigned int inOffset = n * inNOffset;
-    // const unsigned int curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset
-    // const unsigned int curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    // const unsigned int curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    // int curH = posh_ori + curR * param.dilation1; // input h
-    // int curW = posw_ori + curS * param.dilation0; // input w
-
     const int curD = posd_ori + curIdx.y * param.dilation2; // input d
     const int curH = posh_ori + curIdx.z * param.dilation1; // input h
     const int curW = posw_ori + curIdx.w * param.dilation0; // input w
@@ -193,9 +179,6 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
         dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK_2) >> SWIZZLE_BITS_2);
         if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && curC < param.c){
             int inOffsetTmp = curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC;
-            // if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h &&
-            //     curR < param.r && curS < param.s && curC < param.c){
-            // const unsigned int inOffsetTmp = curH * inChannelOffset + curW * param.c + curC;
             dst_float4[dst_index] = reinterpret_cast<const float4 *>(&src[n * inNOffset + inOffsetTmp])[0];
         } else{
             dst_float4[dst_index] = make_float4(0.f, 0.f, 0.f, 0.f);
@@ -250,31 +233,18 @@ __device__ __forceinline__ void tileMemcpyLoadA(
    #pragma unroll
    for (unsigned int i = 0; i < NUM_ITERS; i++){
        unsigned int gemm_i = blockIdx.y * TILE_ROWS + thread_row;
-        // unsigned int n = fastdiv(gemm_i, param.OHOW_fastdiv);
-        // unsigned int npq_res = fastmodulo(gemm_i, param.OHOW_fastdiv);
-        // int posh_ori = fastdiv(npq_res, param.OW_fastdiv) * param.stride1 - param.padding1;
-        // int posw_ori = fastmodulo(npq_res, param.OW_fastdiv) * param.stride0 - param.padding0;
        unsigned int n = fastdiv(gemm_i, param.PQZ_fastdiv);
        const unsigned int npqz_res = fastmodulo(gemm_i, param.PQZ_fastdiv);
        const int posd_ori = fastdiv(npqz_res, param.OHOW_fastdiv) * param.stride2 - param.padding2;
        const int ohow_res = fastmodulo(npqz_res, param.OHOW_fastdiv);
        const int posh_ori = fastdiv(ohow_res, param.OW_fastdiv) * param.stride1 - param.padding1;
        const int posw_ori = fastmodulo(ohow_res, param.OW_fastdiv) * param.stride0 - param.padding0;
-        // unsigned int inOffset = n * param.c * param.h * param.w;
-        // const unsigned int curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset
-        // const unsigned int curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-        // const unsigned int curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-        // int curH = posh_ori + curR * param.dilation1; // input h
-        // int curW = posw_ori + curS * param.dilation0; // input w
        const int curD = posd_ori + curIdx.y * param.dilation2; // input d
        const int curH = posh_ori + curIdx.z * param.dilation1; // input h
        const int curW = posw_ori + curIdx.w * param.dilation0; // input w
        const int curC = curIdx.x;
        if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && curC < param.c){
            int inOffsetTmp = curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC;
-            // if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h &&
-            //     curR < param.r && curS < param.s && curC < param.c){
-            // const unsigned int inOffsetTmp = curH * inChannelOffset + curW * param.c + curC;
            dst_reg[i] = reinterpret_cast<const float4 *>(&src[n * inNOffset + inOffsetTmp])[0];
        } else{
            dst_reg[i] = make_float4(0.f, 0.f, 0.f, 0.f);
@@ -322,9 +292,6 @@ __device__ __forceinline__ void tileMemcpyLoadB(
 
    // compile time check that we provided the right amount of registers for storage
    static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
-    // const unsigned int curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset
-    // const unsigned int curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    // const unsigned int curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv);
    const unsigned int kidx = block_k + thread_col*8;
    const int4 curIdx = inputIndices<0>(kidx, param);
    const int curC = curIdx.x;
@@ -334,7 +301,6 @@ __device__ __forceinline__ void tileMemcpyLoadB(
    #pragma unroll
    for (unsigned int i = 0; i < NUM_ITERS; i++){
        const unsigned int src_index = thread_row * src_stride + block_k + thread_col * 8;
-        // if (thread_row < param.k && curR < param.r && curS < param.s && curC < param.c){
        // TODO : move some checks outside of the loop
        if (thread_row < param.k && curR < param.r && curS < param.s && curT < param.t && curC < param.c){
            dst_reg[i] = reinterpret_cast<const float4 *>(&src[src_index])[0];
diff --git a/tests/test-conv3d.cpp b/tests/test-conv3d.cpp
index 53e37efd31..92e9d1e457 100644
--- a/tests/test-conv3d.cpp
+++ b/tests/test-conv3d.cpp
@@ -38,7 +38,9 @@
 
 
 
-void load_model(test_model & model, int ic, int oc, int iw, int ih, int id, int kw = 3, int kh = 3, int kd = 3, bool use_gpu = false ) {
+void load_model(test_model & model, int ic, int oc, int iw, int ih, int id,
+                int kw = 3, int kh = 3, int kd = 3,
+                bool use_fp16 = true, bool use_gpu = false ) {
     // create data
     int KW = kw, KH = kh, KD = kd;
     int IC = ic, OC = oc;
@@ -72,9 +74,10 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, int id, int
     }
 
     size_t buffer_size = 0;
-    {
-        // buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
-        buffer_size += KW * KH * KD * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
+    { if(use_fp16)
+        buffer_size += KW * KH * KD * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
+      else
+        buffer_size += KW * KH * KD * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
         buffer_size += IW * IH * ID * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b
         buffer_size += 1024; // overhead
     }
@@ -122,8 +125,10 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, int id, int
     model.ctx = ggml_init(params);
 
     // create tensors
-    model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, KD, IC*OC);
-    // model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, IC, OC);
+    if(use_fp16)
+        model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, KD, IC*OC);
+    else
+        model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, KD, IC*OC);
     model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, ID, IC*N);
 
     // create a allocator
@@ -134,11 +139,15 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, int id, int
 
     // load data to buffer
     if(ggml_backend_is_cpu(model.backend)) {
-        memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
-        // memcpy(model.a->data, adata.data(), ggml_nbytes(model.a));
+        if(use_fp16)
+            memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
+        else
+            memcpy(model.a->data, adata.data(), ggml_nbytes(model.a));
     } else {
-        ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
-        // ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a));
+        if(use_fp16)
+            ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
+        else
+            ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a));
     }
 
     // alloc memory
@@ -155,7 +164,7 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, int id, int
     }
 }
 
-typedef struct ggml_cgraph* (*build_graph_t)(const test_model& model, 
+typedef struct ggml_cgraph* (*build_graph_t)(const test_model& model,
                                              const int64_t i0, const int64_t i1, const int64_t i2);
 
 struct ggml_cgraph * build_graph_0(const test_model& model, const int64_t ic, const int64_t n, const int64_t oc) {
@@ -173,18 +182,27 @@ struct ggml_cgraph * build_graph_0(const test_model& model, const int64_t ic, co
 
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
+    // int s0 = 2;
+    // int s1 = 1;
+    // int s2 = 1;
+    // int p0 = 2;
+    // int p1 = 0;
+    // int p2 = 1;
+    // int d0 = 1;
+    // int d1 = 1;
+    // int d2 = 2;
+
     int s0 = 1;
     int s1 = 1;
     int s2 = 1;
     int p0 = 1;
     int p1 = 1;
     int p2 = 1;
+
     int d0 = 1;
     int d1 = 1;
     int d2 = 1;
-
-    // recalculate for avoid fragmentation
     struct ggml_tensor* conv2d_res = ggml_conv_3d(ctx0, model.a, model.b, ic,
                                                   s0, s1, s2, p0, p1, p2, d0, d1, d2);
 
     ggml_set_name(conv2d_res, "conv2d_res");
@@ -227,6 +245,16 @@ struct ggml_cgraph * build_graph_1(const test_model& model, const int64_t ic, co
     int d1 = 1;
     int d2 = 1;
 
+    // int s0 = 2;
+    // int s1 = 1;
+    // int s2 = 1;
+    // int p0 = 2;
+    // int p1 = 0;
+    // int p2 = 1;
+    // int d0 = 1;
+    // int d1 = 1;
+    // int d2 = 2;
+
     // recalculate for avoid fragmentation
     // struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
     // ggml_set_name(conv2d_res, "conv2d_res");
@@ -236,7 +264,7 @@ struct ggml_cgraph * build_graph_1(const test_model& model, const int64_t ic, co
 
     // struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
 
-    struct ggml_tensor* wino_res = ggml_conv_3d_direct(ctx0, model.a, model.b, 
+    struct ggml_tensor* wino_res = ggml_conv_3d_direct(ctx0, model.a, model.b,
                                                        s0, s1, s2, p0, p1, p2, d0, d1, d2, ic, n, oc);
     ggml_set_name(wino_res, "wino_res");
 
@@ -251,7 +279,7 @@ struct ggml_cgraph * build_graph_1(const test_model& model, const int64_t ic, co
 
 
 std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr,
-                                 build_graph_t build_graph, int iters, 
+                                 build_graph_t build_graph, int iters,
                                  const int64_t ic, const int64_t n, const int64_t oc, double *t) {
     struct ggml_cgraph * gf = build_graph(model, ic, n, oc);
 
@@ -271,7 +299,6 @@ std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr
     }
 #endif
 
-
     ggml_backend_graph_compute(model.backend, gf);
 
 
@@ -289,8 +316,6 @@ std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr
 
     double time_us = end_time - start_time;
     time_us = time_us/iters;
-    // printf(" Taking %f ms\n ", time_us/1000);
-    //ggml_graph_print(gf);
 
     struct ggml_tensor *res = NULL;
 
@@ -316,12 +341,6 @@ int main(void)
 {
     ggml_time_init();
     std::vector<std::tuple<int, int, int, int, int, int, int, int>> configs = {
-        // std::make_tuple(64,64,48,64,3,3),
-        // std::make_tuple(320,320,104,152,3,3),
-        // std::make_tuple(640,640,52,76,3,3),
-        // std::make_tuple(640,640,104,152,3,3),
-        // std::make_tuple(960,320,104,152,3,3),
-        // std::make_tuple(1280,1280,26,38,3,3),
         std::make_tuple(320,1280,26,38,8,3,3,3),
         std::make_tuple(1280,1280,26,38,8,3,3,3),
         std::make_tuple(320,1280,52,76,8,3,3,3),
@@ -330,29 +349,14 @@ int main(void)
         std::make_tuple(1280,1280,104,152,8,3,3,3),
         std::make_tuple(320,1280,208,304,4,3,3,3),
         std::make_tuple(640,1280,208,304,4,3,3,3),
-        // std::make_tuple(1280,1280,26,38,1,1),
-        // std::make_tuple(256,128,768,1024,3,3),
-        // std::make_tuple(128,3,768,1024,3,3),
-        // std::make_tuple(256,128,768,1024,1,1),
-        // std::make_tuple(512,256,384,512,1,1),
-        // std::make_tuple(1280,640,52,76,3,3),
-        // std::make_tuple(1920,1280,26,38,3,3),
-        // std::make_tuple(2560,1280,26,38,3,3),
-        // std::make_tuple(320,1280,26,38,3,3),
-        // std::make_tuple(512,512,104,152,3,3),
-        // std::make_tuple(512,512,208,304,3,3),
-        // std::make_tuple(512,256,416,608,3,3),
-        // std::make_tuple(256,128,832,1216,3,3),
-        // std::make_tuple(256,256,832,1216,3,3),
-        // std::make_tuple(320,256,1024,1920)
     };
 
     int k = 0;
 
     for (auto c : configs){
         test_model model;
-        load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c), 
-                   std::get<3>(c), std::get<4>(c), std::get<5>(c), std::get<6>(c), std::get<7>(c), true);
+        load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c),
+                   std::get<3>(c), std::get<4>(c), std::get<5>(c), std::get<6>(c), std::get<7>(c), true, true);
 
         ggml_gallocr_t allocr = NULL;
         allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
@@ -366,11 +370,11 @@ int main(void)
 
         // fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
 
-        struct ggml_cgraph * gf_res_0 = NULL; 
+        struct ggml_cgraph * gf_res_0 = NULL;
         int iterations = 20;
 
         double run_time0;
-        std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations, 
+        std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations,
                                                        std::get<0>(c), 1, std::get<1>(c), &run_time0);
 
         ggml_gallocr_free(allocr);
@@ -386,23 +390,22 @@ int main(void)
         ggml_gallocr_reserve(allocr, gf);
         size_t mem_size1 = ggml_gallocr_get_buffer_size(allocr, 0);
         // fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
-
-        struct ggml_cgraph * gf_res_1 = NULL; 
+        struct ggml_cgraph * gf_res_1 = NULL;
         double run_time1;
 
         // std::vector<float> wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
-        std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_1, iterations, 
+        std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_1, iterations,
                                                        std::get<0>(c), 1, std::get<1>(c), &run_time1);
 
-        if(k==0) { 
+        if(k==0) {
             k = 1;
             fprintf(stderr, "| (IC, OC, IW, IH, ID, KW, KH, KD) | im2col+GEMM TIME | im2col+GEMM VRAM | implicit GEMM TIME | implicit GEMM VRAM \n");
             fprintf(stderr, "| --- | --- | --- | --- | --- \n");
         }
 
-        fprintf(stderr, " | (%d, %d, %d, %d, %d, %d, %d, %d) | %.2f ms | %.2f MB | %.2f ms | %.2f MB\n", 
-                std::get<0>(c), std::get<1>(c), std::get<2>(c), 
+        fprintf(stderr, " | (%d, %d, %d, %d, %d, %d, %d, %d) | %.2f ms | %.2f MB | %.2f ms | %.2f MB\n",
+                std::get<0>(c), std::get<1>(c), std::get<2>(c),
                 std::get<3>(c), std::get<4>(c), std::get<5>(c), std::get<6>(c), std::get<7>(c),
                 run_time0, mem_size0/1024.0f/1024.0f,
@@ -412,7 +415,7 @@ int main(void)
         // for(int i = 0; i < conv2d_data.size(); i++) {
         //     float diff = fabs(im2col_data[i] - conv2d_data[i]);
         //     // if(diff > 0.5) {
-        //        printf("(%7.3f, %7.3f, %.2f, %d) \n",
+        //        printf("(%7.3f, %7.3f, %f, %d) \n",
         //             im2col_data[i], conv2d_data[i],
         //             diff, i);
         //     // break;
@@ -425,7 +428,5 @@ int main(void)
 
         ggml_gallocr_free(allocr);
     }
-
-    // printf("\nPerforming test:\n");
     return 0;
 }