remove implicit op and related calls; replace conv_2d with conv_2d_implicit kernel
This commit is contained in:
parent
2dfbbee73f
commit
55859a86aa
|
|
@ -513,7 +513,6 @@ extern "C" {
|
||||||
GGML_OP_IM2COL_BACK,
|
GGML_OP_IM2COL_BACK,
|
||||||
GGML_OP_IM2COL_3D,
|
GGML_OP_IM2COL_3D,
|
||||||
GGML_OP_CONV_2D,
|
GGML_OP_CONV_2D,
|
||||||
GGML_OP_CONV_2D_IMPLICIT,
|
|
||||||
GGML_OP_CONV_3D,
|
GGML_OP_CONV_3D,
|
||||||
GGML_OP_CONV_2D_DW,
|
GGML_OP_CONV_2D_DW,
|
||||||
GGML_OP_CONV_TRANSPOSE_2D,
|
GGML_OP_CONV_TRANSPOSE_2D,
|
||||||
|
|
@ -1983,19 +1982,6 @@ extern "C" {
|
||||||
int d0, // dilation dimension 0
|
int d0, // dilation dimension 0
|
||||||
int d1); // dilation dimension 1
|
int d1); // dilation dimension 1
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_2d_implicitgemm(
|
|
||||||
struct ggml_context * ctx,
|
|
||||||
struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
|
|
||||||
struct ggml_tensor * b, // input data [W, H, C, N]
|
|
||||||
int s0, // stride dimension 0
|
|
||||||
int s1, // stride dimension 1
|
|
||||||
int p0, // padding dimension 0
|
|
||||||
int p1, // padding dimension 1
|
|
||||||
int d0, // dilation dimension 0
|
|
||||||
int d1);
|
|
||||||
// int layout); // for future
|
|
||||||
|
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
|
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
|
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
|
||||||
|
|
|
||||||
|
|
@ -1887,10 +1887,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||||
{
|
{
|
||||||
ggml_compute_forward_conv_2d(params, tensor);
|
ggml_compute_forward_conv_2d(params, tensor);
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_CONV_2D_IMPLICIT:
|
|
||||||
{
|
|
||||||
ggml_compute_forward_conv_2d(params, tensor);
|
|
||||||
} break;
|
|
||||||
case GGML_OP_CONV_3D:
|
case GGML_OP_CONV_3D:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_conv_3d(params, tensor);
|
ggml_compute_forward_conv_3d(params, tensor);
|
||||||
|
|
@ -2268,7 +2264,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||||
case GGML_OP_IM2COL_BACK:
|
case GGML_OP_IM2COL_BACK:
|
||||||
case GGML_OP_IM2COL_3D:
|
case GGML_OP_IM2COL_3D:
|
||||||
case GGML_OP_CONV_2D:
|
case GGML_OP_CONV_2D:
|
||||||
case GGML_OP_CONV_2D_IMPLICIT:
|
|
||||||
case GGML_OP_CONV_3D:
|
case GGML_OP_CONV_3D:
|
||||||
case GGML_OP_CONV_2D_DW:
|
case GGML_OP_CONV_2D_DW:
|
||||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||||
|
|
@ -2794,7 +2789,6 @@ struct ggml_cplan ggml_graph_plan(
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_CONV_2D:
|
case GGML_OP_CONV_2D:
|
||||||
case GGML_OP_CONV_2D_IMPLICIT:
|
|
||||||
case GGML_OP_CONV_3D:
|
case GGML_OP_CONV_3D:
|
||||||
{
|
{
|
||||||
cur = GGML_IM2COL_WORK_SIZE;
|
cur = GGML_IM2COL_WORK_SIZE;
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,9 @@
|
||||||
|
|
||||||
typedef unsigned int uint;
|
typedef unsigned int uint;
|
||||||
constexpr uint WARPSIZE = 32;
|
constexpr uint WARPSIZE = 32;
|
||||||
|
#define CUDA_NCHW_2_NHWC_TILE_DIM 32
|
||||||
|
#define CUDA_NCHW_2_NHWC_BLOCK_NM 8
|
||||||
|
#define CUDA_NCHW_2_NHWC_BLOCK_ROWS 8
|
||||||
|
|
||||||
|
|
||||||
//currently not use; in future for split-k kernels
|
//currently not use; in future for split-k kernels
|
||||||
|
|
@ -23,6 +26,41 @@ static __global__ void reduce_f32(const float * __restrict__ x, float * __restri
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename src_T, typename dst_T>
|
||||||
|
static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, const int ne00, const int ne01){
|
||||||
|
|
||||||
|
const int64_t nmat = ne / (ne00 * ne01);
|
||||||
|
const int64_t n = ne00 * ne01;
|
||||||
|
|
||||||
|
int x = blockIdx.x * CUDA_NCHW_2_NHWC_TILE_DIM + threadIdx.x;
|
||||||
|
int y = blockIdx.y * CUDA_NCHW_2_NHWC_TILE_DIM + threadIdx.y;
|
||||||
|
int tx = blockIdx.y * CUDA_NCHW_2_NHWC_TILE_DIM + threadIdx.x; // transpose block offset
|
||||||
|
int ty = blockIdx.x * CUDA_NCHW_2_NHWC_TILE_DIM + threadIdx.y;
|
||||||
|
|
||||||
|
__shared__ src_T tile[CUDA_NCHW_2_NHWC_TILE_DIM][CUDA_NCHW_2_NHWC_TILE_DIM];
|
||||||
|
|
||||||
|
for(int i = 0; i < CUDA_NCHW_2_NHWC_BLOCK_NM; ++i){
|
||||||
|
|
||||||
|
const unsigned int imat = blockIdx.z * CUDA_NCHW_2_NHWC_BLOCK_NM + i;
|
||||||
|
if(imat >= nmat)
|
||||||
|
break;
|
||||||
|
for (int j = 0; j < CUDA_NCHW_2_NHWC_TILE_DIM; j += CUDA_NCHW_2_NHWC_BLOCK_ROWS){
|
||||||
|
if(x < ne01 && y + j < ne00){
|
||||||
|
const int row = threadIdx.y+j;
|
||||||
|
const int col = threadIdx.x ^ row;
|
||||||
|
tile[row][col] = src[imat*n + (y+j)*ne01 + x];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
for (int j = 0; j < CUDA_NCHW_2_NHWC_TILE_DIM; j += CUDA_NCHW_2_NHWC_BLOCK_ROWS){
|
||||||
|
if(ty + j < ne01 && tx < ne00){
|
||||||
|
const int col = (threadIdx.y+j) ^ threadIdx.x;
|
||||||
|
dst[imat*n + (ty+j)*ne00 + tx] = ggml_cuda_cast<dst_T>(tile[threadIdx.x][col]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<typename T, const int BM, const int BN, const int BK, const int WM, const int WN,
|
template<typename T, const int BM, const int BN, const int BK, const int WM, const int WN,
|
||||||
const int WNITER, const int TM, const int TN, const int NUM_THREADS,
|
const int WNITER, const int TM, const int TN, const int NUM_THREADS,
|
||||||
|
|
@ -882,26 +920,40 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,
|
||||||
int threadz = 1; // threadz number per block
|
int threadz = 1; // threadz number per block
|
||||||
dim3 thblock(NUM_THREADS, thready, threadz);
|
dim3 thblock(NUM_THREADS, thready, threadz);
|
||||||
dim3 grid(blockx, blocky, blockz);
|
dim3 grid(blockx, blocky, blockz);
|
||||||
if(P.c % 4 == 0){
|
|
||||||
if(P.layout == 0)
|
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
|
||||||
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
|
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
|
||||||
WNITER, TM, TN, NUM_THREADS, 0, true, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
|
|
||||||
else if(P.layout == 1)
|
|
||||||
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
|
|
||||||
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
|
|
||||||
} else{
|
|
||||||
if(P.layout == 0)
|
|
||||||
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
|
|
||||||
WNITER, TM, TN, NUM_THREADS, 0, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
|
|
||||||
else if(P.layout == 1)
|
|
||||||
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
|
|
||||||
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) {
|
static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) {
|
||||||
|
|
||||||
if (GGML_CUDA_CC_IS_NVIDIA(cc) && ampere_mma_available(cc) && P.layout == 0 && P.c % 8 == 0) {
|
if (GGML_CUDA_CC_IS_NVIDIA(cc) && ampere_mma_available(cc) && P.c % 8 == 0 && (P.r > 1 || P.s > 1)) {
|
||||||
|
|
||||||
|
int id = ggml_cuda_get_device();
|
||||||
|
|
||||||
|
int64_t ne = P.c * P.h * P.w * P.n;
|
||||||
|
int64_t ne00 = P.c;
|
||||||
|
int64_t ne01 = P.h * P.w;
|
||||||
|
ggml_cuda_pool_alloc<half> input_f16(ctx.pool(id), ne);
|
||||||
|
|
||||||
|
dim3 dimGrid( (ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
|
||||||
|
(ne00 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
|
||||||
|
(ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM) ;
|
||||||
|
dim3 dimBlock(CUDA_NCHW_2_NHWC_TILE_DIM,CUDA_NCHW_2_NHWC_BLOCK_ROWS, 1);
|
||||||
|
NCHW2NHWC<float, half><<<dimGrid, dimBlock, 0, st>>>(X_D, input_f16.get(), ne, ne00, ne01);
|
||||||
|
|
||||||
|
ne = P.c * P.r * P.s * P.k;
|
||||||
|
ne01 = P.r * P.s;
|
||||||
|
ggml_cuda_pool_alloc<half> kernel_f16(ctx.pool(id), ne);
|
||||||
|
dim3 dimGrid1((ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
|
||||||
|
(ne00 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
|
||||||
|
(ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM) ;
|
||||||
|
NCHW2NHWC<half, half><<<dimGrid1, dimBlock, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
|
||||||
|
|
||||||
|
const half *X_H = input_f16.get();
|
||||||
|
const half *K_H = kernel_f16.get();
|
||||||
|
ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), P.k * P.Oh * P.Ow * P.n);
|
||||||
|
|
||||||
constexpr unsigned int BM_dim = 256;
|
constexpr unsigned int BM_dim = 256;
|
||||||
constexpr unsigned int BN_dim = 256;
|
constexpr unsigned int BN_dim = 256;
|
||||||
constexpr unsigned int BK_dim = 32;
|
constexpr unsigned int BK_dim = 32;
|
||||||
|
|
@ -925,19 +977,9 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
|
||||||
dim3 gridDim(BlocksN, BlocksM);
|
dim3 gridDim(BlocksN, BlocksM);
|
||||||
dim3 blockDim(ThreadsN, ThreadsM);
|
dim3 blockDim(ThreadsN, ThreadsM);
|
||||||
|
|
||||||
int id = ggml_cuda_get_device();
|
|
||||||
ggml_cuda_pool_alloc<half> x_f16(ctx.pool(id));
|
|
||||||
|
|
||||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(GGML_TYPE_F32);
|
|
||||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
|
||||||
size_t ne = P.c * P.h * P.w * P.n;
|
|
||||||
x_f16.alloc(ne);
|
|
||||||
to_fp16_cuda(X_D, x_f16.get(), ne, st);
|
|
||||||
const half *X_H = x_f16.get();
|
|
||||||
ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), P.k * P.Oh * P.Ow * P.n);
|
|
||||||
conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim,
|
conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim,
|
||||||
WM_dim, WN_dim, WK_dim, NumThreads>
|
WM_dim, WN_dim, WK_dim, NumThreads>
|
||||||
<<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_D, Y_H.get(), P);
|
<<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_H, Y_H.get(), P);
|
||||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||||
to_fp32_cuda(Y_H.get(), Y_D, P.k * P.Oh * P.Ow * P.n, st);
|
to_fp32_cuda(Y_H.get(), Y_D, P.k * P.Oh * P.Ow * P.n, st);
|
||||||
} else{
|
} else{
|
||||||
|
|
@ -971,28 +1013,28 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||||
const int PD_Y = p[3]; // padding_y
|
const int PD_Y = p[3]; // padding_y
|
||||||
const int DL_X = p[4]; // dilation_x
|
const int DL_X = p[4]; // dilation_x
|
||||||
const int DL_Y = p[5]; // dilation_y
|
const int DL_Y = p[5]; // dilation_y
|
||||||
const int LT = p[6]; // layout
|
// const int LT = p[6]; // layout
|
||||||
|
|
||||||
GGML_ASSERT(LT == 0 || LT == 1);
|
// GGML_ASSERT(LT == 0 || LT == 1);
|
||||||
|
|
||||||
// same number of input channels
|
// same number of input channels
|
||||||
GGML_ASSERT(LT == 0 ? input->ne[0] == kernel->ne[0] : input->ne[2] == kernel->ne[2]);
|
// GGML_ASSERT(LT == 0 ? input->ne[0] == kernel->ne[0] : input->ne[2] == kernel->ne[2]);
|
||||||
// No cwhn
|
// No cwhn
|
||||||
GGML_ASSERT(p[7] == false);
|
GGML_ASSERT(p[6] == false);
|
||||||
|
|
||||||
const int IW = input->ne[LT == 0 ? 1 : 0]; // input_w
|
const int IW = input->ne[0]; // input_w
|
||||||
const int IH = input->ne[LT == 0 ? 2 : 1]; // input_h
|
const int IH = input->ne[1]; // input_h
|
||||||
const int OW = dst->ne[0]; // output_w
|
const int OW = dst->ne[0]; // output_w
|
||||||
const int OH = dst->ne[1]; // output_h
|
const int OH = dst->ne[1]; // output_h
|
||||||
const int KW = kernel->ne[LT == 0 ? 1 : 0]; // kernel_w
|
const int KW = kernel->ne[0]; // kernel_w
|
||||||
const int KH = kernel->ne[LT == 0 ? 2 : 1]; // kernel_h
|
const int KH = kernel->ne[1]; // kernel_h
|
||||||
const int IC = input->ne[LT == 0 ? 0: 2]; // input_channels
|
const int IC = input->ne[2]; // input_channels
|
||||||
|
|
||||||
const int OC = kernel->ne[3]; // ouptut_chanles
|
const int OC = kernel->ne[3]; // ouptut_chanles
|
||||||
const int B = input->ne[3]; // n_batches
|
const int B = input->ne[3]; // n_batches
|
||||||
|
|
||||||
const int64_t total = B * OC * OH * OW;
|
const int64_t total = B * OC * OH * OW;
|
||||||
|
|
||||||
param_t params = { B, IC, IH, IW, OC, KH, KW, ST_Y, ST_X, PD_Y, PD_X, DL_Y, DL_X, OH, OW };
|
param_t params = { B, IC, IH, IW, OC, KH, KW, ST_Y, ST_X, PD_Y, PD_X, DL_Y, DL_X, OH, OW };
|
||||||
params.SC_fastdiv = init_fastdiv_values(KW*IC);
|
params.SC_fastdiv = init_fastdiv_values(KW*IC);
|
||||||
params.OW_fastdiv = init_fastdiv_values(OW);
|
params.OW_fastdiv = init_fastdiv_values(OW);
|
||||||
|
|
@ -1000,7 +1042,7 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||||
params.C_fastdiv = init_fastdiv_values(IC);
|
params.C_fastdiv = init_fastdiv_values(IC);
|
||||||
params.RS_fastdiv = init_fastdiv_values(KW*KH);
|
params.RS_fastdiv = init_fastdiv_values(KW*KH);
|
||||||
params.S_fastdiv = init_fastdiv_values(KW);
|
params.S_fastdiv = init_fastdiv_values(KW);
|
||||||
params.layout = LT;
|
// params.layout = LT;
|
||||||
|
|
||||||
if (kernel->type == GGML_TYPE_F16) {
|
if (kernel->type == GGML_TYPE_F16) {
|
||||||
conv2d_implicit_cuda_f16(ctx, X_D, (half *) K_D, Y_D, cc, params, st);
|
conv2d_implicit_cuda_f16(ctx, X_D, (half *) K_D, Y_D, cc, params, st);
|
||||||
|
|
|
||||||
|
|
@ -2462,11 +2462,8 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
ggml_cuda_op_im2col_3d(ctx, dst);
|
ggml_cuda_op_im2col_3d(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_CONV_2D:
|
case GGML_OP_CONV_2D:
|
||||||
ggml_cuda_op_conv2d(ctx, dst);
|
|
||||||
break;
|
|
||||||
case GGML_OP_CONV_2D_IMPLICIT:
|
|
||||||
ggml_cuda_op_conv2d_implicit(ctx, dst);
|
ggml_cuda_op_conv2d_implicit(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_CONV_2D_DW:
|
case GGML_OP_CONV_2D_DW:
|
||||||
ggml_cuda_op_conv2d_dw(ctx, dst);
|
ggml_cuda_op_conv2d_dw(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
|
@ -3580,7 +3577,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||||
case GGML_OP_IM2COL:
|
case GGML_OP_IM2COL:
|
||||||
case GGML_OP_IM2COL_3D:
|
case GGML_OP_IM2COL_3D:
|
||||||
case GGML_OP_CONV_2D:
|
case GGML_OP_CONV_2D:
|
||||||
case GGML_OP_CONV_2D_IMPLICIT:
|
|
||||||
case GGML_OP_CONV_2D_DW:
|
case GGML_OP_CONV_2D_DW:
|
||||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
|
|
|
||||||
|
|
@ -976,7 +976,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"IM2COL_BACK",
|
"IM2COL_BACK",
|
||||||
"IM2COL_3D",
|
"IM2COL_3D",
|
||||||
"CONV_2D",
|
"CONV_2D",
|
||||||
"CONV_2D_IMPLICIT",
|
|
||||||
"CONV_3D",
|
"CONV_3D",
|
||||||
"CONV_2D_DW",
|
"CONV_2D_DW",
|
||||||
"CONV_TRANSPOSE_2D",
|
"CONV_TRANSPOSE_2D",
|
||||||
|
|
@ -1020,7 +1019,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||||
"GLU",
|
"GLU",
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
|
static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
|
||||||
|
|
||||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"none",
|
"none",
|
||||||
|
|
@ -1081,7 +1080,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"im2col_back(x)",
|
"im2col_back(x)",
|
||||||
"im2col_3d(x)",
|
"im2col_3d(x)",
|
||||||
"conv_2d(x)",
|
"conv_2d(x)",
|
||||||
"conv_2d_implicit(x)",
|
|
||||||
"conv_3d(x)",
|
"conv_3d(x)",
|
||||||
"conv_2d_dw(x)",
|
"conv_2d_dw(x)",
|
||||||
"conv_transpose_2d(x)",
|
"conv_transpose_2d(x)",
|
||||||
|
|
@ -1125,7 +1123,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||||
"glu(x)",
|
"glu(x)",
|
||||||
};
|
};
|
||||||
|
|
||||||
static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
|
static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
|
||||||
|
|
||||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||||
|
|
||||||
|
|
@ -4573,66 +4571,6 @@ struct ggml_tensor * ggml_conv_2d_direct(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// ggml_conv_2d_implicitgemm
|
|
||||||
|
|
||||||
struct ggml_tensor * ggml_conv_2d_implicitgemm(
|
|
||||||
struct ggml_context * ctx,
|
|
||||||
struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC]
|
|
||||||
struct ggml_tensor * b, // input data [W, H, C, N]
|
|
||||||
int s0, // stride dimension 0
|
|
||||||
int s1, // stride dimension 1
|
|
||||||
int p0, // padding dimension 0
|
|
||||||
int p1, // padding dimension 1
|
|
||||||
int d0, // dilation dimension 0
|
|
||||||
int d1){
|
|
||||||
// 0: NHWC, 1:NCHW
|
|
||||||
// int layout) {
|
|
||||||
|
|
||||||
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
|
||||||
//GGML_ASSERT(a->type == b->type);
|
|
||||||
|
|
||||||
int64_t ne[4];
|
|
||||||
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
|
|
||||||
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
|
|
||||||
ne[2] = a->ne[3];
|
|
||||||
ne[3] = b->ne[3];
|
|
||||||
|
|
||||||
struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
|
|
||||||
|
|
||||||
ggml_set_op_params_i32(result, 0, s0);
|
|
||||||
ggml_set_op_params_i32(result, 1, s1);
|
|
||||||
ggml_set_op_params_i32(result, 2, p0);
|
|
||||||
ggml_set_op_params_i32(result, 3, p1);
|
|
||||||
ggml_set_op_params_i32(result, 4, d0);
|
|
||||||
ggml_set_op_params_i32(result, 5, d1);
|
|
||||||
|
|
||||||
struct ggml_tensor *ap, *bp;
|
|
||||||
if(a->type == GGML_TYPE_F16 && (a->ne[0] > 1 || a->ne[1] > 1)){
|
|
||||||
ggml_set_op_params_i32(result, 6, 0);
|
|
||||||
ap = ggml_reshape_4d(ctx,
|
|
||||||
ggml_cont(ctx,
|
|
||||||
ggml_transpose(ctx,
|
|
||||||
ggml_reshape_3d(ctx, a, a->ne[0]*a->ne[1], a->ne[2], a->ne[3]))),
|
|
||||||
a->ne[2], a->ne[0], a->ne[1], a->ne[3]);
|
|
||||||
bp = ggml_reshape_4d(ctx,
|
|
||||||
ggml_cont(ctx,
|
|
||||||
ggml_transpose(ctx,
|
|
||||||
ggml_reshape_3d(ctx, b, b->ne[0]*b->ne[1], b->ne[2], b->ne[3]))),
|
|
||||||
b->ne[2], b->ne[0], b->ne[1], b->ne[3]);
|
|
||||||
} else{
|
|
||||||
ggml_set_op_params_i32(result, 6, 1);
|
|
||||||
ap = a;
|
|
||||||
bp = b;
|
|
||||||
}
|
|
||||||
|
|
||||||
result->op = GGML_OP_CONV_2D_IMPLICIT;
|
|
||||||
result->src[0] = ap;
|
|
||||||
result->src[1] = bp;
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ggml_conv_3d
|
// ggml_conv_3d
|
||||||
|
|
||||||
struct ggml_tensor * ggml_conv_3d_direct(
|
struct ggml_tensor * ggml_conv_3d_direct(
|
||||||
|
|
|
||||||
|
|
@ -4191,94 +4191,6 @@ struct test_conv_2d : public test_case {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// CONV_2D_IMPLICIT
|
|
||||||
struct test_conv_2d_implicit : public test_case {
|
|
||||||
const std::array<int64_t, 4> ne_input;
|
|
||||||
const std::array<int64_t, 4> ne_kernel;
|
|
||||||
const ggml_type type_kernel;
|
|
||||||
const int stride0;
|
|
||||||
const int stride1;
|
|
||||||
const int padding0;
|
|
||||||
const int padding1;
|
|
||||||
const int dilation0;
|
|
||||||
const int dilation1;
|
|
||||||
// Whether the inputs are contiguous in the channel dim or the width dim
|
|
||||||
const bool cwhn;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
std::string vars() override {
|
|
||||||
return VARS_TO_STR10(ne_input, ne_kernel, type_kernel, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn);
|
|
||||||
}
|
|
||||||
|
|
||||||
double max_nmse_err() override {
|
|
||||||
return 5e-4;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t op_flops(ggml_tensor * t) override {
|
|
||||||
GGML_UNUSED(t);
|
|
||||||
// Just counting matmul costs:
|
|
||||||
// KxCRS @ CRSxNPQ = KxNPQ --> KxNPQx(CRS+CRS-1) flops
|
|
||||||
|
|
||||||
// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
|
|
||||||
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
|
|
||||||
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
|
|
||||||
};
|
|
||||||
|
|
||||||
int64_t W = ne_input[0];
|
|
||||||
int64_t H = ne_input[1];
|
|
||||||
int64_t KW = ne_kernel[0];
|
|
||||||
int64_t KH = ne_kernel[1];
|
|
||||||
int64_t Cin = ne_kernel[2];
|
|
||||||
int64_t Cout = ne_kernel[3];
|
|
||||||
int64_t N = ne_input[3];
|
|
||||||
int64_t OH = calc_conv_output_size(H, KH, stride0, padding0, dilation0);
|
|
||||||
int64_t OW = calc_conv_output_size(W, KW, stride0, padding0, dilation0);
|
|
||||||
|
|
||||||
int64_t K = Cout;
|
|
||||||
int64_t CRS = Cin * KH * KW;
|
|
||||||
int64_t NPQ = N * OH * OW;
|
|
||||||
|
|
||||||
return K * NPQ * (2 * CRS - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
test_conv_2d_implicit(std::array<int64_t, 4> ne_input = { 64, 64, 16, 1 },
|
|
||||||
std::array<int64_t, 4> ne_kernel = { 3, 3, 1, 16 }, ggml_type type_kernel = GGML_TYPE_F32, int stride0 = 1,
|
|
||||||
int stride1 = 1, int padding0 = 0, int padding1 = 0, int dilation0 = 1, int dilation1 = 1, bool cwhn = false) :
|
|
||||||
ne_input(ne_input),
|
|
||||||
ne_kernel(ne_kernel),
|
|
||||||
type_kernel(type_kernel),
|
|
||||||
stride0(stride0),
|
|
||||||
stride1(stride1),
|
|
||||||
padding0(padding0),
|
|
||||||
padding1(padding1),
|
|
||||||
dilation0(dilation0),
|
|
||||||
dilation1(dilation1),
|
|
||||||
cwhn(cwhn) {}
|
|
||||||
|
|
||||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
|
||||||
ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
|
|
||||||
ggml_set_name(input, "input");
|
|
||||||
|
|
||||||
ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
|
|
||||||
ggml_set_name(kernel, "kernel");
|
|
||||||
|
|
||||||
// if (cwhn) {
|
|
||||||
// // change memory layout to channel-most-contiguous (CWHN),
|
|
||||||
// // then permute it back so NE matches the original input
|
|
||||||
// input = ggml_cont(ctx, ggml_permute(ctx, input, 1, 2, 0, 3));
|
|
||||||
// input = ggml_permute(ctx, input, 2, 0, 1, 3);
|
|
||||||
// kernel = ggml_cont(ctx, ggml_permute(ctx, kernel, 2, 3, 1, 0));
|
|
||||||
// kernel = ggml_permute(ctx, kernel, 3, 2, 0, 1);
|
|
||||||
// }
|
|
||||||
|
|
||||||
ggml_tensor * out =
|
|
||||||
ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1);
|
|
||||||
ggml_set_name(out, "out");
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// GGML_OP_CONV_2D_DW
|
// GGML_OP_CONV_2D_DW
|
||||||
struct test_conv_2d_dw : public test_case {
|
struct test_conv_2d_dw : public test_case {
|
||||||
const std::array<int64_t, 4> ne_input;
|
const std::array<int64_t, 4> ne_input;
|
||||||
|
|
@ -5941,30 +5853,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint32_t s0 : { 1, 3 }) {
|
|
||||||
for (uint32_t p1 : { 2, 5 }) {
|
|
||||||
for (uint32_t Cin : { 1, 25 }) {
|
|
||||||
for (uint32_t Cout : { 1, 12 }) {
|
|
||||||
for (uint32_t KH : { 1, 2, 3, 11 }) {
|
|
||||||
for (uint32_t KW : { 1, 2, 3, 11 }) {
|
|
||||||
for (uint32_t H : { 1, 133 }) {
|
|
||||||
for (uint32_t W : { 1, 141 }) {
|
|
||||||
if (calc_conv_output_size(W, KW, s0, p0, d0) > 0 &&
|
|
||||||
calc_conv_output_size(H, KH, s1, p1, d1) > 0) {
|
|
||||||
for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
||||||
test_cases.emplace_back(new test_conv_2d_implicit(
|
|
||||||
{ W, H, Cin, 2 }, { KW, KH, Cin, Cout }, kernel_type, s0, s1, p0, p1, d0, d1, false));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// sycl backend will limit task global_range < MAX_INT
|
// sycl backend will limit task global_range < MAX_INT
|
||||||
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
|
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
|
||||||
// however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
|
// however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
|
||||||
|
|
@ -6732,16 +6620,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
|
|
||||||
for (auto act_case : cases) {
|
|
||||||
// Direct CONV_2D
|
|
||||||
test_cases.emplace_back(new test_conv_2d_implicit(
|
|
||||||
{ act_case[iwh_idx], act_case[iwh_idx], act_case[Cin_idx], act_case[B_idx] },
|
|
||||||
{ act_case[kwh_idx], act_case[kwh_idx], act_case[Cin_idx], act_case[Cout_idx] },
|
|
||||||
kernel_type, 1, 1, 0, 0, 1, 1, false));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Stable-diffusion layers
|
// Stable-diffusion layers
|
||||||
std::map<std::string, uint32_t> idx_sd{
|
std::map<std::string, uint32_t> idx_sd{
|
||||||
{ "iw", 0 },
|
{ "iw", 0 },
|
||||||
|
|
@ -6788,7 +6666,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||||
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
|
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
|
||||||
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
|
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
|
||||||
|
|
||||||
test_cases.emplace_back(new test_conv_2d_implicit(
|
test_cases.emplace_back(new test_conv_2d(
|
||||||
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
|
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
|
||||||
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
|
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
|
||||||
GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, false));
|
GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, false));
|
||||||
|
|
@ -6801,12 +6679,25 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
||||||
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
|
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
|
||||||
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
|
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
|
||||||
|
|
||||||
test_cases.emplace_back(new test_conv_2d_implicit(
|
test_cases.emplace_back(new test_conv_2d(
|
||||||
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
|
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
|
||||||
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
|
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
|
||||||
GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, false));
|
GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// for (auto act_case : cases_sd) {
|
||||||
|
// GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
|
||||||
|
// GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
|
||||||
|
|
||||||
|
// uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
|
||||||
|
// uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
|
||||||
|
|
||||||
|
// test_cases.emplace_back(new test_conv_2d_implicit(
|
||||||
|
// { act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
|
||||||
|
// { act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
|
||||||
|
// GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, false));
|
||||||
|
// }
|
||||||
|
|
||||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
||||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -239,49 +239,6 @@ struct ggml_cgraph * build_graph_1(const test_model& model) {
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_graph_2(const test_model& model) {
|
|
||||||
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
|
|
||||||
static std::vector<uint8_t> buf(buf_size);
|
|
||||||
|
|
||||||
struct ggml_init_params params0 = {
|
|
||||||
/*.mem_size =*/ buf_size,
|
|
||||||
/*.mem_buffer =*/ buf.data(),
|
|
||||||
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
|
|
||||||
};
|
|
||||||
|
|
||||||
// create a temporally context to build the graph
|
|
||||||
struct ggml_context * ctx0 = ggml_init(params0);
|
|
||||||
|
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
|
||||||
|
|
||||||
int s0 = 1;
|
|
||||||
int s1 = 1;
|
|
||||||
int p0 = 1;
|
|
||||||
int p1 = 1;
|
|
||||||
int d0 = 1;
|
|
||||||
int d1 = 1;
|
|
||||||
|
|
||||||
|
|
||||||
// recalculate for avoid fragmentation
|
|
||||||
// struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
|
|
||||||
// ggml_set_name(conv2d_res, "conv2d_res");
|
|
||||||
// ggml_build_forward_expand(gf, conv2d_res);
|
|
||||||
// int64_t *ne = conv2d_res->ne;
|
|
||||||
// printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
|
|
||||||
|
|
||||||
|
|
||||||
struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
|
|
||||||
// struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
|
|
||||||
ggml_set_name(wino_res, "wino_res");
|
|
||||||
ggml_build_forward_expand(gf, wino_res);
|
|
||||||
// ne = wino_res->ne;
|
|
||||||
// printf("wino: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
|
|
||||||
ggml_free(ctx0);
|
|
||||||
return gf;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr,
|
std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr,
|
||||||
build_graph_t build_graph, int iters, double *t) {
|
build_graph_t build_graph, int iters, double *t) {
|
||||||
|
|
@ -352,10 +309,10 @@ int main(void)
|
||||||
// std::make_tuple(640,640,52,76,3,3),
|
// std::make_tuple(640,640,52,76,3,3),
|
||||||
// std::make_tuple(640,640,104,152,3,3),
|
// std::make_tuple(640,640,104,152,3,3),
|
||||||
// std::make_tuple(960,320,104,152,3,3),
|
// std::make_tuple(960,320,104,152,3,3),
|
||||||
// std::make_tuple(1280,1280,26,38,3,3),
|
std::make_tuple(1280,1280,26,38,3,3),
|
||||||
// std::make_tuple(1280,1280,26,38,1,1),
|
// std::make_tuple(1280,1280,26,38,1,1),
|
||||||
// std::make_tuple(256,128,768,1024,3,3),
|
// std::make_tuple(256,128,768,1024,3,3),
|
||||||
std::make_tuple(128,3,768,1024,3,3),
|
// std::make_tuple(128,3,768,1024,3,3),
|
||||||
// std::make_tuple(256,128,768,1024,1,1),
|
// std::make_tuple(256,128,768,1024,1,1),
|
||||||
// std::make_tuple(512,256,384,512,1,1),
|
// std::make_tuple(512,256,384,512,1,1),
|
||||||
// std::make_tuple(1280,640,52,76,3,3),
|
// std::make_tuple(1280,640,52,76,3,3),
|
||||||
|
|
@ -389,7 +346,7 @@ int main(void)
|
||||||
|
|
||||||
|
|
||||||
struct ggml_cgraph * gf_res_0 = NULL;
|
struct ggml_cgraph * gf_res_0 = NULL;
|
||||||
int iterations = 20;
|
int iterations = 0;
|
||||||
|
|
||||||
double run_time0;
|
double run_time0;
|
||||||
std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
|
std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
|
||||||
|
|
@ -418,51 +375,31 @@ int main(void)
|
||||||
|
|
||||||
ggml_gallocr_free(allocr);
|
ggml_gallocr_free(allocr);
|
||||||
|
|
||||||
allocr = NULL;
|
|
||||||
|
|
||||||
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
|
|
||||||
|
|
||||||
//create the worst case graph for memory usage estimation
|
|
||||||
gf = build_graph_2(model);
|
|
||||||
|
|
||||||
// compute the required memory
|
|
||||||
ggml_gallocr_reserve(allocr, gf);
|
|
||||||
size_t mem_size2 = ggml_gallocr_get_buffer_size(allocr, 0);
|
|
||||||
// fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
|
|
||||||
|
|
||||||
|
|
||||||
struct ggml_cgraph * gf_res_2 = NULL;
|
|
||||||
|
|
||||||
double run_time2;
|
|
||||||
std::vector<float> wino_data = compute_graph(model, allocr, build_graph_2, iterations, &run_time2);
|
|
||||||
|
|
||||||
|
|
||||||
if(k==0) {
|
if(k==0) {
|
||||||
k = 1;
|
k = 1;
|
||||||
fprintf(stderr, "| (IC, OC, IW, IH) | im2col+GEMM TIME | im2col+GEMM VRAM | direct TIME | direct VRAM | implicit GEMM TIME | implicit GEMM VRAM \n");
|
fprintf(stderr, "| (IC, OC, IW, IH) | im2col+GEMM TIME | im2col+GEMM VRAM | implicit GEMM TIME | implicit GEMM VRAM \n");
|
||||||
fprintf(stderr, "| --- | --- | --- | --- | --- | --- | --- \n");
|
fprintf(stderr, "| --- | --- | --- | --- | --- \n");
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, " | (%d, %d, %d, %d, %d, %d) | %.2f ms | %.2f MB | %.2f ms | %.2f MB | %.2f ms | %.2f MB\n",
|
fprintf(stderr, " | (%d, %d, %d, %d) | %.2f ms | %.2f MB | %.2f ms | %.2f MB\n",
|
||||||
std::get<0>(c), std::get<1>(c), std::get<2>(c), std::get<3>(c), std::get<4>(c), std::get<5>(c),
|
std::get<0>(c), std::get<1>(c), std::get<2>(c), std::get<3>(c), std::get<4>(c), std::get<5>(c),
|
||||||
run_time0, mem_size0/1024.0f/1024.0f,
|
run_time0, mem_size0/1024.0f/1024.0f,
|
||||||
run_time1, mem_size1/1024.0f/1024.0f,
|
run_time1, mem_size1/1024.0f/1024.0f
|
||||||
run_time2, mem_size2/1024.0f/1024.0f);
|
);
|
||||||
|
|
||||||
|
|
||||||
// for(int i = 0; i < ggml_nelements(wino_res); i++) {
|
// for(int i = 0; i < ggml_nelements(wino_res); i++) {
|
||||||
// for(int i = 0; i < 26*38; i++) {
|
// for(int i = 0; i < 26*38; i++) {
|
||||||
// for(int i = 0; i < conv2d_data.size(); i++) {
|
for(int i = 0; i < conv2d_data.size(); i++) {
|
||||||
// // float diff = fabs(conv2d_data[i] - wino_data[i]);
|
// float diff = fabs(conv2d_data[i] - wino_data[i]);
|
||||||
// float diff = fabs(im2col_data[i] - wino_data[i]);
|
float diff = fabs(im2col_data[i] - conv2d_data[i]);
|
||||||
// float diff1 = fabs(im2col_data[i] - conv2d_data[i]);
|
// if(diff > 0.5) {
|
||||||
// // if(diff > 0.5) {
|
printf("(%7.3f, %7.3f, %.2f, %d) \n",
|
||||||
// printf("(%7.3f, %7.3f, %7.3f, %.2f, %.2f, %d) \n",
|
im2col_data[i], conv2d_data[i],
|
||||||
// im2col_data[i], conv2d_data[i],
|
diff, i);
|
||||||
// wino_data[i], diff, diff1, i);
|
// break;
|
||||||
// // break;
|
// }
|
||||||
// // }
|
}
|
||||||
// }
|
|
||||||
|
|
||||||
ggml_free(model.ctx);
|
ggml_free(model.ctx);
|
||||||
ggml_backend_buffer_free(model.buffer);
|
ggml_backend_buffer_free(model.buffer);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue