add support for both NCHW and NHWC layouts
This commit is contained in:
parent
3e2f722d11
commit
b70cca2ea3
|
|
@ -1992,7 +1992,8 @@ extern "C" {
|
|||
int p0, // padding dimension 0
|
||||
int p1, // padding dimension 1
|
||||
int d0, // dilation dimension 0
|
||||
int d1); // dilation dimension 1
|
||||
int d1,
|
||||
int layout); // dilation dimension 1
|
||||
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
|
||||
|
|
|
|||
|
|
@ -731,15 +731,15 @@ static __global__ void conv2d_implicit_kernel(const float * __restrict__ input,
|
|||
conv_shapes[][2]: ne_input=[192,256,512,1git diff],ne_kernel=[3,3,512,512]
|
||||
*/
|
||||
constexpr static int conv_shapes[][NUM_VARIANTS] = {
|
||||
{ 128, 128, 128 }, // BM
|
||||
{ 256, 128, 256 }, // BN
|
||||
{ 8, 8, 8 }, // BK
|
||||
{ 128, 64, 32 }, // WM
|
||||
{ 32, 32 , 256 }, // WN
|
||||
{ 2, 2, 1 }, // WNITER
|
||||
{ 8, 4, 8 }, // TM
|
||||
{ 8, 4, 4 }, // TN
|
||||
{ 256, 256, 128} // NUM_THREADS
|
||||
{ 128, 128, 128, 256 }, // BM
|
||||
{ 256, 128, 256, 128 }, // BN
|
||||
{ 8, 8, 8, 8 }, // BK
|
||||
{ 128, 64, 32, 128 }, // WM
|
||||
{ 32, 32 , 256, 32 }, // WN
|
||||
{ 2, 2, 1, 1 }, // WNITER
|
||||
{ 8, 4, 4, 4 }, // TM
|
||||
{ 8, 4, 8, 8 }, // TN
|
||||
{ 256, 256, 128, 256} // NUM_THREADS
|
||||
};
|
||||
|
||||
template <typename T, unsigned int CONV_SHAPE>
|
||||
|
|
@ -763,16 +763,29 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,
|
|||
dim3 thblock(NUM_THREADS, thready, threadz);
|
||||
dim3 grid(blockx, blocky, blockz);
|
||||
// int smem_size = 24 * 1024;
|
||||
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
|
||||
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
|
||||
if(P.c % 4 == 0){
|
||||
if(P.layout == 0)
|
||||
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
|
||||
WNITER, TM, TN, NUM_THREADS, 0, true, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
|
||||
else if(P.layout == 1)
|
||||
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
|
||||
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
|
||||
} else{
|
||||
if(P.layout == 0)
|
||||
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
|
||||
WNITER, TM, TN, NUM_THREADS, 0, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
|
||||
else if(P.layout == 1)
|
||||
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
|
||||
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
|
||||
}
|
||||
}
|
||||
|
||||
static void conv2d_implicit_cuda_f16(const float * X_D, const half * K_D, float * Y_D, const param_t P, cudaStream_t st) {
|
||||
conv2d_implicit_cuda<half, 0>(X_D, K_D, Y_D, P, st);
|
||||
conv2d_implicit_cuda<half, 3>(X_D, K_D, Y_D, P, st);
|
||||
}
|
||||
|
||||
static void conv2d_implicit_cuda_f32(const float * X_D, const float * K_D, float * Y_D, const param_t P, cudaStream_t st) {
|
||||
conv2d_implicit_cuda<float, 0>(X_D, K_D, Y_D, P, st);
|
||||
conv2d_implicit_cuda<float, 3>(X_D, K_D, Y_D, P, st);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
|
|
@ -785,8 +798,6 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|||
GGML_ASSERT(ggml_is_contiguous(kernel));
|
||||
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
|
||||
|
||||
// same number of input channels
|
||||
GGML_ASSERT(input->ne[2] == kernel->ne[2]);
|
||||
|
||||
cudaStream_t st = ctx.stream();
|
||||
|
||||
|
|
@ -797,17 +808,30 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|||
const int PD_Y = p[3]; // padding_y
|
||||
const int DL_X = p[4]; // dilation_x
|
||||
const int DL_Y = p[5]; // dilation_y
|
||||
const int LT = p[6]; // layout
|
||||
|
||||
GGML_ASSERT(LT == 0 || LT == 1);
|
||||
|
||||
// same number of input channels
|
||||
GGML_ASSERT(LT == 0 ? input->ne[0] == kernel->ne[0] : input->ne[2] == kernel->ne[2]);
|
||||
// No cwhn
|
||||
GGML_ASSERT(p[6] == false);
|
||||
GGML_ASSERT(p[7] == false);
|
||||
|
||||
const int IW = input->ne[0]; // input_w
|
||||
const int IH = input->ne[1]; // input_h
|
||||
// const int IW = input->ne[0]; // input_w
|
||||
// const int IH = input->ne[1]; // input_h
|
||||
// const int OW = dst->ne[0]; // output_w
|
||||
// const int OH = dst->ne[1]; // output_h
|
||||
// const int KW = kernel->ne[0]; // kernel_w
|
||||
// const int KH = kernel->ne[1]; // kernel_h
|
||||
// const int IC = input->ne[2]; // input_channels
|
||||
const int IW = input->ne[LT == 0 ? 1 : 0]; // input_w
|
||||
const int IH = input->ne[LT == 0 ? 2 : 1]; // input_h
|
||||
const int OW = dst->ne[0]; // output_w
|
||||
const int OH = dst->ne[1]; // output_h
|
||||
const int KW = kernel->ne[0]; // kernel_w
|
||||
const int KH = kernel->ne[1]; // kernel_h
|
||||
const int IC = input->ne[2]; // input_channels
|
||||
const int KW = kernel->ne[LT == 0 ? 1 : 0]; // kernel_w
|
||||
const int KH = kernel->ne[LT == 0 ? 2 : 1]; // kernel_h
|
||||
const int IC = input->ne[LT == 0 ? 0: 2]; // input_channels
|
||||
|
||||
const int OC = kernel->ne[3]; // ouptut_chanles
|
||||
const int B = input->ne[3]; // n_batches
|
||||
|
||||
|
|
@ -819,7 +843,7 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|||
params.C_fastdiv = init_fastdiv_values(IC);
|
||||
params.RS_fastdiv = init_fastdiv_values(KW*KH);
|
||||
params.S_fastdiv = init_fastdiv_values(KW);
|
||||
params.nchw = false;
|
||||
params.layout = LT;
|
||||
|
||||
if (kernel->type == GGML_TYPE_F16) {
|
||||
conv2d_implicit_cuda_f16(X_D, (half *) K_D, Y_D, params, st);
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ typedef struct{
|
|||
unsigned int d_w; //dilation width
|
||||
unsigned int Oh; //output height
|
||||
unsigned int Ow; //output width
|
||||
bool nchw;
|
||||
unsigned int layout;
|
||||
uint3 SC_fastdiv;
|
||||
uint3 OW_fastdiv;
|
||||
uint3 C_fastdiv;
|
||||
|
|
|
|||
|
|
@ -4584,7 +4584,9 @@ struct ggml_tensor * ggml_conv_2d_implicitgemm(
|
|||
int p0, // padding dimension 0
|
||||
int p1, // padding dimension 1
|
||||
int d0, // dilation dimension 0
|
||||
int d1) {// dilation dimension 1
|
||||
int d1,
|
||||
// 0: NHWC, 1:NCHW
|
||||
int layout) {// dilation dimension 1
|
||||
|
||||
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
||||
//GGML_ASSERT(a->type == b->type);
|
||||
|
|
@ -4603,10 +4605,20 @@ struct ggml_tensor * ggml_conv_2d_implicitgemm(
|
|||
ggml_set_op_params_i32(result, 3, p1);
|
||||
ggml_set_op_params_i32(result, 4, d0);
|
||||
ggml_set_op_params_i32(result, 5, d1);
|
||||
ggml_set_op_params_i32(result, 6, layout);
|
||||
|
||||
struct ggml_tensor *ap, *bp;
|
||||
if(layout == 0){
|
||||
ap = ggml_cont(ctx, ggml_permute(ctx, a, 1, 2, 0, 3));
|
||||
bp = ggml_cont(ctx, ggml_permute(ctx, b, 1, 2, 0, 3));
|
||||
} else{
|
||||
ap = a;
|
||||
bp = b;
|
||||
}
|
||||
|
||||
result->op = GGML_OP_CONV_2D_IMPLICIT;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
result->src[0] = ap;
|
||||
result->src[1] = bp;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4268,7 +4268,7 @@ struct test_conv_2d_implicit : public test_case {
|
|||
}
|
||||
|
||||
ggml_tensor * out =
|
||||
ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1);
|
||||
ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn?0:1);
|
||||
ggml_set_name(out, "out");
|
||||
return out;
|
||||
}
|
||||
|
|
@ -6788,6 +6788,45 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
|||
GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, false));
|
||||
}
|
||||
|
||||
for (auto act_case : cases_sd) {
|
||||
GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
|
||||
GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
|
||||
|
||||
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
|
||||
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
|
||||
|
||||
test_cases.emplace_back(new test_conv_2d_implicit(
|
||||
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
|
||||
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
|
||||
GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, false));
|
||||
}
|
||||
|
||||
for (auto act_case : cases_sd) {
|
||||
GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
|
||||
GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
|
||||
|
||||
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
|
||||
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
|
||||
|
||||
test_cases.emplace_back(new test_conv_2d_implicit(
|
||||
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
|
||||
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
|
||||
GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, true));
|
||||
}
|
||||
|
||||
for (auto act_case : cases_sd) {
|
||||
GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
|
||||
GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
|
||||
|
||||
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
|
||||
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
|
||||
|
||||
test_cases.emplace_back(new test_conv_2d_implicit(
|
||||
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
|
||||
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
|
||||
GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, true));
|
||||
}
|
||||
|
||||
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
|
||||
|
|
|
|||
|
|
@ -262,7 +262,7 @@ struct ggml_cgraph * build_graph_2(const test_model& model) {
|
|||
// printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
|
||||
|
||||
|
||||
struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
|
||||
struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, 0);
|
||||
// struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
|
||||
ggml_set_name(wino_res, "wino_res");
|
||||
ggml_build_forward_expand(gf, wino_res);
|
||||
|
|
@ -339,20 +339,20 @@ int main(void)
|
|||
{
|
||||
ggml_time_init();
|
||||
std::vector<std::tuple<int, int, int, int>> configs = {
|
||||
// std::make_tuple(64,64,48,64),
|
||||
// std::make_tuple(320,320,104,152),
|
||||
// std::make_tuple(640,640,52,76),
|
||||
// std::make_tuple(640,640,104,152),
|
||||
// std::make_tuple(960,320,104,152),
|
||||
// std::make_tuple(1280,1280,26,38),
|
||||
// std::make_tuple(1280,640,52,76),
|
||||
// std::make_tuple(1920,1280,26,38),
|
||||
// std::make_tuple(2560,1280,26,38),
|
||||
// std::make_tuple(512,512,104,152),
|
||||
// std::make_tuple(512,512,208,304),
|
||||
std::make_tuple(64,64,48,64),
|
||||
std::make_tuple(320,320,104,152),
|
||||
std::make_tuple(640,640,52,76),
|
||||
std::make_tuple(640,640,104,152),
|
||||
std::make_tuple(960,320,104,152),
|
||||
std::make_tuple(1280,1280,26,38),
|
||||
std::make_tuple(1280,640,52,76),
|
||||
std::make_tuple(1920,1280,26,38),
|
||||
std::make_tuple(2560,1280,26,38),
|
||||
std::make_tuple(512,512,104,152),
|
||||
std::make_tuple(512,512,208,304),
|
||||
std::make_tuple(512,256,416,608),
|
||||
// std::make_tuple(256,128,832,1216),
|
||||
// std::make_tuple(256,256,832,1216),
|
||||
std::make_tuple(256,128,832,1216),
|
||||
std::make_tuple(256,256,832,1216),
|
||||
// std::make_tuple(320,256,1024,1920)
|
||||
};
|
||||
|
||||
|
|
@ -453,8 +453,6 @@ int main(void)
|
|||
|
||||
}
|
||||
|
||||
|
||||
// printf("\nPerforming test:\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue