diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index b558b29874..3999acbd4e 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1992,7 +1992,8 @@ extern "C" {
                     int                   p0,      // padding dimension 0
                     int                   p1,      // padding dimension 1
                     int                   d0,      // dilation dimension 0
-                    int                   d1);     // dilation dimension 1
+                    int                   d1,      // dilation dimension 1
+                    int                   layout); // 0: NHWC, 1: NCHW
 
     GGML_API struct ggml_tensor * ggml_conv_3d_direct(
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index 8cd17aff84..a1693dcf24 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -731,15 +731,15 @@ static __global__ void conv2d_implicit_kernel(const float * __restrict__ input,
    conv_shapes[][2]: ne_input=[192,256,512,1],ne_kernel=[3,3,512,512]
 */
 constexpr static int conv_shapes[][NUM_VARIANTS] = {
-    { 128, 128, 128 },      // BM
-    { 256, 128, 256 },      // BN
-    {   8,   8,   8 },      // BK
-    { 128,  64,  32 },      // WM
-    {  32,  32, 256 },      // WN
-    {   2,   2,   1 },      // WNITER
-    {   8,   4,   8 },      // TM
-    {   8,   4,   4 },      // TN
-    { 256, 256, 128 }       // NUM_THREADS
+    { 128, 128, 128, 256 }, // BM
+    { 256, 128, 256, 128 }, // BN
+    {   8,   8,   8,   8 }, // BK
+    { 128,  64,  32, 128 }, // WM
+    {  32,  32, 256,  32 }, // WN
+    {   2,   2,   1,   1 }, // WNITER
+    {   8,   4,   4,   4 }, // TM
+    {   8,   4,   8,   8 }, // TN
+    { 256, 256, 128, 256 }  // NUM_THREADS
 };
 
 template
@@ -763,16 +763,29 @@
     dim3 thblock(NUM_THREADS, thready, threadz);
     dim3 grid(blockx, blocky, blockz);
     // int smem_size = 24 * 1024;
-    conv2d_implicit_kernel<<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
+    if (P.c % 4 == 0) {
+        if (P.layout == 0)
+            conv2d_implicit_kernel<<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
+        else if (P.layout == 1)
+            conv2d_implicit_kernel<<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
+    } else {
+        if (P.layout == 0)
+            conv2d_implicit_kernel<<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
+        else if (P.layout == 1)
+            conv2d_implicit_kernel<<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
+    }
 }
 
 static void conv2d_implicit_cuda_f16(const float * X_D, const half * K_D, float * Y_D, const param_t P, cudaStream_t st) {
-    conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
+    conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
 }
 
 static void conv2d_implicit_cuda_f32(const float * X_D, const float * K_D, float * Y_D, const param_t P, cudaStream_t st) {
-    conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
+    conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
 }
 
 void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -785,8 +798,6 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
     GGML_ASSERT(ggml_is_contiguous(kernel));
     GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
 
-    // same number of input channels
-    GGML_ASSERT(input->ne[2] == kernel->ne[2]);
 
     cudaStream_t st = ctx.stream();
 
@@ -797,17 +808,23 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
     const int PD_Y = p[3]; // padding_y
     const int DL_X = p[4]; // dilation_x
     const int DL_Y = p[5]; // dilation_y
+    const int LT   = p[6]; // layout
+    GGML_ASSERT(LT == 0 || LT == 1);
+
+    // same number of input channels
+    GGML_ASSERT(LT == 0 ? input->ne[0] == kernel->ne[0] : input->ne[2] == kernel->ne[2]);
 
     // No cwhn
-    GGML_ASSERT(p[6] == false);
+    GGML_ASSERT(p[7] == false);
 
-    const int IW = input->ne[0]; // input_w
-    const int IH = input->ne[1]; // input_h
+    const int IW = input->ne[LT == 0 ? 1 : 0]; // input_w
+    const int IH = input->ne[LT == 0 ? 2 : 1]; // input_h
     const int OW = dst->ne[0]; // output_w
     const int OH = dst->ne[1]; // output_h
-    const int KW = kernel->ne[0]; // kernel_w
-    const int KH = kernel->ne[1]; // kernel_h
-    const int IC = input->ne[2]; // input_channels
+    const int KW = kernel->ne[LT == 0 ? 1 : 0]; // kernel_w
+    const int KH = kernel->ne[LT == 0 ? 2 : 1]; // kernel_h
+    const int IC = input->ne[LT == 0 ? 0 : 2]; // input_channels
     const int OC = kernel->ne[3]; // output_channels
     const int B  = input->ne[3]; // n_batches
 
@@ -819,7 +843,7 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
     params.C_fastdiv  = init_fastdiv_values(IC);
     params.RS_fastdiv = init_fastdiv_values(KW*KH);
     params.S_fastdiv  = init_fastdiv_values(KW);
-    params.nchw = false;
+    params.layout = LT;
 
     if (kernel->type == GGML_TYPE_F16) {
         conv2d_implicit_cuda_f16(X_D, (half *) K_D, Y_D, params, st);
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cuh b/ggml/src/ggml-cuda/conv2d-implicit.cuh
index d2f3cffcc3..e46d93ef4f 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh
@@ -17,7 +17,7 @@ typedef struct{
     unsigned int d_w; //dilation width
     unsigned int Oh;  //output height
     unsigned int Ow;  //output width
-    bool nchw;
+    unsigned int layout; // 0: NHWC, 1: NCHW
     uint3 SC_fastdiv;
     uint3 OW_fastdiv;
     uint3 C_fastdiv;
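Note on the dimension handling above: with LT == 0 (NHWC) the channels live in ne[0] and the spatial extents shift up one slot, while LT == 1 (NCHW) keeps ggml's usual [W, H, C, N] ordering. A minimal standalone sketch of that mapping, for reference only; the struct and helper names here are illustrative, not part of the patch:

    #include <cstdint>

    // Illustrative helper: map ggml ne[] to conv2d input dims per layout.
    // layout == 0 (NHWC): ne = [C, W, H, N]; layout == 1 (NCHW): ne = [W, H, C, N].
    struct conv2d_in_dims { int64_t IW, IH, IC, B; };

    static conv2d_in_dims conv2d_input_dims(const int64_t ne[4], int layout) {
        conv2d_in_dims d;
        if (layout == 0) { // NHWC: channels are the fastest-varying dimension
            d.IC = ne[0]; d.IW = ne[1]; d.IH = ne[2];
        } else {           // NCHW: width is the fastest-varying dimension
            d.IW = ne[0]; d.IH = ne[1]; d.IC = ne[2];
        }
        d.B = ne[3];
        return d;
    }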
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1c746c687a..7fa97e84de 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4584,7 +4584,9 @@ struct ggml_tensor * ggml_conv_2d_implicitgemm(
         int                   p0, // padding dimension 0
         int                   p1, // padding dimension 1
         int                   d0, // dilation dimension 0
-        int                   d1) { // dilation dimension 1
+        int                   d1, // dilation dimension 1
+        // 0: NHWC, 1: NCHW
+        int                   layout) {
     GGML_ASSERT(a->ne[2] == b->ne[2]);
     //GGML_ASSERT(a->type == b->type);
@@ -4603,10 +4605,20 @@ struct ggml_tensor * ggml_conv_2d_implicitgemm(
     ggml_set_op_params_i32(result, 3, p1);
     ggml_set_op_params_i32(result, 4, d0);
     ggml_set_op_params_i32(result, 5, d1);
+    ggml_set_op_params_i32(result, 6, layout);
+
+    struct ggml_tensor * ap, * bp;
+    if (layout == 0) { // NHWC: permute [W, H, C, N] -> [C, W, H, N] (channels fastest)
+        ap = ggml_cont(ctx, ggml_permute(ctx, a, 1, 2, 0, 3));
+        bp = ggml_cont(ctx, ggml_permute(ctx, b, 1, 2, 0, 3));
+    } else {
+        ap = a;
+        bp = b;
+    }
 
     result->op     = GGML_OP_CONV_2D_IMPLICIT;
-    result->src[0] = a;
-    result->src[1] = b;
+    result->src[0] = ap;
+    result->src[1] = bp;
 
     return result;
 }
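As a usage sketch of the extended signature (operands follow the same order as the tests: kernel first, then input; the sizes are arbitrary and the helper name is illustrative):

    #include "ggml.h"

    // Sketch: 3x3 conv, 256 -> 512 channels, 64x64 NCHW input (layout == 1).
    // With layout == 0 the op permutes both operands to NHWC internally.
    static struct ggml_tensor * conv_example(struct ggml_context * ctx) {
        struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 256,   1); // W, H, Cin, N
        struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,  3,  3, 256, 512); // KW, KH, Cin, Cout
        return ggml_conv_2d_implicitgemm(ctx, kernel, input,
                                         1, 1,  // s0, s1
                                         1, 1,  // p0, p1
                                         1, 1,  // d0, d1
                                         1);    // layout: 1 = NCHW
    }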
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 3c4388f8a5..49a5688acd 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4268,7 +4268,7 @@ struct test_conv_2d_implicit : public test_case {
         }
 
         ggml_tensor * out =
-            ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1);
+            ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn ? 0 : 1);
         ggml_set_name(out, "out");
         return out;
     }
@@ -6788,6 +6788,45 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
             GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, false));
     }
 
+    for (auto act_case : cases_sd) {
+        GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
+        GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
+
+        uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
+        uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
+
+        test_cases.emplace_back(new test_conv_2d_implicit(
+            { act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
+            { act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
+            GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, false));
+    }
+
+    for (auto act_case : cases_sd) {
+        GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
+        GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
+
+        uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
+        uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
+
+        test_cases.emplace_back(new test_conv_2d_implicit(
+            { act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
+            { act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
+            GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, true));
+    }
+
+    for (auto act_case : cases_sd) {
+        GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
+        GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
+
+        uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
+        uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
+
+        test_cases.emplace_back(new test_conv_2d_implicit(
+            { act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
+            { act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
+            GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, true));
+    }
+
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
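The padding choice above (p = 1 for a 3x3 kernel, p = 0 for 1x1) keeps the spatial size unchanged at stride 1 and dilation 1, which is what the SD shapes expect. A quick check against the standard conv2d output-size formula, illustrative only and not part of the tests:

    #include <cassert>

    // OW = (IW + 2*p - d*(KW - 1) - 1) / s + 1
    static int conv_out_size(int in, int k, int s, int p, int d) {
        return (in + 2 * p - d * (k - 1) - 1) / s + 1;
    }

    int main() {
        assert(conv_out_size(104, 3, 1, 1, 1) == 104); // 3x3, p = 1: size preserved
        assert(conv_out_size(104, 1, 1, 0, 1) == 104); // 1x1, p = 0: size preserved
        return 0;
    }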
diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp
index 0ac438a137..0b8de368d1 100644
--- a/tests/test-conv2d-implicit.cpp
+++ b/tests/test-conv2d-implicit.cpp
@@ -262,7 +262,7 @@ struct ggml_cgraph * build_graph_2(const test_model& model) {
 
     // printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
 
-    struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
+    struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, 0);
     // struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
     ggml_set_name(wino_res, "wino_res");
     ggml_build_forward_expand(gf, wino_res);
@@ -339,20 +339,20 @@ int main(void)
 {
     ggml_time_init();
 
     std::vector<std::tuple<int, int, int, int>> configs = {
-        // std::make_tuple(64,64,48,64),
-        // std::make_tuple(320,320,104,152),
-        // std::make_tuple(640,640,52,76),
-        // std::make_tuple(640,640,104,152),
-        // std::make_tuple(960,320,104,152),
-        // std::make_tuple(1280,1280,26,38),
-        // std::make_tuple(1280,640,52,76),
-        // std::make_tuple(1920,1280,26,38),
-        // std::make_tuple(2560,1280,26,38),
-        // std::make_tuple(512,512,104,152),
-        // std::make_tuple(512,512,208,304),
+        std::make_tuple(64,64,48,64),
+        std::make_tuple(320,320,104,152),
+        std::make_tuple(640,640,52,76),
+        std::make_tuple(640,640,104,152),
+        std::make_tuple(960,320,104,152),
+        std::make_tuple(1280,1280,26,38),
+        std::make_tuple(1280,640,52,76),
+        std::make_tuple(1920,1280,26,38),
+        std::make_tuple(2560,1280,26,38),
+        std::make_tuple(512,512,104,152),
+        std::make_tuple(512,512,208,304),
         std::make_tuple(512,256,416,608),
-        // std::make_tuple(256,128,832,1216),
-        // std::make_tuple(256,256,832,1216),
+        std::make_tuple(256,128,832,1216),
+        std::make_tuple(256,256,832,1216),
         // std::make_tuple(320,256,1024,1920)
     };
 
@@ -453,8 +453,6 @@ int main(void)
     }
 
-    // printf("\nPerforming test:\n");
-
     return 0;
 }
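For reference on the conv_shapes table extended earlier: each column is one kernel variant, decomposed into a block tile (BM x BN, stepped BK at a time), warp tiles (WM x WN), and per-thread tiles (TM x TN). Assuming the kernel derives its per-warp M iterations the way warp-tiled GEMM kernels commonly do (an assumption, not something the patch states), the new fourth column can be sanity-checked like this:

    // Illustrative consistency check for one conv_shapes column. The WMITER
    // formula is assumed from common warp-tiled GEMM designs, not taken from
    // the kernel itself.
    constexpr int WARPSIZE = 32;

    constexpr bool variant_ok(int BM, int BN, int WM, int WN, int WNITER,
                              int TM, int TN, int NUM_THREADS) {
        return NUM_THREADS % WARPSIZE == 0
            && (BM / WM) * (BN / WN) == NUM_THREADS / WARPSIZE   // warps tile the block tile
            && (WM * WN) % (WARPSIZE * TM * TN * WNITER) == 0;   // WMITER comes out integral
    }

    // New fourth variant: BM=256, BN=128, WM=128, WN=32, WNITER=1, TM=4, TN=8, 256 threads.
    static_assert(variant_ok(256, 128, 128, 32, 1, 4, 8, 256), "variant 3 tiling mismatch");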