add support for both NCHW and NHWC layouts

This commit is contained in:
bssrdf 2025-10-14 14:24:35 -04:00
parent 3e2f722d11
commit b70cca2ea3
6 changed files with 118 additions and 44 deletions

View File

@ -1992,7 +1992,8 @@ extern "C" {
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
int d1,
int layout); // tensor layout: 0 = NHWC, 1 = NCHW
GGML_API struct ggml_tensor * ggml_conv_3d_direct(

View File

@ -731,15 +731,15 @@ static __global__ void conv2d_implicit_kernel(const float * __restrict__ input,
conv_shapes[][NUM_VARIANTS]: ne_input=[192,256,512,1], ne_kernel=[3,3,512,512]
*/
constexpr static int conv_shapes[][NUM_VARIANTS] = {
{ 128, 128, 128 }, // BM
{ 256, 128, 256 }, // BN
{ 8, 8, 8 }, // BK
{ 128, 64, 32 }, // WM
{ 32, 32 , 256 }, // WN
{ 2, 2, 1 }, // WNITER
{ 8, 4, 8 }, // TM
{ 8, 4, 4 }, // TN
{ 256, 256, 128} // NUM_THREADS
{ 128, 128, 128, 256 }, // BM
{ 256, 128, 256, 128 }, // BN
{ 8, 8, 8, 8 }, // BK
{ 128, 64, 32, 128 }, // WM
{ 32, 32 , 256, 32 }, // WN
{ 2, 2, 1, 1 }, // WNITER
{ 8, 4, 4, 4 }, // TM
{ 8, 4, 8, 8 }, // TN
{ 256, 256, 128, 256} // NUM_THREADS
};
template <typename T, unsigned int CONV_SHAPE>
@ -763,16 +763,29 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,
dim3 thblock(NUM_THREADS, thready, threadz);
dim3 grid(blockx, blocky, blockz);
// int smem_size = 24 * 1024;
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
if(P.c % 4 == 0){
if(P.layout == 0)
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
WNITER, TM, TN, NUM_THREADS, 0, true, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
else if(P.layout == 1)
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
} else{
if(P.layout == 0)
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
WNITER, TM, TN, NUM_THREADS, 0, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
else if(P.layout == 1)
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
}
}
static void conv2d_implicit_cuda_f16(const float * X_D, const half * K_D, float * Y_D, const param_t P, cudaStream_t st) {
conv2d_implicit_cuda<half, 0>(X_D, K_D, Y_D, P, st);
conv2d_implicit_cuda<half, 3>(X_D, K_D, Y_D, P, st);
}
static void conv2d_implicit_cuda_f32(const float * X_D, const float * K_D, float * Y_D, const param_t P, cudaStream_t st) {
conv2d_implicit_cuda<float, 0>(X_D, K_D, Y_D, P, st);
conv2d_implicit_cuda<float, 3>(X_D, K_D, Y_D, P, st);
}
void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -785,8 +798,6 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
GGML_ASSERT(ggml_is_contiguous(kernel));
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
// same number of input channels
GGML_ASSERT(input->ne[2] == kernel->ne[2]);
cudaStream_t st = ctx.stream();
@ -797,17 +808,30 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
const int PD_Y = p[3]; // padding_y
const int DL_X = p[4]; // dilation_x
const int DL_Y = p[5]; // dilation_y
const int LT = p[6]; // layout
GGML_ASSERT(LT == 0 || LT == 1);
// same number of input channels
GGML_ASSERT(LT == 0 ? input->ne[0] == kernel->ne[0] : input->ne[2] == kernel->ne[2]);
// No cwhn
GGML_ASSERT(p[6] == false);
GGML_ASSERT(p[7] == false);
const int IW = input->ne[0]; // input_w
const int IH = input->ne[1]; // input_h
// const int IW = input->ne[0]; // input_w
// const int IH = input->ne[1]; // input_h
// const int OW = dst->ne[0]; // output_w
// const int OH = dst->ne[1]; // output_h
// const int KW = kernel->ne[0]; // kernel_w
// const int KH = kernel->ne[1]; // kernel_h
// const int IC = input->ne[2]; // input_channels
const int IW = input->ne[LT == 0 ? 1 : 0]; // input_w
const int IH = input->ne[LT == 0 ? 2 : 1]; // input_h
const int OW = dst->ne[0]; // output_w
const int OH = dst->ne[1]; // output_h
const int KW = kernel->ne[0]; // kernel_w
const int KH = kernel->ne[1]; // kernel_h
const int IC = input->ne[2]; // input_channels
const int KW = kernel->ne[LT == 0 ? 1 : 0]; // kernel_w
const int KH = kernel->ne[LT == 0 ? 2 : 1]; // kernel_h
const int IC = input->ne[LT == 0 ? 0: 2]; // input_channels
const int OC = kernel->ne[3]; // output_channels
const int B = input->ne[3]; // n_batches
@ -819,7 +843,7 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
params.C_fastdiv = init_fastdiv_values(IC);
params.RS_fastdiv = init_fastdiv_values(KW*KH);
params.S_fastdiv = init_fastdiv_values(KW);
params.nchw = false;
params.layout = LT;
if (kernel->type == GGML_TYPE_F16) {
conv2d_implicit_cuda_f16(X_D, (half *) K_D, Y_D, params, st);

View File

@ -17,7 +17,7 @@ typedef struct{
unsigned int d_w; //dilation width
unsigned int Oh; //output height
unsigned int Ow; //output width
bool nchw;
unsigned int layout;
uint3 SC_fastdiv;
uint3 OW_fastdiv;
uint3 C_fastdiv;

View File

@ -4584,7 +4584,9 @@ struct ggml_tensor * ggml_conv_2d_implicitgemm(
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1) {// dilation dimension 1
int d1,
// tensor layout: 0 = NHWC, 1 = NCHW
int layout) {
GGML_ASSERT(a->ne[2] == b->ne[2]);
//GGML_ASSERT(a->type == b->type);
@ -4603,10 +4605,20 @@ struct ggml_tensor * ggml_conv_2d_implicitgemm(
ggml_set_op_params_i32(result, 3, p1);
ggml_set_op_params_i32(result, 4, d0);
ggml_set_op_params_i32(result, 5, d1);
ggml_set_op_params_i32(result, 6, layout);
struct ggml_tensor *ap, *bp;
if(layout == 0){
ap = ggml_cont(ctx, ggml_permute(ctx, a, 1, 2, 0, 3));
bp = ggml_cont(ctx, ggml_permute(ctx, b, 1, 2, 0, 3));
} else{
ap = a;
bp = b;
}
result->op = GGML_OP_CONV_2D_IMPLICIT;
result->src[0] = a;
result->src[1] = b;
result->src[0] = ap;
result->src[1] = bp;
return result;
}

View File

@ -4268,7 +4268,7 @@ struct test_conv_2d_implicit : public test_case {
}
ggml_tensor * out =
ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1);
ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn?0:1);
ggml_set_name(out, "out");
return out;
}
@ -6788,6 +6788,45 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, false));
}
for (auto act_case : cases_sd) {
GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
test_cases.emplace_back(new test_conv_2d_implicit(
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, false));
}
for (auto act_case : cases_sd) {
GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
test_cases.emplace_back(new test_conv_2d_implicit(
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, true));
}
for (auto act_case : cases_sd) {
GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
test_cases.emplace_back(new test_conv_2d_implicit(
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, true));
}
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));

View File

@ -262,7 +262,7 @@ struct ggml_cgraph * build_graph_2(const test_model& model) {
// printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, 0);
// struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
ggml_set_name(wino_res, "wino_res");
ggml_build_forward_expand(gf, wino_res);
@ -339,20 +339,20 @@ int main(void)
{
ggml_time_init();
std::vector<std::tuple<int, int, int, int>> configs = {
// std::make_tuple(64,64,48,64),
// std::make_tuple(320,320,104,152),
// std::make_tuple(640,640,52,76),
// std::make_tuple(640,640,104,152),
// std::make_tuple(960,320,104,152),
// std::make_tuple(1280,1280,26,38),
// std::make_tuple(1280,640,52,76),
// std::make_tuple(1920,1280,26,38),
// std::make_tuple(2560,1280,26,38),
// std::make_tuple(512,512,104,152),
// std::make_tuple(512,512,208,304),
std::make_tuple(64,64,48,64),
std::make_tuple(320,320,104,152),
std::make_tuple(640,640,52,76),
std::make_tuple(640,640,104,152),
std::make_tuple(960,320,104,152),
std::make_tuple(1280,1280,26,38),
std::make_tuple(1280,640,52,76),
std::make_tuple(1920,1280,26,38),
std::make_tuple(2560,1280,26,38),
std::make_tuple(512,512,104,152),
std::make_tuple(512,512,208,304),
std::make_tuple(512,256,416,608),
// std::make_tuple(256,128,832,1216),
// std::make_tuple(256,256,832,1216),
std::make_tuple(256,128,832,1216),
std::make_tuple(256,256,832,1216),
// std::make_tuple(320,256,1024,1920)
};
@ -453,8 +453,6 @@ int main(void)
}
// printf("\nPerforming test:\n");
return 0;
}