diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index b558b29874..3999acbd4e 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1992,7 +1992,8 @@ extern "C" {
                     int                   p0,      // padding dimension 0
                     int                   p1,      // padding dimension 1
                     int                   d0,      // dilation dimension 0
-                    int                   d1);     // dilation dimension 1
+                    int                   d1,      // dilation dimension 1
+                    int                   layout); // 0: NHWC, 1: NCHW
 
     GGML_API struct ggml_tensor * ggml_conv_3d_direct(
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index 8cd17aff84..a1693dcf24 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -731,15 +731,15 @@ static __global__ void conv2d_implicit_kernel(const float * __restrict__ input,
    conv_shapes[][2]: ne_input=[192,256,512,1],ne_kernel=[3,3,512,512]
 */
 constexpr static int conv_shapes[][NUM_VARIANTS] = {
-    { 128, 128, 128 },      // BM
-    { 256, 128, 256 },      // BN
-    {   8,   8,   8 },      // BK
-    { 128,  64,  32 },      // WM
-    {  32,  32, 256 },      // WN
-    {   2,   2,   1 },      // WNITER
-    {   8,   4,   8 },      // TM
-    {   8,   4,   4 },      // TN
-    { 256, 256, 128 }       // NUM_THREADS
+    { 128, 128, 128, 256 }, // BM
+    { 256, 128, 256, 128 }, // BN
+    {   8,   8,   8,   8 }, // BK
+    { 128,  64,  32, 128 }, // WM
+    {  32,  32, 256,  32 }, // WN
+    {   2,   2,   1,   1 }, // WNITER
+    {   8,   4,   4,   4 }, // TM
+    {   8,   4,   8,   8 }, // TN
+    { 256, 256, 128, 256 }  // NUM_THREADS
 };
 
 template
@@ -763,16 +763,29 @@
     dim3 thblock(NUM_THREADS, thready, threadz);
     dim3 grid(blockx, blocky, blockz);
     // int smem_size = 24 * 1024;
-    conv2d_implicit_kernel<<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
+    if (P.c % 4 == 0) {
+        if (P.layout == 0)
+            conv2d_implicit_kernel<<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
+        else if (P.layout == 1)
+            conv2d_implicit_kernel<<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
+    } else {
+        if (P.layout == 0)
+            conv2d_implicit_kernel<<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
+        else if (P.layout == 1)
+            conv2d_implicit_kernel<<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
+    }
 }
 
 static void conv2d_implicit_cuda_f16(const float * X_D, const half * K_D, float * Y_D, const param_t P, cudaStream_t st) {
-    conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
+    conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
 }
 
 static void conv2d_implicit_cuda_f32(const float * X_D, const float * K_D, float * Y_D, const param_t P, cudaStream_t st) {
-    conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
+    conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
 }
 
 void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -785,8 +798,6 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
     GGML_ASSERT(ggml_is_contiguous(kernel));
     GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
 
-    // same number of input channels
-    GGML_ASSERT(input->ne[2] == kernel->ne[2]);
 
     cudaStream_t st = ctx.stream();
 
@@ -797,17 +808,23 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
     const int PD_Y = p[3]; // padding_y
     const int DL_X = p[4]; // dilation_x
     const int DL_Y = p[5]; // dilation_y
+    const int LT   = p[6]; // layout
+    GGML_ASSERT(LT == 0 || LT == 1);
+
+    // same number of input channels
+    GGML_ASSERT(LT == 0 ? input->ne[0] == kernel->ne[0] : input->ne[2] == kernel->ne[2]);
 
     // No cwhn
-    GGML_ASSERT(p[6] == false);
+    GGML_ASSERT(p[7] == false);
 
-    const int IW = input->ne[0]; // input_w
-    const int IH = input->ne[1]; // input_h
+    const int IW = input->ne[LT == 0 ? 1 : 0]; // input_w
+    const int IH = input->ne[LT == 0 ? 2 : 1]; // input_h
     const int OW = dst->ne[0]; // output_w
     const int OH = dst->ne[1]; // output_h
-    const int KW = kernel->ne[0]; // kernel_w
-    const int KH = kernel->ne[1]; // kernel_h
-    const int IC = input->ne[2]; // input_channels
+    const int KW = kernel->ne[LT == 0 ? 1 : 0]; // kernel_w
+    const int KH = kernel->ne[LT == 0 ? 2 : 1]; // kernel_h
+    const int IC = input->ne[LT == 0 ? 0 : 2]; // input_channels
     const int OC = kernel->ne[3]; // output_channels
     const int B  = input->ne[3]; // n_batches
 
@@ -819,7 +843,7 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
     params.C_fastdiv  = init_fastdiv_values(IC);
     params.RS_fastdiv = init_fastdiv_values(KW*KH);
     params.S_fastdiv  = init_fastdiv_values(KW);
-    params.nchw = false;
+    params.layout = LT;
 
     if (kernel->type == GGML_TYPE_F16) {
         conv2d_implicit_cuda_f16(X_D, (half *) K_D, Y_D, params, st);
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cuh b/ggml/src/ggml-cuda/conv2d-implicit.cuh
index d2f3cffcc3..e46d93ef4f 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh
@@ -17,7 +17,7 @@ typedef struct{
     unsigned int d_w; //dilation width
     unsigned int Oh;  //output height
     unsigned int Ow;  //output width
-    bool nchw;
+    unsigned int layout; // 0: NHWC, 1: NCHW
     uint3 SC_fastdiv;
     uint3 OW_fastdiv;
     uint3 C_fastdiv;
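Note on the dimension handling above: with LT == 0 (NHWC) the channels live in ne[0] and the spatial extents shift up one slot, while LT == 1 (NCHW) keeps ggml's usual [W, H, C, N] ordering. A minimal standalone sketch of that mapping, for reference only; the struct and helper names here are illustrative, not part of the patch:

    #include <cstdint>

    // Illustrative helper: map ggml ne[] to conv2d input dims per layout.
    // layout == 0 (NHWC): ne = [C, W, H, N]; layout == 1 (NCHW): ne = [W, H, C, N].
    struct conv2d_in_dims { int64_t IW, IH, IC, B; };

    static conv2d_in_dims conv2d_input_dims(const int64_t ne[4], int layout) {
        conv2d_in_dims d;
        if (layout == 0) { // NHWC: channels are the fastest-varying dimension
            d.IC = ne[0]; d.IW = ne[1]; d.IH = ne[2];
        } else {           // NCHW: width is the fastest-varying dimension
            d.IW = ne[0]; d.IH = ne[1]; d.IC = ne[2];
        }
        d.B = ne[3];
        return d;
    }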
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1c746c687a..7fa97e84de 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4584,7 +4584,9 @@ struct ggml_tensor * ggml_conv_2d_implicitgemm(
         int                   p0, // padding dimension 0
         int                   p1, // padding dimension 1
         int                   d0, // dilation dimension 0
-        int                   d1) { // dilation dimension 1
+        int                   d1, // dilation dimension 1
+        // 0: NHWC, 1: NCHW
+        int                   layout) {
     GGML_ASSERT(a->ne[2] == b->ne[2]);
     //GGML_ASSERT(a->type == b->type);
@@ -4603,10 +4605,20 @@ struct ggml_tensor * ggml_conv_2d_implicitgemm(
     ggml_set_op_params_i32(result, 3, p1);
     ggml_set_op_params_i32(result, 4, d0);
     ggml_set_op_params_i32(result, 5, d1);
+    ggml_set_op_params_i32(result, 6, layout);
+
+    struct ggml_tensor * ap, * bp;
+    if (layout == 0) { // NHWC: permute [W, H, C, N] -> [C, W, H, N] (channels fastest)
+        ap = ggml_cont(ctx, ggml_permute(ctx, a, 1, 2, 0, 3));
+        bp = ggml_cont(ctx, ggml_permute(ctx, b, 1, 2, 0, 3));
+    } else {
+        ap = a;
+        bp = b;
+    }
 
     result->op     = GGML_OP_CONV_2D_IMPLICIT;
-    result->src[0] = a;
-    result->src[1] = b;
+    result->src[0] = ap;
+    result->src[1] = bp;
 
     return result;
 }
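As a usage sketch of the extended signature (operands follow the same order as the tests: kernel first, then input; the sizes are arbitrary and the helper name is illustrative):

    #include "ggml.h"

    // Sketch: 3x3 conv, 256 -> 512 channels, 64x64 NCHW input (layout == 1).
    // With layout == 0 the op permutes both operands to NHWC internally.
    static struct ggml_tensor * conv_example(struct ggml_context * ctx) {
        struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 64, 256,   1); // W, H, Cin, N
        struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F16,  3,  3, 256, 512); // KW, KH, Cin, Cout
        return ggml_conv_2d_implicitgemm(ctx, kernel, input,
                                         1, 1,  // s0, s1
                                         1, 1,  // p0, p1
                                         1, 1,  // d0, d1
                                         1);    // layout: 1 = NCHW
    }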
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 3c4388f8a5..49a5688acd 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4268,7 +4268,7 @@ struct test_conv_2d_implicit : public test_case {
         }
 
         ggml_tensor * out =
-            ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1);
+            ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn ? 0 : 1);
         ggml_set_name(out, "out");
         return out;
     }
@@ -6788,6 +6788,45 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
             GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, false));
     }
 
+    for (auto act_case : cases_sd) {
+        GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
+        GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
+
+        uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
+        uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
+
+        test_cases.emplace_back(new test_conv_2d_implicit(
+            { act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
+            { act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
+            GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, false));
+    }
+
+    for (auto act_case : cases_sd) {
+        GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
+        GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
+
+        uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
+        uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
+
+        test_cases.emplace_back(new test_conv_2d_implicit(
+            { act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
+            { act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
+            GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, true));
+    }
+
+    for (auto act_case : cases_sd) {
+        GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
+        GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
+
+        uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
+        uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
+
+        test_cases.emplace_back(new test_conv_2d_implicit(
+            { act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
+            { act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
+            GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, true));
+    }
+
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
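The padding choice above (p = 1 for a 3x3 kernel, p = 0 for 1x1) keeps the spatial size unchanged at stride 1 and dilation 1, which is what the SD shapes expect. A quick check against the standard conv2d output-size formula, illustrative only and not part of the tests:

    #include <cassert>

    // OW = (IW + 2*p - d*(KW - 1) - 1) / s + 1
    static int conv_out_size(int in, int k, int s, int p, int d) {
        return (in + 2 * p - d * (k - 1) - 1) / s + 1;
    }

    int main() {
        assert(conv_out_size(104, 3, 1, 1, 1) == 104); // 3x3, p = 1: size preserved
        assert(conv_out_size(104, 1, 1, 0, 1) == 104); // 1x1, p = 0: size preserved
        return 0;
    }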
diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp
index 0ac438a137..0b8de368d1 100644
--- a/tests/test-conv2d-implicit.cpp
+++ b/tests/test-conv2d-implicit.cpp
@@ -262,7 +262,7 @@ struct ggml_cgraph * build_graph_2(const test_model& model) {
 
     // printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
 
-    struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
+    struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, 0);
     // struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
     ggml_set_name(wino_res, "wino_res");
     ggml_build_forward_expand(gf, wino_res);
@@ -339,20 +339,20 @@ int main(void)
 {
     ggml_time_init();
 
     std::vector<std::tuple<int, int, int, int>> configs = {
-        // std::make_tuple(64,64,48,64),
-        // std::make_tuple(320,320,104,152),
-        // std::make_tuple(640,640,52,76),
-        // std::make_tuple(640,640,104,152),
-        // std::make_tuple(960,320,104,152),
-        // std::make_tuple(1280,1280,26,38),
-        // std::make_tuple(1280,640,52,76),
-        // std::make_tuple(1920,1280,26,38),
-        // std::make_tuple(2560,1280,26,38),
-        // std::make_tuple(512,512,104,152),
-        // std::make_tuple(512,512,208,304),
+        std::make_tuple(64,64,48,64),
+        std::make_tuple(320,320,104,152),
+        std::make_tuple(640,640,52,76),
+        std::make_tuple(640,640,104,152),
+        std::make_tuple(960,320,104,152),
+        std::make_tuple(1280,1280,26,38),
+        std::make_tuple(1280,640,52,76),
+        std::make_tuple(1920,1280,26,38),
+        std::make_tuple(2560,1280,26,38),
+        std::make_tuple(512,512,104,152),
+        std::make_tuple(512,512,208,304),
         std::make_tuple(512,256,416,608),
-        // std::make_tuple(256,128,832,1216),
-        // std::make_tuple(256,256,832,1216),
+        std::make_tuple(256,128,832,1216),
+        std::make_tuple(256,256,832,1216),
         // std::make_tuple(320,256,1024,1920)
     };
 
@@ -453,8 +453,6 @@ int main(void)
     }
 
-    // printf("\nPerforming test:\n");
-
     return 0;
 }
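For reference on the conv_shapes table extended earlier: each column is one kernel variant, decomposed into a block tile (BM x BN, stepped BK at a time), warp tiles (WM x WN), and per-thread tiles (TM x TN). Assuming the kernel derives its per-warp M iterations the way warp-tiled GEMM kernels commonly do (an assumption, not something the patch states), the new fourth column can be sanity-checked like this:

    // Illustrative consistency check for one conv_shapes column. The WMITER
    // formula is assumed from common warp-tiled GEMM designs, not taken from
    // the kernel itself.
    constexpr int WARPSIZE = 32;

    constexpr bool variant_ok(int BM, int BN, int WM, int WN, int WNITER,
                              int TM, int TN, int NUM_THREADS) {
        return NUM_THREADS % WARPSIZE == 0
            && (BM / WM) * (BN / WN) == NUM_THREADS / WARPSIZE   // warps tile the block tile
            && (WM * WN) % (WARPSIZE * TM * TN * WNITER) == 0;   // WMITER comes out integral
    }

    // New fourth variant: BM=256, BN=128, WM=128, WN=32, WNITER=1, TM=4, TN=8, 256 threads.
    static_assert(variant_ok(256, 128, 128, 32, 1, 4, 8, 256), "variant 3 tiling mismatch");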