diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index 902220b74f..ec975194cc 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -913,13 +913,16 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
     //      for (unsigned int block_k = 1; block_k <= num_block_tiles_k; block_k++){
     int s = 0;
     int r = 0;
-    while (block_k < num_block_tiles_k){
-    #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+    while (block_krs < num_block_tiles_krs) {
+
         asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
-    #endif
+#else
+    while (block_k < num_block_tiles_k) {
+#endif
         __syncthreads();
 
-        // moves to the next tile
+        // moves to the next channel block tile
        int next_idx = 0;
        ++s;
        if (s == param.s) {
@@ -954,7 +957,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
        //     break;
 
        // if(thread_idx == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
-       //     printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d %d \n", s, r, block_k, next_idx, block_krs, num_block_tiles_k);
+       //     printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d, %d, %d \n", s, r, block_k, next_idx,
+       //            block_krs, num_block_tiles_k, num_block_tiles_krs);
        // }
 
        // if (block_k != num_block_tiles_k){
@@ -1044,8 +1048,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
     __syncthreads();
-    half* A_warp_tile = SA2 + A_warp_tile_offset;
-    half* B_warp_tile = SB2 + B_warp_tile_offset;
+    half* A_warp_tile = SA1 + A_warp_tile_offset;
+    half* B_warp_tile = SB1 + B_warp_tile_offset;
     ldmatrix_a(A_warp_tile, A_register_);
     ldmatrix_b(B_warp_tile, B_register_);
     // outer product between mma tiles
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 16861c71c9..7a1fe441ff 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5826,7 +5826,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
     for (uint32_t s0 : { 1, 3 }) {
         for (uint32_t p1 : { 2, 5 }) {
-            for (uint32_t Cin : { 1, 25 }) {
+            for (uint32_t Cin : { 1, 25, 32 }) {
                 for (uint32_t Cout : { 1, 12 }) {
                     for (uint32_t KH : { 1, 2, 3, 11 }) {
                         for (uint32_t KW : { 1, 2, 3, 11 }) {
@@ -5854,6 +5854,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                                               GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
     test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 1 }, { 3, 3, 128, 8},
                                               GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
+    test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 3 }, { 3, 3, 128, 8},
+                                              GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
+
 
     // sycl backend will limit task global_range < MAX_INT
 
diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp
index e3968f28b8..b87a04c858 100644
--- a/tests/test-conv2d.cpp
+++ b/tests/test-conv2d.cpp
@@ -43,7 +43,7 @@ struct ggml_cgraph * build_graph_1(const test_model&);
 void load_model(test_model & model, int ic, int oc, int iw, int ih, int kw = 3, int kh = 3, bool use_gpu = false ) {
     // create data
     int KW = kw, KH = kh, IC = ic, OC = oc;
-    int IW = iw, IH = ih, N = 1;
+    int IW = iw, IH = ih, N = 2;
 
     // srand(time(NULL));
     // printf(" input: IC = %d, OC = %d, IW = %d, IH = %d \n ", IC, OC, IW, IH);
@@ -176,12 +176,19 @@ struct ggml_cgraph * build_graph_0(const test_model& model) {
 
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    int s0 = 1;
-    int s1 = 1;
-    int p0 = 1;
-    int p1 = 1;
-    int d0 = 1;
-    int d1 = 1;
+    // int s0 = 1;
+    // int s1 = 1;
+    // int p0 = 1;
+    // int p1 = 1;
+    // int d0 = 1;
+    // int d1 = 1;
+
+    int s0 = 3;
+    int s1 = 5;
+    int p0 = 5;
+    int p1 = 5;
+    int d0 = 2;
+    int d1 = 4;
 
     // recalculate for avoid fragmentation
     struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
@@ -215,12 +222,21 @@ struct ggml_cgraph * build_graph_1(const test_model& model) {
 
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    int s0 = 1;
-    int s1 = 1;
-    int p0 = 1;
-    int p1 = 1;
-    int d0 = 1;
-    int d1 = 1;
+    // int s0 = 1;
+    // int s1 = 1;
+    // int p0 = 1;
+    // int p1 = 1;
+    // int d0 = 1;
+    // int d1 = 1;
+
+
+    int s0 = 3;
+    int s1 = 5;
+    int p0 = 5;
+    int p1 = 5;
+    int d0 = 2;
+    int d1 = 4;
+
 
     // recalculate for avoid fragmentation
     // struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
@@ -301,7 +317,8 @@ static std::vector<std::tuple<int, int, int, int, int, int>> configs = {
     // std::make_tuple(960,320,104,152,3,3),
     // std::make_tuple(1280,1280,26,38,3,3),
     // std::make_tuple(1920,640,32,32,3,3)
-    std::make_tuple(1280,1280,16,16,3,3),
+    // std::make_tuple(1280,1280,16,16,3,3),
+    std::make_tuple(32,12,141,133,3,3),
     // std::make_tuple(32,8,24,24,3,3),
     // std::make_tuple(640,640,64,64,3,3),
     // std::make_tuple(320,640,32,32,3,3),
@@ -718,12 +735,12 @@ int main(void)
 
     //     for(int i = 0; i < 26*38; i++) {
     for(int i = 0; i < conv2d_data.size(); i++) {
        float diff = fabs(im2col_data[i] - conv2d_data[i]);
-       // if(diff > 0.5) {
+       if(diff > 0.5) {
            printf("(%7.3f, %7.3f, %.2f, %d) \n", im2col_data[i], conv2d_data[i], diff, i);
-       //     break;
-       // }
+           break;
+       }
     }
 
 
     ggml_free(model.ctx);
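
Note on the `SA2`/`SB2` -> `SA1`/`SB1` change: in a two-stage `cp.async` pipeline the buffer the epilogue must read from is fixed by the parity of the number of tiles prefetched, so picking the wrong half of the double buffer replays stale data for the final tile. The sketch below is a deliberately simplified stand-in for that pattern, not the kernel's actual code: a tiled sum instead of the `ldmatrix`/MMA pipeline, plain shared-memory stores instead of `cp.async`, and all names (`double_buffered_sum`, `buf`, `TILE`) are hypothetical.

```cuda
#include <cuda_runtime.h>
#include <cstdio>

#define TILE 128

// Ping-pong double buffering: prefetch tile t+1 into one shared-memory
// buffer while tile t is consumed from the other, then swap.
__global__ void double_buffered_sum(const float * __restrict__ in,
                                    float * __restrict__ out,
                                    int num_tiles) {    // assumes num_tiles >= 1
    __shared__ float buf[2][TILE];                      // buf[0]/buf[1] ~ SA1/SA2

    const int tid = threadIdx.x;

    buf[0][tid] = in[tid];                              // prologue: prefetch tile 0
    __syncthreads();

    float acc = 0.0f;
    int   cur = 0;                                      // buffer holding the current tile
    for (int t = 0; t + 1 < num_tiles; ++t) {
        buf[cur ^ 1][tid] = in[(t + 1) * TILE + tid];   // prefetch the next tile
        acc += buf[cur][tid];                           // consume the current tile
        __syncthreads();                                // prefetch visible before the swap
        cur ^= 1;
    }

    // Epilogue: the last prefetched tile sits in buf[cur], i.e. in buffer
    // (num_tiles - 1) % 2. Reading the other buffer here returns stale
    // data -- the class of bug the SA2 -> SA1 change fixes.
    acc += buf[cur][tid];
    out[tid] = acc;
}

int main() {
    const int num_tiles = 3;   // odd tile count: the last tile lands in buf[0]
    float *in, *out;
    cudaMallocManaged(&in,  num_tiles * TILE * sizeof(float));
    cudaMallocManaged(&out, TILE * sizeof(float));
    for (int i = 0; i < num_tiles * TILE; ++i) in[i] = 1.0f;
    double_buffered_sum<<<1, TILE>>>(in, out, num_tiles);
    cudaDeviceSynchronize();
    printf("out[0] = %.1f (expected %d)\n", out[0], num_tiles);
    cudaFree(in);
    cudaFree(out);
    return 0;
}
```

The same parity argument appears to motivate the loop-bound change: on Ampere+ the loop now runs over `num_block_tiles_krs` (from the naming, presumably the combined channel x kernel-height x kernel-width tile count) rather than `num_block_tiles_k` alone, so the epilogue's buffer choice has to match the parity of that longer iteration count.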