diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index f6059fc3ae..000fd89e20 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -742,7 +742,7 @@ __device__ __forceinline__ void ldmatrix_a(
     half (&reg)[mma_tiles_per_warp_m][mma_tiles_per_warp_k][4]
 )
 {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     static_assert(mma_tiles_per_warp_m == 8, "mma_tiles_per_warp_m must be 8");
     static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4");
@@ -885,11 +885,11 @@ __device__ __forceinline__ void ldmatrix_a(
         : "=r"(reg_[6][3][0]), "=r"(reg_[6][3][1]), "=r"(reg_[7][3][0]), "=r"(reg_[7][3][1])
         : "r"(src_addr + 96 * smem_stride_)
     );
-// #else
-//     GGML_UNUSED(src);
-//     GGML_UNUSED(reg);
-//     NO_DEVICE_CODE;
-// #endif
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(reg);
+    NO_DEVICE_CODE;
+#endif
 }
 
 template <unsigned int mma_tiles_per_warp_k, unsigned int mma_tiles_per_warp_n>
@@ -898,7 +898,7 @@ __device__ __forceinline__ void ldmatrix_b(
     half (&reg)[mma_tiles_per_warp_k][mma_tiles_per_warp_n][2]
 )
 {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4");
     static_assert(mma_tiles_per_warp_n == 8, "mma_tiles_per_warp_n must be 8");
@@ -989,11 +989,11 @@ __device__ __forceinline__ void ldmatrix_b(
         // : "r"(src_addr ^ 0b1000000)
         : "r"(src_addr + 32 * smem_stride_)
     );
-// #else
-//     GGML_UNUSED(src);
-//     GGML_UNUSED(reg);
-//     NO_DEVICE_CODE;
-// #endif
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(reg);
+    NO_DEVICE_CODE;
+#endif
 }
 
 template <unsigned int BM, unsigned int BN, unsigned int BK, unsigned int WM, unsigned int WN, unsigned int WK, unsigned int NUM_THREADS>
@@ ... @@
 static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
         const half * __restrict__ kernel, half * __restrict__ output, const param_t param)
 {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 
     constexpr unsigned int MMA_M = 16;
     constexpr unsigned int MMA_N = 8;
@@ -1010,7 +1010,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
 //    printf("conv2d_implicit_kernel launch BM:%d, BN:%d, BK:%d, WM:%d, WN:%d, WK:%d, NUM_THREADS:%d \n", BM, BN, BK, WM, WN, WK, NUM_THREADS);
 
     const unsigned int K = param.c * param.r * param.s;
-    const uint PQ = param.Oh * param.Ow;
+//    const uint PQ = param.Oh * param.Ow;
     const uint inChannelOffset = param.c * param.w;
     const uint weightKOffset = param.c * param.r * param.s;
@@ -1153,7 +1153,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
             }
         }
-
+
+    // reuse smem
     half *smemoutput = shmem;
 
     const uint lane_id = threadIdx.x % WARPSIZE;
@@ -1212,21 +1213,22 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
                     // param.interm[outOffset] = smemoutput[output_lds_addr + subk * 32];
                     const uint outOffset = n * param.k * param.Oh * param.Ow + row * param.Oh * param.Ow + col;
                     output[outOffset] = smemoutput[output_lds_addr + subk + j*32*BN/2];
-                    if(outOffset == 32){
-                        printf("(%u, %u, %u, %u), output[%d,%d,%d]=%f \n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
-                        n, row, col, __half2float(output[outOffset]));
-                    }
+                    // if(outOffset == 32){
+                    //     printf("(%u, %u, %u, %u), output[%d,%d,%d]=%f \n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
+                    //     n, row, col, __half2float(output[outOffset]));
+                    // }
                 }
             }
         }
     }
-// #else
-//     GGML_UNUSED(input);
-//     GGML_UNUSED(kernel);
-//     GGML_UNUSED(output);
-//     GGML_UNUSED(param);
-//     NO_DEVICE_CODE;
-// #endif
+
+#else
+    GGML_UNUSED(input);
+    GGML_UNUSED(kernel);
+    GGML_UNUSED(output);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }
 
@@ -1289,7 +1291,8 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,
 static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) {
 
     if (GGML_CUDA_CC_IS_NVIDIA(cc) && ampere_mma_available(cc) && P.layout == 0 && P.c % 8 == 0) {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+// #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+        // printf("tensor core path called\n");
         constexpr unsigned int BM_dim = 256;
         constexpr unsigned int BN_dim = 256;
         constexpr unsigned int BK_dim = 32;
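The hunks above turn the previously commented-out guards into live preprocessor gates: each tensor-core helper now compiles its body only for __CUDA_ARCH__ >= GGML_CUDA_CC_TURING and otherwise falls back to a stub. Below is a minimal standalone sketch of that pattern. The macro definitions are simplified stand-ins added for illustration (ggml's real ones live in ggml-cuda/common.cuh, where NO_DEVICE_CODE also reports the source location before trapping), and example_mma_stage is a hypothetical name, not a function from this patch:

    #include <cuda_fp16.h>

    // Simplified stand-ins so the sketch compiles on its own.
    #define GGML_CUDA_CC_TURING 750         // __CUDA_ARCH__ value for sm_75
    #define GGML_UNUSED(x) (void)(x)
    #ifdef __CUDA_ARCH__
    #    define NO_DEVICE_CODE __trap()     // device pass: make the stub non-callable
    #else
    #    define NO_DEVICE_CODE              // host pass: expands to nothing
    #endif

    // Hypothetical helper showing the guard pattern used throughout the patch:
    // the Turing+ body is compiled only where the instructions exist; older
    // architectures keep the parameters "used" and trap if the stub is reached.
    __device__ __forceinline__ void example_mma_stage(const half * src, half * dst) {
    #if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
        *dst = *src;                        // stands in for the ldmatrix/mma path
    #else
        GGML_UNUSED(src);
        GGML_UNUSED(dst);
        NO_DEVICE_CODE;
    #endif
    }

The host side remains responsible for never launching the kernel on unsupported hardware; that is the job of the ampere_mma_available(cc) check in conv2d_implicit_cuda_f16 above.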
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cuh b/ggml/src/ggml-cuda/conv2d-implicit.cuh
index 3ea0461218..69942bffac 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh
@@ -26,7 +26,7 @@ typedef struct{
     uint3 OHOW_fastdiv;
 } param_t;
 
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+
 // same as above, but writes are swizzled to avoid bank conflicts when shared memory is read later in the kernel
 template <unsigned int TILE_ROWS, unsigned int TILE_COLS, unsigned int NUM_THREADS>
@@ -37,6 +37,7 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
     param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 //    constexpr unsigned int SWIZZLE_MASK = 0b111 << SWIZZLE_BITS;
 
 //    // reinterpret input/output as float4
@@ -117,6 +118,13 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
         }
         thread_row += ROW_STEP;
     }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src_stride);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }
 
@@ -131,6 +139,7 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
     param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     constexpr unsigned int SWIZZLE_MASK_1 = 0b10000;
     constexpr unsigned int SWIZZLE_BITS_1 = 4;
     constexpr unsigned int SWIZZLE_MASK_2 = 0b1100;
@@ -186,6 +195,13 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
         }
         thread_row += ROW_STEP;
     }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(inChannelOffset);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }
 
 template <unsigned int TILE_ROWS, unsigned int TILE_COLS, unsigned int NUM_THREADS, unsigned int ELEMENTS_PER_THREAD>
@@ ... @@ __device__ __forceinline__ void tileMemcpyLoadA(
     param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     // reinterpret input/output as float4
     // const float4* src_float4 = reinterpret_cast<const float4*>(src);
     // const unsigned int src_stride_vectorized = src_stride / 8;
@@ -251,6 +268,14 @@ __device__ __forceinline__ void tileMemcpyLoadA(
         }
         thread_row += ROW_STEP;
     }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst_reg);
+    GGML_UNUSED(block_k);
+    GGML_UNUSED(inChannelOffset);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }
 
@@ -266,6 +291,7 @@ __device__ __forceinline__ void tileMemcpyLoadB(
     param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     // reinterpret input/output as float4
     // const float4* src_float4 = reinterpret_cast<const float4*>(src);
     // const unsigned int src_stride_vectorized = src_stride / 8;
@@ -305,91 +331,18 @@ __device__ __forceinline__ void tileMemcpyLoadB(
         }
         thread_row += ROW_STEP;
     }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst_reg);
+    GGML_UNUSED(block_k);
+    GGML_UNUSED(src_stride);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }
 
-// template <unsigned int TILE_ROWS, unsigned int TILE_COLS, unsigned int NUM_THREADS, unsigned int ELEMENTS_PER_THREAD, unsigned int SWIZZLE_BITS>
-// __device__ __forceinline__ void tileMemcpySwizzleStoreB(
-//     float4 src_reg[ELEMENTS_PER_THREAD],
-//     half* dst
-// )
-// {
-//     constexpr unsigned int SWIZZLE_MASK = 0b111 << SWIZZLE_BITS;
-
-//     // reinterpret input/output as float4
-//     float4* dst_float4 = reinterpret_cast<float4*>(dst);
-
-//     // # of threads is multiple of # of columns in the tile
-//     constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;
-//     static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
-
-//     // flatten out 2d grid of threads into in order of increasing threadIdx.x
-//     const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
-
-//     // assign each thread a row/column in the tile, calculate how many iterations we need
-//     // to cover the whole tile
-//     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
-//     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-//     unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-//     const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
-
-//     // compile time check that we provided the right amount of registers for storage
-//     static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
-
-//     #pragma unroll
-//     for (unsigned int i = 0; i < NUM_ITERS; i++)
-//     {
-//         // apply swizzle to the dst index
-//         unsigned int dst_index = thread_row * TILE_COLS_VECTORIZED + thread_col;
-//         dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK) >> SWIZZLE_BITS);
-//         dst_float4[dst_index] = src_reg[i];
-//         thread_row += ROW_STEP;
-//     }
-// }
 
 // same as above but without the swizzle
-template <unsigned int TILE_ROWS, unsigned int TILE_COLS, unsigned int NUM_THREADS, unsigned int ELEMENTS_PER_THREAD>
-__device__ __forceinline__ void tileMemcpyStore(
-    float4 src_reg[ELEMENTS_PER_THREAD],
-    half* dst,
-    unsigned int dst_stride_float4
-)
-{
-    // reinterpret input/output as float4
-    float4* dst_float4 = reinterpret_cast<float4*>(dst);
-
-    // # of threads is multiple of # of columns in the tile
-    constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;
-    static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
-
-    // flatten out 2d grid of threads into in order of increasing threadIdx.x
-    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
-
-    // assign each thread a row/column in the tile, calculate how many iterations we need
-    // to cover the whole tile
-    constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
-    constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
-
-    // compile time check that we provided the right amount of registers for storage
-    static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
-
-    #pragma unroll
-    for (unsigned int i = 0; i < NUM_ITERS; i++)
-    {
-        // apply swizzle to the dst index
-        unsigned int dst_index = thread_row * dst_stride_float4 + thread_col;
-        dst_float4[dst_index] = src_reg[i];
-        thread_row += ROW_STEP;
-    }
-}
 
 // this is a special case of the above for when TILE_COLS == 32
 template <unsigned int TILE_ROWS, unsigned int NUM_THREADS, unsigned int ELEMENTS_PER_THREAD>
 __device__ __forceinline__ void tileMemcpySwizzleStore(
     float4 src_reg[ELEMENTS_PER_THREAD],
     half* dst
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
     constexpr unsigned int SWIZZLE_MASK_1 = 0b10000;
     constexpr unsigned int SWIZZLE_BITS_1 = 4;
     constexpr unsigned int SWIZZLE_MASK_2 = 0b1100;
@@ -436,6 +390,11 @@ __device__ __forceinline__ void tileMemcpySwizzleStore(
         dst_float4[dst_index] = src_reg[i];
         thread_row += ROW_STEP;
     }
+#else
+    GGML_UNUSED(src_reg);
+    GGML_UNUSED(dst);
+    NO_DEVICE_CODE;
+#endif
 }
 
 __device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
@@ -450,8 +409,6 @@ __device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
     return address;
 }
 
-// #endif
-
 // constexpr unsigned int int_log2(unsigned int x)
 // {
 //     unsigned int result = 0;
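Both swizzle helpers in this header avoid shared-memory bank conflicts the same way: a few row bits are XORed into the float4 column index, so the columns of successive tile rows are rotated relative to each other. The single-level form kept in the comments of tileMemcpySwizzleB maps index i to i ^ ((i & SWIZZLE_MASK) >> SWIZZLE_BITS). A small host-only demonstration of that permutation follows; the constants assume a tile row of eight float4s (128 bytes) and are illustrative rather than the kernel's values:

    #include <cstdio>

    // One-level XOR swizzle: fold the row bits (selected by SWIZZLE_MASK)
    // into the low bits of the index.
    constexpr unsigned int SWIZZLE_BITS = 3;  // log2(8 float4s per row)
    constexpr unsigned int SWIZZLE_MASK = 0b111u << SWIZZLE_BITS;

    constexpr unsigned int swizzle(unsigned int i) {
        return i ^ ((i & SWIZZLE_MASK) >> SWIZZLE_BITS);
    }

    int main() {
        // Print the physical column each (row, col) lands in for an 8x8 tile.
        for (unsigned int row = 0; row < 8; ++row) {
            for (unsigned int col = 0; col < 8; ++col) {
                printf("%u ", swizzle(row * 8 + col) % 8);
            }
            printf("\n");
        }
        return 0;
    }

The printed table is a Latin square: every physical column appears exactly once per row and once per column, so reading one logical tile column touches eight different banks instead of replaying a single one. The two-level masks in tileMemcpySwizzleA and tileMemcpySwizzleStore (SWIZZLE_MASK_1 and SWIZZLE_MASK_2) apply the same idea twice at different bit positions.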
diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp
index 19d2826240..836bb10637 100644
--- a/tests/test-conv2d-implicit.cpp
+++ b/tests/test-conv2d-implicit.cpp
@@ -339,20 +339,20 @@ int main(void)
 {
     ggml_time_init();
 
     std::vector<std::tuple<int, int, int, int>> configs = {
-        // std::make_tuple(64,64,48,64),
-        // std::make_tuple(320,320,104,152),
-        // std::make_tuple(640,640,52,76),
-        // std::make_tuple(640,640,104,152),
-        // std::make_tuple(960,320,104,152),
-        std::make_tuple(128,1280,26,38),
-        // std::make_tuple(1280,640,52,76),
-        // std::make_tuple(1920,1280,26,38),
-        // std::make_tuple(2560,1280,26,38),
-        // std::make_tuple(512,512,104,152),
-        // std::make_tuple(512,512,208,304),
-        // std::make_tuple(512,256,416,608),
-        // std::make_tuple(256,128,832,1216),
-        // std::make_tuple(256,256,832,1216),
+        std::make_tuple(64,64,48,64),
+        std::make_tuple(320,320,104,152),
+        std::make_tuple(640,640,52,76),
+        std::make_tuple(640,640,104,152),
+        std::make_tuple(960,320,104,152),
+        std::make_tuple(1280,1280,26,38),
+        std::make_tuple(1280,640,52,76),
+        std::make_tuple(1920,1280,26,38),
+        std::make_tuple(2560,1280,26,38),
+        std::make_tuple(512,512,104,152),
+        std::make_tuple(512,512,208,304),
+        std::make_tuple(512,256,416,608),
+        std::make_tuple(256,128,832,1216),
+        std::make_tuple(256,256,832,1216),
         // std::make_tuple(320,256,1024,1920)
     };
@@ -375,7 +375,7 @@ int main(void)
 
     struct ggml_cgraph * gf_res_0 = NULL;
 
-    int iterations = 0;
+    int iterations = 20;
 
     double run_time0;
     std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
@@ -437,15 +437,15 @@ int main(void)
 
     // for(int i = 0; i < ggml_nelements(wino_res); i++) {
-    for(int i = 0; i < 26*38; i++) {
-        float diff = fabs(conv2d_data[i] - wino_data[i]);
-        // if(diff > 1.e-4) {
-        printf("(%f, %f, %f, %d) \n",
-               conv2d_data[i],
-               wino_data[i], diff, i);
-        //     break;
-        // }
-    }
+    // for(int i = 0; i < 26*38; i++) {
+    //     float diff = fabs(conv2d_data[i] - wino_data[i]);
+    //     // if(diff > 1.e-4) {
+    //     printf("(%f, %f, %f, %d) \n",
+    //            conv2d_data[i],
+    //            wino_data[i], diff, i);
+    //     //     break;
+    //     // }
+    // }
 
     ggml_free(model.ctx);
     ggml_backend_buffer_free(model.buffer);
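For reference, this is how one test tuple becomes an implicit GEMM inside the kernel. Reading the tuple as (C_in, C_out, H, W) and assuming a 3x3 filter with stride 1 and padding 1 (so Oh == H and Ow == W) are assumptions made for this sketch; only the K = c*r*s reduction size and the NCHW output offset are taken directly from conv2d_implicit_kernel above:

    #include <cstdio>

    int main() {
        // std::make_tuple(1280,1280,26,38), read as (C_in, C_out, H, W)
        // (an assumption about the test harness, not something the diff states).
        const unsigned c = 1280, k = 1280, h = 26, w = 38;
        const unsigned r = 3, s = 3;      // assumed 3x3 filter
        const unsigned Oh = h, Ow = w;    // assumed stride 1, padding 1

        // Per image, the kernel computes a k x (Oh*Ow) output matrix while
        // reducing over K = c*r*s, matching "const unsigned int K" in the kernel.
        printf("implicit GEMM: M=%u N=%u K=%u\n", k, Oh * Ow, c * r * s);

        // NCHW linearization used by the store loop:
        // outOffset = n*k*Oh*Ow + row*Oh*Ow + col
        const unsigned n = 0, row = 5, col = 7;
        printf("outOffset = %u\n", n * k * Oh * Ow + row * Oh * Ow + col);
        return 0;
    }

Under these assumptions the GEMM is 1280 x 988 with a reduction depth of 11520, the kind of shape the 256x256 block tile (BM_dim, BN_dim) in conv2d_implicit_cuda_f16 is sized for.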