diff --git a/ggml/src/ggml-cuda/conv3d-implicit.cu b/ggml/src/ggml-cuda/conv3d-implicit.cu
index 76f887972a..df5ed4578a 100644
--- a/ggml/src/ggml-cuda/conv3d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv3d-implicit.cu
@@ -163,7 +163,7 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
         const uint inKOffset = start_k + innerColA * 4;
 #pragma unroll
         for (uint offset = 0; offset + rowStrideA <= BM; offset += rowStrideA) {
-            const unsigned int gemm_i = bx * BM + innerRowA + offset; 
+            const unsigned int gemm_i = bx * BM + innerRowA + offset;
             // int n = (ksplit > 0) ? (bx * BM + innerRowA + offset) / PQZ : z;
             int n = (ksplit > 0) ? fastdiv(gemm_i, param.PQZ_fastdiv) : z;
             const unsigned int npqz_res = fastmodulo(gemm_i, param.PQZ_fastdiv);
@@ -173,26 +173,7 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
             const int posw_ori = fastmodulo(ohow_res, param.OW_fastdiv) * param.stride0 - param.padding0;
             int inOffset = n * inNOffset;
             if(vec_load){
-                // const uint cur0 = fastdiv(inKOffset,
-                //                     layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
-                // const uint cur0_res = fastmodulo(inKOffset,
-                //                     layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
-                // const uint cur1 = fastdiv(cur0_res,
-                //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
-                // const uint cur1_res = fastmodulo(cur0_res,
-                //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
-                // const uint cur2 = fastdiv(cur1_res,
-                //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                // const uint cur3 = fastmodulo(cur1_res,
-                //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                // const uint curC = layout == 0 ? cur3 : cur0;
-                // const uint curT = layout == 0 ? cur0 : cur1;
-                // const uint curR = layout == 0 ? cur1 : cur2;
-                // const uint curS = layout == 0 ? cur2 : cur3;
                 const int4 curIdx = inputIndices<layout>(inKOffset, param);
-                // const int curD = posd_ori + curT * param.dilation2; // input w
-                // const int curH = posh_ori + curR * param.dilation1; // input h
-                // const int curW = posw_ori + curS * param.dilation0; // input w
                 const int curD = posd_ori + curIdx.y * param.dilation2; // input w
                 const int curH = posh_ori + curIdx.z * param.dilation1; // input h
                 const int curW = posw_ori + curIdx.w * param.dilation0; // input w
@@ -214,43 +195,11 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
             } else {
 #pragma unroll
                 for (int i = 0; i < 4; ++i){
-                    // const uint cur0 = fastdiv(inKOffset + i,
-                    //                     layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
-                    // const uint cur0_res = fastmodulo(inKOffset + i,
-                    //                     layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
-                    // const uint cur1 = fastdiv(cur0_res,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
-                    // const uint cur1_res = fastmodulo(cur0_res,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
-                    // const uint cur2 = fastdiv(cur1_res,
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint cur3 = fastmodulo(cur1_res,
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint curC = layout == 0 ? cur3 : cur0;
-                    // const uint curT = layout == 0 ? cur0 : cur1;
-                    // const uint curR = layout == 0 ? cur1 : cur2;
-                    // const uint curS = layout == 0 ? cur2 : cur3;
                     const int4 curIdx = inputIndices<layout>(inKOffset + i, param);
-                    // const int curD = posd_ori + curT * param.dilation2; // input w
-                    // const int curH = posh_ori + curR * param.dilation1; // input h
-                    // const int curW = posw_ori + curS * param.dilation0; // input w
                     const int curD = posd_ori + curIdx.y * param.dilation2; // input w
                     const int curH = posh_ori + curIdx.z * param.dilation1; // input h
                     const int curW = posw_ori + curIdx.w * param.dilation0; // input w
                     const int curC = curIdx.x;
-                    // const uint cur0 = fastdiv(start_k + innerColA * 4 + i,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // channel offset
-                    // const uint cur1 = fastdiv(fastmodulo(start_k + innerColA * 4 + i,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint cur2 = fastmodulo(fastmodulo(start_k + innerColA * 4 + i,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint curC = layout == 0 ? cur2 : cur0;
-                    // const uint curR = layout == 0 ? cur0 : cur1;
-                    // const uint curS = layout == 0 ? cur1 : cur2;
-                    // const int curH = posh_ori + curR * param.d_h; // input h
-                    // const int curW = posw_ori + curS * param.d_w; // input w
                     if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && inKOffset + i < end_k){
                         int inOffsetTmp = layout == 0 ?
                                 curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC:
                                 curC * inDepthOffset + curD * inChannelOffset + curH * param.w + curW;
@@ -360,12 +309,7 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
             const uint inKkOffset = innerColA * 4 + crs + BK;
 #pragma unroll
             for (uint offset = 0; offset + rowStrideA <= BM; offset += rowStrideA) {
-                // int n = (ksplit > 0) ? (bx * BM + innerRowA + offset) / PQ : z;
-                // const unsigned int npq_res = (bx * BM + innerRowA + offset) % PQ;
-                // const int posh_ori = fastdiv((ksplit > 0) ? npq_res: bx * BM + innerRowA + offset, param.OW_fastdiv) * param.u - param.p;
-                // const int posw_ori = fastmodulo((ksplit > 0) ? npq_res: bx * BM + innerRowA + offset, param.OW_fastdiv) * param.v - param.q;
-                // int inOffset = n * param.c * param.h * param.w ;
-                const unsigned int gemm_i = bx * BM + innerRowA + offset; 
+                const unsigned int gemm_i = bx * BM + innerRowA + offset;
                 int n = (ksplit > 0) ? fastdiv(gemm_i, param.PQZ_fastdiv) : z;
                 const unsigned int npqz_res = fastmodulo(gemm_i, param.PQZ_fastdiv);
                 const int posd_ori = fastdiv((ksplit > 0) ? npqz_res: gemm_i, param.OHOW_fastdiv) * param.stride2 - param.padding2;
@@ -379,28 +323,10 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
                     const int curH = posh_ori + curIdx.z * param.dilation1; // input h
                     const int curW = posw_ori + curIdx.w * param.dilation0; // input w
                     const int curC = curIdx.x;
-                    // const uint cur0 = fastdiv(innerColA * 4 + crs + BK,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // channel offset
-                    // const uint cur1 = fastdiv(fastmodulo(innerColA * 4 + crs + BK,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint cur2 = fastmodulo(fastmodulo(innerColA * 4 + crs + BK,
-                    //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                    //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                    // const uint curC = layout == 0 ? cur2 : cur0;
-                    // const uint curR = layout == 0 ? cur0 : cur1;
-                    // const uint curS = layout == 0 ? cur1 : cur2;
-
-                    // const int curH = posh_ori + curR * param.d_h; // input h
-                    // const int curW = posw_ori + curS * param.d_w; // input w
                     if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && inKkOffset < end_k){
                         int inOffsetTmp = layout == 0 ?
                                 curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC:
                                 curC * inDepthOffset + curD * inChannelOffset + curH * param.w + curW;
-                        // if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h && inKkOffset < end_k){
-                        // int inOffsetTmp = layout == 0 ?
-                        //         curH * inChannelOffset + curW * param.c + curC:
-                        //         curC * inChannelOffset + curH * param.w + curW;
                         float4 tmp = reinterpret_cast<const float4 *>(&input[inOffset + inOffsetTmp])[0];
                         smeminput[write_flag * (BM+PAD) * BK + input_sts_addr + offset + 0] = tmp.x;
                         smeminput[write_flag * (BM+PAD) * BK + input_sts_addr + offset + BM+PAD] = tmp.y;
@@ -414,29 +340,11 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
                 } else {
 #pragma unroll
                     for (int i = 0; i < 4; ++i){
-                        // const uint cur0 = fastdiv(innerColA * 4 + crs + BK + i,
-                        //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // channel offset
-                        // const uint cur1 = fastdiv(fastmodulo(innerColA * 4 + crs + BK + i,
-                        //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                        //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                        // const uint cur2 = fastmodulo(fastmodulo(innerColA * 4 + crs + BK + i,
-                        //                     layout == 0 ? param.SC_fastdiv : param.RS_fastdiv),
-                        //                     layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
-                        // const uint curC = layout == 0 ? cur2 : cur0;
-                        // const uint curR = layout == 0 ? cur0 : cur1;
-                        // const uint curS = layout == 0 ? cur1 : cur2;
-
-                        // const int curH = posh_ori + curR * param.d_h; // input h
-                        // const int curW = posw_ori + curS * param.d_w; // input w
                         const int4 curIdx = inputIndices<layout>(inKkOffset + i, param);
                         const int curD = posd_ori + curIdx.y * param.dilation2; // input w
                         const int curH = posh_ori + curIdx.z * param.dilation1; // input h
                         const int curW = posw_ori + curIdx.w * param.dilation0; // input w
                         const int curC = curIdx.x;
-                        // if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h && innerColA * 4 + crs + BK + i < end_k){
-                        // int inOffsetTmp = layout == 0 ?
-                        //         curH * inChannelOffset + curW * param.c + curC:
-                        //         curC * inChannelOffset + curH * param.w + curW;
                         if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && inKkOffset + i < end_k){
                             int inOffsetTmp = layout == 0 ?
                                     curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC:
@@ -521,7 +429,6 @@ static __global__ void conv3d_implicit_kernel(const float * __restrict__ input,
             const int col = (ksplit > 0) ? fastmodulo(gemm_i, param.PQZ_fastdiv) : gemm_i;
             if (n < param.n && row < param.k && col < PQZ){
                 const uint outOffset = ksplit > 0 ?
-                        // z * param.n * param.k * PQZ + n * param.k * PQZ + row * PQZ + col :
                         ((z * param.n + n) * param.k + row) * PQZ + col :
                         (z * param.k + row) * PQZ + col;
                 output[outOffset] = smemoutput[output_lds_addr + subk * WARPSIZE];
@@ -790,7 +697,7 @@ static __global__ void conv3d_implicit_kernel(const half * __restrict__ input,
     const unsigned int K = param.c * param.r * param.s * param.t;
     const uint weightKOffset = K; //param.c * param.r * param.s * param.t;
     const uint inChannelOffset = param.c * param.w;
-    const uint inDepthOffset = param.h * param.c * param.w; 
+    const uint inDepthOffset = param.h * param.c * param.w;
     const uint inNOffset = param.c * param.w * param.h * param.d;
 
     // loop bounds, constexpr where possible allows for loop unrolling
@@ -854,7 +761,7 @@ static __global__ void conv3d_implicit_kernel(const half * __restrict__ input,
         if (block_k != num_block_tiles_k){
             const half* A_block_gmem = input;
             const half* B_block_gmem = kernel + (block_n * BN * weightKOffset);
-            tileMemcpyLoadA(A_block_gmem, A_gmem_cache_reg, block_k * BK, 
+            tileMemcpyLoadA(A_block_gmem, A_gmem_cache_reg, block_k * BK,
                             inNOffset, inDepthOffset, inChannelOffset, param);
             tileMemcpyLoadB(B_block_gmem, B_gmem_cache_reg, block_k * BK, weightKOffset, param);
         }
@@ -935,12 +842,9 @@ static __global__ void conv3d_implicit_kernel(const half * __restrict__ input,
             for (int j = 0; j < 4; ++j){
                 const uint row = m_idx + subk + i * WN / 2;
                 const uint gemm_i = n_idx + j*32;
-                // const int n = fastdiv(gemm_i, param.OHOW_fastdiv);
-                // const int col = fastmodulo(gemm_i, param.OHOW_fastdiv);
                 const int n = fastdiv(gemm_i, param.PQZ_fastdiv);
                 const int col = fastmodulo(gemm_i, param.PQZ_fastdiv);
                 if(n < param.n && row < param.k && col < PQZ){
-                    // const uint outOffset = n * param.k * param.Oh * param.Ow + row * param.Oh * param.Ow + col;
                     const uint outOffset = (n * param.k + row) * PQZ + col;
                     uint idx = output_lds_addr + subk + j*32*BN/2;
                     idx = idx ^ ((idx & 0b1110000000) >> 4);
@@ -1109,19 +1013,15 @@ void ggml_cuda_op_conv3d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
     const uint KW = kernel->ne[0]; // kernel_w
     const uint KH = kernel->ne[1]; // kernel_h
     const uint KD = kernel->ne[2]; // kernel_h
-    // const uint IC = input->ne[2]; // input_channels
-    // const uint OC = kernel->ne[3]; // ouptut_chanles
-    // const uint B = input->ne[3]; // n_batches
-
-    param_t params = { B, 
-                       IC, 
+    param_t params = { B,
+                       IC,
                        IH, IW, ID,
-                       OC, 
+                       OC,
                        KH, KW, KD,
-                       ST_Y, ST_X, ST_Z,
-                       PD_Y, PD_X, PD_Z,
-                       DL_Y, DL_X, DL_Z,
+                       ST_X, ST_Y, ST_Z,
+                       PD_X, PD_Y, PD_Z,
+                       DL_X, DL_Y, DL_Z,
                        OH, OW, OD,
                        init_fastdiv_values(KW*IC),
                        init_fastdiv_values(OW),
diff --git a/ggml/src/ggml-cuda/conv3d-implicit.cuh b/ggml/src/ggml-cuda/conv3d-implicit.cuh
index 04fc9109ed..9cd7fe4e9b 100644
--- a/ggml/src/ggml-cuda/conv3d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv3d-implicit.cuh
@@ -11,15 +11,15 @@ typedef struct{
     unsigned int r; //filter height
     unsigned int s; //filter width
     unsigned int t; //filter depth
-    unsigned int stride0; //stride width 
-    unsigned int stride1; //stride height 
+    unsigned int stride0; //stride width
+    unsigned int stride1; //stride height
     unsigned int stride2; //stride depth
-    unsigned int padding0; //padding width 
+    unsigned int padding0; //padding width
     unsigned int padding1; //padding height
-    unsigned int padding2; //padding depth 
-    unsigned int dilation0; //dilation width 
-    unsigned int dilation1; //dilation height 
-    unsigned int dilation2; //dilation depth 
+    unsigned int padding2; //padding depth
+    unsigned int dilation0; //dilation width
+    unsigned int dilation1; //dilation height
+    unsigned int dilation2; //dilation depth
     unsigned int Oh; //output height
     unsigned int Ow; //output width
     unsigned int Od; //output depth
@@ -39,17 +39,17 @@ template
 __device__ __forceinline__ int4 inputIndices(const unsigned int kidx, param_t param) {
     const unsigned int cur0 = fastdiv(kidx,
-                        layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
+                        layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv);
     const unsigned int cur0_res = fastmodulo(kidx,
-                        layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv); // channel offset
+                        layout == 0 ? param.RSC_fastdiv : param.TRS_fastdiv);
     const unsigned int cur1 = fastdiv(cur0_res,
-                        layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
+                        layout == 0 ? param.SC_fastdiv : param.RS_fastdiv);
     const unsigned int cur1_res = fastmodulo(cur0_res,
-                        layout == 0 ? param.SC_fastdiv : param.RS_fastdiv); // kernel r offset
+                        layout == 0 ? param.SC_fastdiv : param.RS_fastdiv);
     const unsigned int cur2 = fastdiv(cur1_res,
-                        layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
+                        layout == 0 ? param.C_fastdiv : param.S_fastdiv);
     const unsigned int cur3 = fastmodulo(cur1_res,
-                        layout == 0 ? param.C_fastdiv : param.S_fastdiv); // kernel r offset
+                        layout == 0 ? param.C_fastdiv : param.S_fastdiv);
     const unsigned int curC = layout == 0 ? cur3 : cur0;
     const unsigned int curT = layout == 0 ? cur0 : cur1;
     const unsigned int curR = layout == 0 ? cur1 : cur2;
@@ -90,9 +90,6 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
     unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
     const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
-    // const unsigned int curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset
-    // const unsigned int curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    // const unsigned int curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv);
     const unsigned int kidx = thread_col*8;
     const int4 curIdx = inputIndices<0>(kidx, param);
     const int curC = curIdx.x;
@@ -172,17 +169,6 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
     const int ohow_res = fastmodulo(npqz_res, param.OHOW_fastdiv);
     const int posh_ori = fastdiv(ohow_res, param.OW_fastdiv) * param.stride1 - param.padding1;
     const int posw_ori = fastmodulo(ohow_res, param.OW_fastdiv) * param.stride0 - param.padding0;
-    // unsigned int n = fastdiv(gemm_i, param.OHOW_fastdiv);
-    // unsigned int npq_res = fastmodulo(gemm_i, param.OHOW_fastdiv);
-    // int posh_ori = fastdiv(npq_res, param.OW_fastdiv) * param.stride1 - param.padding1;
-    // int posw_ori = fastmodulo(npq_res, param.OW_fastdiv) * param.stride0 - param.padding0;
-    // unsigned int inOffset = n * inNOffset;
-    // const unsigned int curR = fastdiv(thread_col*8, param.SC_fastdiv); // channel offset
-    // const unsigned int curS = fastdiv(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    // const unsigned int curC = fastmodulo(fastmodulo(thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    // int curH = posh_ori + curR * param.dilation1; // input h
-    // int curW = posw_ori + curS * param.dilation0; // input w
-
     const int curD = posd_ori + curIdx.y * param.dilation2; // input d
     const int curH = posh_ori + curIdx.z * param.dilation1; // input h
     const int curW = posw_ori + curIdx.w * param.dilation0; // input w
@@ -193,9 +179,6 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
         dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK_2) >> SWIZZLE_BITS_2);
         if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && curC < param.c){
             int inOffsetTmp = curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC;
-            // if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h &&
-            //     curR < param.r && curS < param.s && curC < param.c){
-            // const unsigned int inOffsetTmp = curH * inChannelOffset + curW * param.c + curC;
             dst_float4[dst_index] = reinterpret_cast<const float4 *>(&src[n * inNOffset + inOffsetTmp])[0];
         } else{
             dst_float4[dst_index] = make_float4(0.f, 0.f, 0.f, 0.f);
@@ -250,31 +233,18 @@ __device__ __forceinline__ void tileMemcpyLoadA(
    #pragma unroll
    for (unsigned int i = 0; i < NUM_ITERS; i++){
        unsigned int gemm_i = blockIdx.y * TILE_ROWS + thread_row;
-        // unsigned int n = fastdiv(gemm_i, param.OHOW_fastdiv);
-        // unsigned int npq_res = fastmodulo(gemm_i, param.OHOW_fastdiv);
-        // int posh_ori = fastdiv(npq_res, param.OW_fastdiv) * param.stride1 - param.padding1;
-        // int posw_ori = fastmodulo(npq_res, param.OW_fastdiv) * param.stride0 - param.padding0;
        unsigned int n = fastdiv(gemm_i, param.PQZ_fastdiv);
        const unsigned int npqz_res = fastmodulo(gemm_i, param.PQZ_fastdiv);
        const int posd_ori = fastdiv(npqz_res, param.OHOW_fastdiv) * param.stride2 - param.padding2;
        const int ohow_res = fastmodulo(npqz_res, param.OHOW_fastdiv);
        const int posh_ori = fastdiv(ohow_res, param.OW_fastdiv) * param.stride1 - param.padding1;
        const int posw_ori = fastmodulo(ohow_res, param.OW_fastdiv) * param.stride0 - param.padding0;
-        // unsigned int inOffset = n * param.c * param.h * param.w;
-        // const unsigned int curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset
-        // const unsigned int curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-        // const unsigned int curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-        // int curH = posh_ori + curR * param.dilation1; // input h
-        // int curW = posw_ori + curS * param.dilation0; // input w
        const int curD = posd_ori + curIdx.y * param.dilation2; // input d
        const int curH = posh_ori + curIdx.z * param.dilation1; // input h
        const int curW = posw_ori + curIdx.w * param.dilation0; // input w
        const int curC = curIdx.x;
        if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && curC < param.c){
            int inOffsetTmp = curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC;
-            // if (curH >= 0 && curW >= 0 && curW < param.w && curH < param.h &&
-            //     curR < param.r && curS < param.s && curC < param.c){
-            // const unsigned int inOffsetTmp = curH * inChannelOffset + curW * param.c + curC;
            dst_reg[i] = reinterpret_cast<const float4 *>(&src[n * inNOffset + inOffsetTmp])[0];
        } else{
            dst_reg[i] = make_float4(0.f, 0.f, 0.f, 0.f);
@@ -322,9 +292,6 @@ __device__ __forceinline__ void tileMemcpyLoadB(
 
    // compile time check that we provided the right amount of registers for storage
    static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
-    // const unsigned int curR = fastdiv(block_k+thread_col*8, param.SC_fastdiv); // channel offset
-    // const unsigned int curS = fastdiv(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv); // kernel r offset
-    // const unsigned int curC = fastmodulo(fastmodulo(block_k+thread_col*8, param.SC_fastdiv), param.C_fastdiv);
    const unsigned int kidx = block_k + thread_col*8;
    const int4 curIdx = inputIndices<0>(kidx, param);
    const int curC = curIdx.x;
@@ -334,7 +301,6 @@ __device__ __forceinline__ void tileMemcpyLoadB(
    #pragma unroll
    for (unsigned int i = 0; i < NUM_ITERS; i++){
        const unsigned int src_index = thread_row * src_stride + block_k + thread_col * 8;
-        // if (thread_row < param.k && curR < param.r && curS < param.s && curC < param.c){
        // TODO : move some checks outside of the loop
        if (thread_row < param.k && curR < param.r && curS < param.s && curT < param.t && curC < param.c){
            dst_reg[i] = reinterpret_cast<const float4 *>(&src[src_index])[0];
diff --git a/tests/test-conv3d.cpp b/tests/test-conv3d.cpp
index 53e37efd31..92e9d1e457 100644
--- a/tests/test-conv3d.cpp
+++ b/tests/test-conv3d.cpp
@@ -38,7 +38,9 @@
 
 
 
-void load_model(test_model & model, int ic, int oc, int iw, int ih, int id, int kw = 3, int kh = 3, int kd = 3, bool use_gpu = false ) {
+void load_model(test_model & model, int ic, int oc, int iw, int ih, int id,
+                int kw = 3, int kh = 3, int kd = 3,
+                bool use_fp16 = true, bool use_gpu = false ) {
     // create data
     int KW = kw, KH = kh, KD = kd;
     int IC = ic, OC = oc;
@@ -72,9 +74,10 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, int id, int
     }
 
     size_t buffer_size = 0;
-    {
-        // buffer_size += KW * KH * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
-        buffer_size += KW * KH * KD * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
+    { if(use_fp16)
+        buffer_size += KW * KH * KD * IC * OC * ggml_type_size(GGML_TYPE_F16); // tensor a
+      else
+        buffer_size += KW * KH * KD * IC * OC * ggml_type_size(GGML_TYPE_F32); // tensor a
         buffer_size += IW * IH * ID * IC * N * ggml_type_size(GGML_TYPE_F32); // tensor b
         buffer_size += 1024; // overhead
     }
@@ -122,8 +125,10 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, int id, int
     model.ctx = ggml_init(params);
 
     // create tensors
-    model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, KD, IC*OC);
-    // model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, IC, OC);
+    if(use_fp16)
+        model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F16, KW, KH, KD, IC*OC);
+    else
+        model.a = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, KW, KH, KD, IC*OC);
     model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, ID, IC*N);
 
     // create a allocator
@@ -134,11 +139,15 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, int id, int
 
     // load data to buffer
     if(ggml_backend_is_cpu(model.backend)) {
-        memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
-        // memcpy(model.a->data, adata.data(), ggml_nbytes(model.a));
+        if(use_fp16)
+            memcpy(model.a->data, hadata.data(), ggml_nbytes(model.a));
+        else
+            memcpy(model.a->data, adata.data(), ggml_nbytes(model.a));
     } else {
-        ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
-        // ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a));
+        if(use_fp16)
+            ggml_backend_tensor_set(model.a, hadata.data(), 0, ggml_nbytes(model.a));
+        else
+            ggml_backend_tensor_set(model.a, adata.data(), 0, ggml_nbytes(model.a));
     }
 
     // alloc memory
@@ -155,7 +164,7 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, int id, int
     }
 }
 
-typedef struct ggml_cgraph* (*build_graph_t)(const test_model& model, 
+typedef struct ggml_cgraph* (*build_graph_t)(const test_model& model,
                                              const int64_t i0, const int64_t i1, const int64_t i2);
 
 struct ggml_cgraph * build_graph_0(const test_model& model, const int64_t ic, const int64_t n, const int64_t oc) {
@@ -173,18 +182,27 @@ struct ggml_cgraph * build_graph_0(const test_model& model, const int64_t ic, co
 
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
+    // int s0 = 2;
+    // int s1 = 1;
+    // int s2 = 1;
+    // int p0 = 2;
+    // int p1 = 0;
+    // int p2 = 1;
+    // int d0 = 1;
+    // int d1 = 1;
+    // int d2 = 2;
+
     int s0 = 1;
     int s1 = 1;
     int s2 = 1;
     int p0 = 1;
     int p1 = 1;
     int p2 = 1;
+
     int d0 = 1;
     int d1 = 1;
     int d2 = 1;
-
-    // recalculate for avoid fragmentation
     struct ggml_tensor* conv2d_res = ggml_conv_3d(ctx0, model.a, model.b, ic,
                                                   s0, s1, s2, p0, p1, p2, d0, d1, d2);
 
     ggml_set_name(conv2d_res, "conv2d_res");
@@ -227,6 +245,16 @@ struct ggml_cgraph * build_graph_1(const test_model& model, const int64_t ic, co
     int d1 = 1;
     int d2 = 1;
 
+    // int s0 = 2;
+    // int s1 = 1;
+    // int s2 = 1;
+    // int p0 = 2;
+    // int p1 = 0;
+    // int p2 = 1;
+    // int d0 = 1;
+    // int d1 = 1;
+    // int d2 = 2;
+
     // recalculate for avoid fragmentation
     // struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
     // ggml_set_name(conv2d_res, "conv2d_res");
@@ -236,7 +264,7 @@ struct ggml_cgraph * build_graph_1(const test_model& model, const int64_t ic, co
 
     // struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
 
-    struct ggml_tensor* wino_res = ggml_conv_3d_direct(ctx0, model.a, model.b, 
+    struct ggml_tensor* wino_res = ggml_conv_3d_direct(ctx0, model.a, model.b,
                                                        s0, s1, s2, p0, p1, p2, d0, d1, d2, ic, n, oc);
     ggml_set_name(wino_res, "wino_res");
 
@@ -251,7 +279,7 @@ struct ggml_cgraph * build_graph_1(const test_model& model, const int64_t ic, co
 
 
 std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr,
-                                 build_graph_t build_graph, int iters, 
+                                 build_graph_t build_graph, int iters,
                                  const int64_t ic, const int64_t n, const int64_t oc, double *t) {
     struct ggml_cgraph * gf = build_graph(model, ic, n, oc);
 
@@ -271,7 +299,6 @@ std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr
     }
 #endif
 
-
     ggml_backend_graph_compute(model.backend, gf);
 
 
@@ -289,8 +316,6 @@ std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr
 
     double time_us = end_time - start_time;
     time_us = time_us/iters;
-    // printf(" Taking %f ms\n ", time_us/1000);
-    //ggml_graph_print(gf);
 
     struct ggml_tensor *res = NULL;
 
@@ -316,12 +341,6 @@ int main(void)
 {
     ggml_time_init();
     std::vector<std::tuple<int, int, int, int, int, int, int, int>> configs = {
-        // std::make_tuple(64,64,48,64,3,3),
-        // std::make_tuple(320,320,104,152,3,3),
-        // std::make_tuple(640,640,52,76,3,3),
-        // std::make_tuple(640,640,104,152,3,3),
-        // std::make_tuple(960,320,104,152,3,3),
-        // std::make_tuple(1280,1280,26,38,3,3),
         std::make_tuple(320,1280,26,38,8,3,3,3),
         std::make_tuple(1280,1280,26,38,8,3,3,3),
         std::make_tuple(320,1280,52,76,8,3,3,3),
@@ -330,29 +349,14 @@ int main(void)
         std::make_tuple(1280,1280,104,152,8,3,3,3),
         std::make_tuple(320,1280,208,304,4,3,3,3),
         std::make_tuple(640,1280,208,304,4,3,3,3),
-        // std::make_tuple(1280,1280,26,38,1,1),
-        // std::make_tuple(256,128,768,1024,3,3),
-        // std::make_tuple(128,3,768,1024,3,3),
-        // std::make_tuple(256,128,768,1024,1,1),
-        // std::make_tuple(512,256,384,512,1,1),
-        // std::make_tuple(1280,640,52,76,3,3),
-        // std::make_tuple(1920,1280,26,38,3,3),
-        // std::make_tuple(2560,1280,26,38,3,3),
-        // std::make_tuple(320,1280,26,38,3,3),
-        // std::make_tuple(512,512,104,152,3,3),
-        // std::make_tuple(512,512,208,304,3,3),
-        // std::make_tuple(512,256,416,608,3,3),
-        // std::make_tuple(256,128,832,1216,3,3),
-        // std::make_tuple(256,256,832,1216,3,3),
-        // std::make_tuple(320,256,1024,1920)
     };
 
     int k = 0;
 
     for (auto c : configs){
         test_model model;
-        load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c), 
-                   std::get<3>(c), std::get<4>(c), std::get<5>(c), std::get<6>(c), std::get<7>(c), true);
+        load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c),
+                   std::get<3>(c), std::get<4>(c), std::get<5>(c), std::get<6>(c), std::get<7>(c), true, true);
 
         ggml_gallocr_t allocr = NULL;
         allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
@@ -366,11 +370,11 @@ int main(void)
 
         // fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
 
-        struct ggml_cgraph * gf_res_0 = NULL; 
+        struct ggml_cgraph * gf_res_0 = NULL;
         int iterations = 20;
 
         double run_time0;
-        std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations, 
+        std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations,
                                                        std::get<0>(c), 1, std::get<1>(c), &run_time0);
 
         ggml_gallocr_free(allocr);
@@ -386,23 +390,22 @@ int main(void)
         ggml_gallocr_reserve(allocr, gf);
         size_t mem_size1 = ggml_gallocr_get_buffer_size(allocr, 0);
         // fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
-
-        struct ggml_cgraph * gf_res_1 = NULL; 
+        struct ggml_cgraph * gf_res_1 = NULL;
         double run_time1;
 
         // std::vector<float> wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
-        std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_1, iterations, 
+        std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_1, iterations,
                                                        std::get<0>(c), 1, std::get<1>(c), &run_time1);
 
-        if(k==0) { 
+        if(k==0) {
             k = 1;
             fprintf(stderr, "| (IC, OC, IW, IH, ID, KW, KH, KD) | im2col+GEMM TIME | im2col+GEMM VRAM | implicit GEMM TIME | implicit GEMM VRAM \n");
             fprintf(stderr, "| --- | --- | --- | --- | --- \n");
         }
 
-        fprintf(stderr, " | (%d, %d, %d, %d, %d, %d, %d, %d) | %.2f ms | %.2f MB | %.2f ms | %.2f MB\n", 
-                std::get<0>(c), std::get<1>(c), std::get<2>(c), 
+        fprintf(stderr, " | (%d, %d, %d, %d, %d, %d, %d, %d) | %.2f ms | %.2f MB | %.2f ms | %.2f MB\n",
+                std::get<0>(c), std::get<1>(c), std::get<2>(c),
                 std::get<3>(c), std::get<4>(c), std::get<5>(c), std::get<6>(c), std::get<7>(c),
                 run_time0, mem_size0/1024.0f/1024.0f,
@@ -412,7 +415,7 @@ int main(void)
         // for(int i = 0; i < conv2d_data.size(); i++) {
         //     float diff = fabs(im2col_data[i] - conv2d_data[i]);
         //     // if(diff > 0.5) {
-        //        printf("(%7.3f, %7.3f, %.2f, %d) \n",
+        //        printf("(%7.3f, %7.3f, %f, %d) \n",
         //             im2col_data[i], conv2d_data[i],
         //             diff, i);
         //     // break;
@@ -425,7 +428,5 @@ int main(void)
 
         ggml_gallocr_free(allocr);
     }
-
-    // printf("\nPerforming test:\n");
     return 0;
 }