From a660d4d45d5184b2910d9b7b9f59e01bd36b69a8 Mon Sep 17 00:00:00 2001
From: bssrdf
Date: Mon, 10 Nov 2025 12:39:50 -0500
Subject: [PATCH] get rid of a convert unary kernel call and fuse the type
 cast into conv epilogue

---
 ggml/src/ggml-cuda/conv2d-implicit.cu | 32 +++++++++++++--------------
 tests/test-conv2d.cpp                 |  6 ++---
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index 09f317e2d3..874d40e80b 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -582,11 +582,11 @@ __device__ __forceinline__ void ldmatrix_b(
 #endif
 }
 
-template <...>
+template <typename T, ...>
 static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
                                               const half * __restrict__ kernel,
-                                              half * __restrict__ output,
+                                              T * __restrict__ output,
                                               const param_t param) {
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -763,10 +763,10 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
                     const uint outOffset = z * NKPQ + n * KPQ + row * PQ + col;
-                    output[outOffset] = res_[0];
+                    output[outOffset] = ggml_cuda_cast<T>(res_[0]);
                 } else {
                     const uint outOffset = n * KPQ + row * PQ + col;
-                    output[outOffset] = res_[0];
+                    output[outOffset] = ggml_cuda_cast<T>(res_[0]);
                 }
             }
             if (n < param.n && row+1 < param.k && col < PQ) {
@@ -774,10 +774,10 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
                     const uint outOffset = z * NKPQ + n * KPQ + (row+1) * PQ + col;
-                    output[outOffset] = res_[1];
+                    output[outOffset] = ggml_cuda_cast<T>(res_[1]);
                 } else {
                     const uint outOffset = n * KPQ + (row+1) * PQ + col;
-                    output[outOffset] = res_[1];
+                    output[outOffset] = ggml_cuda_cast<T>(res_[1]);
                 }
             }
         }
@@ -848,12 +848,12 @@ static void launch_conv2d_implicit_split_kernel(ggml_backend_cuda_context & ctx,
 
     int id = ggml_cuda_get_device();
     ggml_cuda_pool_alloc<...> Y_H(ctx.pool(id), ksplit * P.k * P.Oh * P.Ow * P.n);
-    cudaFuncSetAttribute(conv2d_implicit_kernel<...>,
+    cudaFuncSetAttribute(conv2d_implicit_kernel<half, ...>,
                 cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
 
     dim3 gridDim(BlocksN, BlocksM, ksplit);
     dim3 blockDim(ThreadsN, ThreadsM);
-    conv2d_implicit_kernel<...><<<...>>>(X_H, K_H, Y_H.get(), P);
+    conv2d_implicit_kernel<half, ...><<<...>>>(X_H, K_H, Y_H.get(), P);
     const unsigned int nrows = P.n * P.k * P.Oh * P.Ow;
@@ -866,7 +866,7 @@ static void launch_conv2d_implicit_split_kernel(ggml_backend_cuda_context & ctx,
 
 static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D,
                                      float * Y_D, int cc, const param_t P, cudaStream_t st) {
     // if (GGML_CUDA_CC_IS_NVIDIA(cc) && turing_mma_available(cc) && P.c % 8 == 0 && (P.r > 1 || P.s > 1)) {
-    if (GGML_CUDA_CC_IS_NVIDIA(cc) && turing_mma_available(cc) && P.c % 8 == 0 && (P.r > 1 || P.s > 1)) {
+    if (GGML_CUDA_CC_IS_NVIDIA(cc) && turing_mma_available(cc) && P.c % 8 == 0) {
 
         int id = ggml_cuda_get_device();
@@ -883,8 +883,10 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
         ne = P.c * P.r * P.s * P.k;
         ne01 = P.r * P.s;
 
-        ggml_cuda_pool_alloc<...> kernel_f16(ctx.pool(id), ne);
+        // ggml_cuda_pool_alloc<...> kernel_f16(ctx.pool(id), ne);
+        ggml_cuda_pool_alloc<...> kernel_f16(ctx.pool(id));
         if (ne01 > 1){
+            kernel_f16.alloc(ne);
             dim3 dimGrid1((ne00 + CUDA_NCHW_2_NHWC_BLOCK_C - 1) / CUDA_NCHW_2_NHWC_BLOCK_C,
                           (ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM,
                           1) ;
@@ -973,7 +975,6 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
             if(candidate != -1){
                 j = candidate;
-                // printf(" choosing %d, %d \n", j, max_remaining_waves);
                 if (j == 2) {
                     launch_conv2d_implicit_split_kernel(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
@@ -1023,18 +1024,15 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
                 return;
             }
         }
-        ggml_cuda_pool_alloc<...> Y_H(ctx.pool(id), P.k * P.Oh * P.Ow * P.n);
-        cudaFuncSetAttribute(conv2d_implicit_kernel<...>,
+        cudaFuncSetAttribute(conv2d_implicit_kernel<float, ...>,
                 cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
         dim3 gridDim(BlocksN, BlocksM);
         dim3 blockDim(ThreadsN, ThreadsM);
-        conv2d_implicit_kernel<...>
-                <<<...>>>(X_H, K_H, Y_H.get(), P);
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(Y_H.get(), Y_D, P.k * P.Oh * P.Ow * P.n, st);
+        conv2d_implicit_kernel<float, ...>
+                <<<...>>>(X_H, K_H, Y_D, P);
     } else{
         conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
     }
diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp
index a5f2a6aef6..23a3aab366 100644
--- a/tests/test-conv2d.cpp
+++ b/tests/test-conv2d.cpp
@@ -299,7 +299,7 @@ static std::vector<...> configs = {
     // std::make_tuple(640,640,52,76,3,3),
    // std::make_tuple(640,640,104,152,3,3),
     // std::make_tuple(960,320,104,152,3,3),
-    std::make_tuple(1280,1280,26,38,3,3),
+    // std::make_tuple(1280,1280,26,38,3,3),
     // std::make_tuple(1920,640,32,32,3,3)
     // std::make_tuple(1280,1280,16,16,3,3),
     // std::make_tuple(320,640,32,32,3,3),
@@ -317,7 +317,7 @@ static std::vector<...> configs = {
     // std::make_tuple(1920,1280,26,38,3,3),
     // std::make_tuple(2560,1280,26,38,3,3),
     // std::make_tuple(320,1280,26,38,3,3),
-    // std::make_tuple(512,512,104,152,3,3),
+    std::make_tuple(512,512,104,152,3,3),
     // std::make_tuple(512,512,208,304,3,3),
     // std::make_tuple(512,256,416,608,3,3),
     // std::make_tuple(256,128,832,1216,3,3),
@@ -714,7 +714,7 @@ int main(void)
     // for(int i = 0; i < ggml_nelements(wino_res); i++) {
     // for(int i = 0; i < 26*38; i++) {
-    // // for(int i = 0; i < conv2d_data.size(); i++) {
+    // for(int i = 0; i < conv2d_data.size(); i++) {
     //     float diff = fabs(im2col_data[i] - conv2d_data[i]);
     //     // if(diff > 0.5) {
     //        printf("(%7.3f, %7.3f, %.2f, %d) \n",
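
For reference, the pattern the patch relies on -- templating the kernel on its output type and doing the cast in the store epilogue instead of launching a separate convert kernel afterwards -- can be sketched in isolation. Everything below (cast_out, epilogue_store, the main() driver) is made up for illustration and is not code from this patch; only ggml_cuda_cast<T> and the templated-output idea mirror what conv2d_implicit_kernel now does. Build with nvcc -std=c++17.

    // Standalone sketch of fusing a type cast into a kernel epilogue.
    #include <cuda_fp16.h>
    #include <cstdio>
    #include <type_traits>

    // Stand-in for ggml_cuda_cast<T>: convert an f16 accumulator to the
    // requested output type at the point of the final store.
    template <typename dst_t>
    __device__ __forceinline__ dst_t cast_out(half x) {
        if constexpr (std::is_same_v<dst_t, half>) {
            return x;                        // no conversion needed
        } else {
            return (dst_t) __half2float(x);  // fused f16 -> f32 cast
        }
    }

    // Templating the output pointer lets one kernel serve both the f16 and
    // the f32 path, so no separate convert-unary launch or temporary buffer
    // is needed for the f32 case.
    template <typename T>
    __global__ void epilogue_store(const half * __restrict__ acc,
                                   T * __restrict__ out, int n) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            out[i] = cast_out<T>(acc[i]);    // cast happens in the epilogue
        }
    }

    int main() {
        const int n = 256;
        half  * acc = nullptr;
        float * out = nullptr;
        cudaMalloc(&acc, n * sizeof(half));
        cudaMalloc(&out, n * sizeof(float));
        cudaMemset(acc, 0, n * sizeof(half));

        // T is deduced from the output pointer: float here, half elsewhere.
        epilogue_store<<<(n + 255) / 256, 256>>>(acc, out, n);
        cudaDeviceSynchronize();
        printf("done: %s\n", cudaGetErrorString(cudaGetLastError()));

        cudaFree(acc);
        cudaFree(out);
        return 0;
    }

Because the output type is deduced (or given explicitly) at the launch site, the same kernel can keep writing half in the split-k path while writing float directly into Y_D in the fused path, which is what allows the to_fp32_cuda pass and its intermediate Y_H buffer to be dropped in the hunk at line 1023 above.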