From 6c90c20cb1ff375c8b3afb1b0544e088ebb9725c Mon Sep 17 00:00:00 2001
From: bssrdf <merlintiger@hotmail.com>
Date: Fri, 24 Oct 2025 15:33:57 -0400
Subject: [PATCH] WIP: bug fix

---
 ggml/src/ggml-cuda/conv2d-implicit.cu  | 15 +++++++++++++-
 ggml/src/ggml-cuda/conv2d-implicit.cuh |  6 ++++--
 tests/test-conv2d-implicit.cpp         | 27 +++++++++++++-------------
 3 files changed, 32 insertions(+), 16 deletions(-)
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index f08e19e9fb..de2cf4aecb 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -1081,7 +1081,11 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
   tileMemcpySwizzleB<BN, NUM_THREADS>(B_block_gmem, B_block_smem, weightKOffset, param);
 
   // construct const pointers to warp tiles for use inside the inner loop
-
+//   if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x ==0 && blockIdx.y ==0){
+//     for(int i = 0; i < 32; ++i)
+//         printf("%.2f,", __half2float(A_block_smem[i]));
+//     printf("\n");
+//   }
 
   int offset_direction = 1;
 
@@ -1127,6 +1131,14 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
         }
       }
     }
+    if(threadIdx.x == 0 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){
+     printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(acc_register_[0][0][0]), __half2float(acc_register_[0][0][1]), 
+     __half2float(acc_register_[0][0][2]), __half2float(acc_register_[0][0][3]));
+     printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(A_register_[0][0][0]), __half2float(A_register_[0][0][1]), 
+     __half2float(A_register_[0][0][2]), __half2float(A_register_[0][0][3]));
+     printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(B_register_[0][0][0]), __half2float(B_register_[0][0][1]), 
+     __half2float(B_register_[0][0][2]), __half2float(B_register_[0][0][3]));
+    }
 
 
     if (block_k != num_block_tiles_k)
@@ -1141,6 +1153,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
     }
   }
 
+  
     // reuse smem
     half *smemoutput = shmem;
     const uint lane_id = threadIdx.x % WARPSIZE;  
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cuh b/ggml/src/ggml-cuda/conv2d-implicit.cuh
index 7d966705b8..3ea0461218 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh
@@ -159,7 +159,8 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
     #pragma unroll
     for (unsigned int i = 0; i < NUM_ITERS; i++)
     {
-        unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row;
+        // unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row;
+        unsigned int gemm_i = blockIdx.y * TILE_ROWS + thread_row;
         unsigned int n = fastdiv(gemm_i, param.OHOW_fastdiv);
         unsigned int npq_res = fastmodulo(gemm_i, param.OHOW_fastdiv);
         int posh_ori = fastdiv(npq_res, param.OW_fastdiv) * param.u - param.p;
@@ -227,7 +228,8 @@ __device__ __forceinline__ void tileMemcpyLoadA(
         // const unsigned int src_index = thread_row * src_stride_vectorized + thread_col;
         // dst_reg[i] = src_float4[src_index];
         // thread_row += ROW_STEP;
-        unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row;
+        // unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row;
+        unsigned int gemm_i = blockIdx.y * TILE_ROWS + thread_row;
         unsigned int n = fastdiv(gemm_i, param.OHOW_fastdiv);
         unsigned int npq_res = fastmodulo(gemm_i, param.OHOW_fastdiv);
         int posh_ori = fastdiv(npq_res, param.OW_fastdiv) * param.u - param.p;
diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp
index 3685a10d72..58cd74e7a4 100644
--- a/tests/test-conv2d-implicit.cpp
+++ b/tests/test-conv2d-implicit.cpp
@@ -48,7 +48,7 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
     // Initialize adata
     std::vector<float> adata(KW * KH * IC * OC);
     for (int i = 0; i < KW * KH * IC * OC; i++) {
-        adata[i] = 2.5f;
+        adata[i] = 2.f;
     }
 
     // Convert adata to fp16 format
@@ -344,7 +344,7 @@ int main(void)
         // std::make_tuple(640,640,52,76),
         // std::make_tuple(640,640,104,152),
         // std::make_tuple(960,320,104,152),
-        std::make_tuple(160,1280,26,38),
+        std::make_tuple(128,1280,26,38),
         // std::make_tuple(1280,640,52,76),
         // std::make_tuple(1920,1280,26,38),
         // std::make_tuple(2560,1280,26,38),
@@ -398,7 +398,8 @@ int main(void)
         struct ggml_cgraph * gf_res_1 = NULL;    
 
         double run_time1;
-        std::vector<float> wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
+        // std::vector<float> wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
+        conv2d_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
 
 
         ggml_gallocr_free(allocr);
@@ -419,7 +420,7 @@ int main(void)
         struct ggml_cgraph * gf_res_2 = NULL;
 
         double run_time2;
-        wino_data = compute_graph(model, allocr, build_graph_2, iterations, &run_time2);
+        std::vector<float> wino_data = compute_graph(model, allocr, build_graph_2, iterations, &run_time2);
 
 
         if(k==0) { 
@@ -436,15 +437,15 @@ int main(void)
 
 
         // for(int i = 0; i < ggml_nelements(wino_res); i++) {
-        for(int i = 0; i < 26*38; i++) {
-            float diff = fabs(conv2d_data[i] - wino_data[i]);
-            // if(diff > 1.e-4) {
-                  printf("(%f, %f, %f, %d) \n",
-                  conv2d_data[i],
-                  wino_data[i], diff, i);
-                // break;
-            // }
-        }
+        // for(int i = 0; i < 26*38; i++) {
+        //     float diff = fabs(conv2d_data[i] - wino_data[i]);
+        //     // if(diff > 1.e-4) {
+        //           printf("(%f, %f, %f, %d) \n",
+        //           conv2d_data[i],
+        //           wino_data[i], diff, i);
+        //         // break;
+        //     // }
+        // }
 
         ggml_free(model.ctx);
         ggml_backend_buffer_free(model.buffer);