WIP

2025-11-15 09:58:23 -05:00 · 2025-11-15 09:58:23 -05:00 · e489dd2773
parent e10b495dd2
commit e489dd2773
3 changed files with 49 additions and 25 deletions
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@ -913,13 +913,16 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
  // for (unsigned int block_k = 1; block_k <= num_block_tiles_k; block_k++){
  int s = 0;
  int r = 0;
-  while (block_k < num_block_tiles_k){
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
-  #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+  while (block_krs < num_block_tiles_krs) {
    asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
-  #endif
+#else
  while (block_k < num_block_tiles_k) {
 #endif
    __syncthreads();
-      // moves to the next tile
+      // moves to the next channel block tile
      int next_idx = 0;
      ++s;
      if (s == param.s) {
@ -954,7 +957,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
      //   break;
      // if(thread_idx == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
-      //   printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d %d \n", s, r, block_k, next_idx, block_krs, num_block_tiles_k);
+      //   printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d,  %d, %d \n", s, r, block_k, next_idx, 
      //     block_krs, num_block_tiles_k, num_block_tiles_krs);
      // }
    // if (block_k != num_block_tiles_k){
@ -1044,8 +1048,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
    __syncthreads();
-    half* A_warp_tile = SA2 + A_warp_tile_offset;
+    half* A_warp_tile = SA1 + A_warp_tile_offset;
-    half* B_warp_tile = SB2 + B_warp_tile_offset;
+    half* B_warp_tile = SB1 + B_warp_tile_offset;
    ldmatrix_a<mma_tiles_per_warp_m, mma_tiles_per_warp_k, BK>(A_warp_tile, A_register_);
    ldmatrix_b<mma_tiles_per_warp_k, mma_tiles_per_warp_n, BK>(B_warp_tile, B_register_);
    // outer product between mma tiles
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -5826,7 +5826,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    for (uint32_t s0 : { 1, 3 }) {
        for (uint32_t p1 : { 2, 5 }) {
-            for (uint32_t Cin : { 1, 25 }) {
+            for (uint32_t Cin : { 1, 25, 32 }) {
                for (uint32_t Cout : { 1, 12 }) {
                    for (uint32_t KH : { 1, 2, 3, 11 }) {
                        for (uint32_t KW : { 1, 2, 3, 11 }) {
@ -5854,6 +5854,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
            GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
    test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 1 }, { 3, 3, 128, 8},
            GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
    test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 3 }, { 3, 3, 128, 8},
            GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
    // sycl backend will limit task global_range < MAX_INT
--- a/tests/test-conv2d.cpp
+++ b/tests/test-conv2d.cpp
@ -43,7 +43,7 @@ struct ggml_cgraph * build_graph_1(const test_model&);
 void load_model(test_model & model, int ic, int oc, int iw, int ih, int kw = 3, int kh = 3, bool use_gpu = false ) {
    // create data
    int KW = kw, KH = kh, IC = ic, OC = oc;
-    int IW = iw, IH = ih, N = 1;
+    int IW = iw, IH = ih, N = 2;
    // srand(time(NULL));
    // printf(" input: IC = %d, OC = %d, IW = %d, IH = %d \n ", IC, OC, IW, IH);
@ -176,12 +176,19 @@ struct ggml_cgraph * build_graph_0(const test_model& model) {
    struct ggml_cgraph  * gf = ggml_new_graph(ctx0);
-    int s0 = 1;
+    // int s0 = 1;
-    int s1 = 1;
+    // int s1 = 1;
-    int p0 = 1;
+    // int p0 = 1;
-    int p1 = 1;
+    // int p1 = 1;
-    int d0 = 1;
+    // int d0 = 1;
-    int d1 = 1;
+    // int d1 = 1;
    int s0 = 3;
    int s1 = 5;
    int p0 = 5;
    int p1 = 5;
    int d0 = 2;
    int d1 = 4;
    // recalculate for avoid fragmentation
    struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
@ -215,12 +222,21 @@ struct ggml_cgraph * build_graph_1(const test_model& model) {
    struct ggml_cgraph  * gf = ggml_new_graph(ctx0);
-    int s0 = 1;
+    // int s0 = 1;
-    int s1 = 1;
+    // int s1 = 1;
-    int p0 = 1;
+    // int p0 = 1;
-    int p1 = 1;
+    // int p1 = 1;
-    int d0 = 1;
+    // int d0 = 1;
-    int d1 = 1;
+    // int d1 = 1;
    int s0 = 3;
    int s1 = 5;
    int p0 = 5;
    int p1 = 5;
    int d0 = 2;
    int d1 = 4;
    // recalculate for avoid fragmentation
    // struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
@ -301,7 +317,8 @@ static std::vector<std::tuple<int, int, int, int, int, int>> configs = {
        // std::make_tuple(960,320,104,152,3,3),
        // std::make_tuple(1280,1280,26,38,3,3),
        // std::make_tuple(1920,640,32,32,3,3)
-        std::make_tuple(1280,1280,16,16,3,3),
+        // std::make_tuple(1280,1280,16,16,3,3),
        std::make_tuple(32,12,141,133,3,3),
        // std::make_tuple(32,8,24,24,3,3),
        // std::make_tuple(640,640,64,64,3,3),
        // std::make_tuple(320,640,32,32,3,3),
@ -718,12 +735,12 @@ int main(void)
        // for(int i = 0; i < 26*38; i++) {
        for(int i = 0; i < conv2d_data.size(); i++) {
            float diff = fabs(im2col_data[i] - conv2d_data[i]);
-            // if(diff > 0.5) {
+            if(diff > 0.5) {
                  printf("(%7.3f, %7.3f, %.2f, %d) \n",
                  im2col_data[i], conv2d_data[i],
                  diff, i);
-                // break;
+                break;
-            // }
+            }
        }
        ggml_free(model.ctx);