properly use __CUDA_ARCH__ to protect the tensor path

2025-10-24 21:56:58 -04:00 · 2025-10-24 21:56:58 -04:00 · 980ddc1e87
parent 24b553204b
commit 980ddc1e87
3 changed files with 95 additions and 135 deletions
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@ -742,7 +742,7 @@ __device__ __forceinline__ void ldmatrix_a(
  half (&reg)[mma_tiles_per_warp_m][mma_tiles_per_warp_k][4]
 )
 {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE    
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
  static_assert(mma_tiles_per_warp_m == 8, "mma_tiles_per_warp_m must be 4");
  static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4");

@ -885,11 +885,11 @@ __device__ __forceinline__ void ldmatrix_a(
      : "=r"(reg_[6][3][0]), "=r"(reg_[6][3][1]), "=r"(reg_[7][3][0]), "=r"(reg_[7][3][1])
      : "r"(src_addr + 96 * smem_stride_)
    );
-// #else
-//     GGML_UNUSED(src);
-//     GGML_UNUSED(reg);
-//     NO_DEVICE_CODE;
-// #endif
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(reg);
+    NO_DEVICE_CODE;
+#endif
 }

 template <unsigned int mma_tiles_per_warp_k, unsigned int mma_tiles_per_warp_n, unsigned int smem_stride>
@ -898,7 +898,7 @@ __device__ __forceinline__ void ldmatrix_b(
  half (&reg)[mma_tiles_per_warp_k][mma_tiles_per_warp_n][2]
 )
 {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE    
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
  static_assert(mma_tiles_per_warp_k == 4, "mma_tiles_per_warp_k must be 4");
  static_assert(mma_tiles_per_warp_n == 8, "mma_tiles_per_warp_n must be 8");
  
@ -989,11 +989,11 @@ __device__ __forceinline__ void ldmatrix_b(
    // : "r"(src_addr ^ 0b1000000)
    : "r"(src_addr + 32 * smem_stride_)
  );
-// #else
-//     GGML_UNUSED(src);
-//     GGML_UNUSED(reg);
-//     NO_DEVICE_CODE;
-// #endif
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(reg);
+    NO_DEVICE_CODE;
+#endif
 }

 template<const int BM, const int BN, const int BK, const int WM, const int WN,
@ -1002,7 +1002,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
                                              const half * __restrict__ kernel,
                                              half * __restrict__ output,
                                              const param_t param) {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE    
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
  constexpr unsigned int MMA_M = 16;
  constexpr unsigned int MMA_N = 8;

@ -1010,7 +1010,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
 //      printf("conv2d_implicit_kernel launch BM:%d, BN:%d, BK:%d, WM:%d, WN:%d, WK:%d, NUM_THREADS:%d \n", BM, BN, BK, WM, WN, WK, NUM_THREADS);

  const unsigned int K = param.c * param.r * param.s;
-  const uint PQ = param.Oh * param.Ow;
+//   const uint PQ = param.Oh * param.Ow;
  const uint inChannelOffset = param.c * param.w;
  const uint weightKOffset = param.c * param.r * param.s;

@ -1153,7 +1153,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
    }
  }

-  
+
+
    // reuse smem
    half *smemoutput = shmem;
    const uint lane_id = threadIdx.x % WARPSIZE;  
@ -1212,21 +1213,22 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
                //     param.interm[outOffset] = smemoutput[output_lds_addr + subk * 32];
                    const uint outOffset = n * param.k * param.Oh * param.Ow + row * param.Oh * param.Ow + col;
                    output[outOffset] = smemoutput[output_lds_addr + subk + j*32*BN/2];
-                    if(outOffset == 32){
-                        printf("(%u, %u, %u, %u), output[%d,%d,%d]=%f \n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
-                             n, row, col, __half2float(output[outOffset]));
-                    }
+                    // if(outOffset == 32){
+                    //     printf("(%u, %u, %u, %u), output[%d,%d,%d]=%f \n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
+                    //          n, row, col, __half2float(output[outOffset]));
+                    // }
                }
            }
        }
    }
-// #else
-//     GGML_UNUSED(input);
-//     GGML_UNUSED(kernel);
-//     GGML_UNUSED(output);
-//     GGML_UNUSED(param);
-//     NO_DEVICE_CODE;
-// #endif
+
+#else
+    GGML_UNUSED(input);
+    GGML_UNUSED(kernel);
+    GGML_UNUSED(output);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }


@ -1289,7 +1291,8 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,

 static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) {
    if (GGML_CUDA_CC_IS_NVIDIA(cc) && ampere_mma_available(cc) && P.layout == 0 && P.c % 8 == 0) {
-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+// #if __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
+        // printf("tensor core path called\n");
        constexpr unsigned int BM_dim = 256;
        constexpr unsigned int BN_dim = 256;
        constexpr unsigned int BK_dim = 32;
--- a/ggml/src/ggml-cuda/conv2d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh
@ -26,7 +26,7 @@ typedef struct{
    uint3 OHOW_fastdiv;
 } param_t;

-// #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+
 // same as above, but writes are swizzled to avoid bank conflicts when shared memory is read later in the kernel
 template<unsigned int TILE_ROWS,
 unsigned int NUM_THREADS>
@ -37,6 +37,7 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
    param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_TURING
    // constexpr unsigned int SWIZZLE_MASK = 0b111 << SWIZZLE_BITS;

    // // reinterpret input/output as float4
@ -117,6 +118,13 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
        }
        thread_row += ROW_STEP;
    }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(src_stride);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif
 }


@ -131,6 +139,7 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
    param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_TURING
    constexpr unsigned int SWIZZLE_MASK_1 = 0b10000;
    constexpr unsigned int SWIZZLE_BITS_1 = 4;
    constexpr unsigned int SWIZZLE_MASK_2 = 0b1100;
@ -186,6 +195,13 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
        }
        thread_row += ROW_STEP;
    }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst);
+    GGML_UNUSED(inChannelOffset);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif    
 }

 template<unsigned int TILE_ROWS,
@ -201,6 +217,7 @@ __device__ __forceinline__ void tileMemcpyLoadA(
    param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_TURING
    // reinterpret input/output as float4
    // const float4* src_float4 = reinterpret_cast<const float4*>(src);
    // const unsigned int src_stride_vectorized = src_stride / 8;
@ -251,6 +268,14 @@ __device__ __forceinline__ void tileMemcpyLoadA(
        }
        thread_row += ROW_STEP;
    }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst_reg);
+    GGML_UNUSED(block_k);
+    GGML_UNUSED(inChannelOffset);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif    
 }


@ -266,6 +291,7 @@ __device__ __forceinline__ void tileMemcpyLoadB(
    param_t param
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_TURING
    // reinterpret input/output as float4
    // const float4* src_float4 = reinterpret_cast<const float4*>(src);
    // const unsigned int src_stride_vectorized = src_stride / 8;
@ -305,91 +331,18 @@ __device__ __forceinline__ void tileMemcpyLoadB(
        }
        thread_row += ROW_STEP;
    }
+#else
+    GGML_UNUSED(src);
+    GGML_UNUSED(dst_reg);
+    GGML_UNUSED(block_k);
+    GGML_UNUSED(src_stride);
+    GGML_UNUSED(param);
+    NO_DEVICE_CODE;
+#endif    
 }

-// template<unsigned int TILE_ROWS,
-// unsigned int TILE_COLS,
-// unsigned int NUM_THREADS,
-// unsigned int SWIZZLE_BITS,
-// unsigned int ELEMENTS_PER_THREAD>
-// __device__ __forceinline__ void tileMemcpySwizzleStoreB(
-//     float4 src_reg[ELEMENTS_PER_THREAD],
-//     half* dst
-// )
-// {
-//     constexpr unsigned int SWIZZLE_MASK = 0b111 << SWIZZLE_BITS;
-
-//     // reinterpret input/output as float4
-//     float4* dst_float4 = reinterpret_cast<float4*>(dst);
-
-//     // # of threads is multiple of # of columns in the tile
-//     constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;
-//     static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
-    
-//     // flatten out 2d grid of threads into in order of increasing threadIdx.x
-//     const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
-
-//     // assign each thread a row/column in the tile, calculate how many iterations we need
-//     // to cover the whole tile
-//     constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
-//     constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-//     unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-//     const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
-    
-//     // compile time check that we provided the right amount of registers for storage
-//     static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
-    
-//     #pragma unroll
-//     for (unsigned int i = 0; i < NUM_ITERS; i++)
-//     {
-//         // apply swizzle to the dst index
-//         unsigned int dst_index = thread_row * TILE_COLS_VECTORIZED + thread_col;
-//         dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK) >> SWIZZLE_BITS);
-//         dst_float4[dst_index] = src_reg[i];
-//         thread_row += ROW_STEP;
-//     }
-// }

 // same as above but without the swizzle
-template<unsigned int TILE_ROWS,
-unsigned int TILE_COLS,
-unsigned int NUM_THREADS,
-unsigned int ELEMENTS_PER_THREAD>
-__device__ __forceinline__ void tileMemcpyStore(
-    float4 src_reg[ELEMENTS_PER_THREAD],
-    half* dst,
-    unsigned int dst_stride_float4
-)
-{
-    // reinterpret input/output as float4
-    float4* dst_float4 = reinterpret_cast<float4*>(dst);
-
-    // # of threads is multiple of # of columns in the tile
-    constexpr unsigned int TILE_COLS_VECTORIZED = TILE_COLS / 8;
-    static_assert(NUM_THREADS % TILE_COLS_VECTORIZED == 0);
-    
-    // flatten out 2d grid of threads into in order of increasing threadIdx.x
-    const unsigned int thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
-
-    // assign each thread a row/column in the tile, calculate how many iterations we need
-    // to cover the whole tile
-    constexpr unsigned int ROW_STEP = NUM_THREADS / TILE_COLS_VECTORIZED;
-    constexpr unsigned int NUM_ITERS = TILE_ROWS / ROW_STEP;
-    unsigned int thread_row = thread_idx / TILE_COLS_VECTORIZED;
-    const unsigned int thread_col = thread_idx % TILE_COLS_VECTORIZED;
-    
-    // compile time check that we provided the right amount of registers for storage
-    static_assert(ELEMENTS_PER_THREAD == NUM_ITERS);
-    
-    #pragma unroll
-    for (unsigned int i = 0; i < NUM_ITERS; i++)
-    {
-        // apply swizzle to the dst index
-        unsigned int dst_index = thread_row * dst_stride_float4 + thread_col;
-        dst_float4[dst_index] = src_reg[i];
-        thread_row += ROW_STEP;
-    }
-}

 // this is a special case of the above for when TILE_COLS == 32
 template<unsigned int TILE_ROWS,
@ -400,6 +353,7 @@ __device__ __forceinline__ void tileMemcpySwizzleStore(
    half* dst
 )
 {
+#if __CUDA_ARCH__ >= GGML_CUDA_TURING
    constexpr unsigned int SWIZZLE_MASK_1 = 0b10000;
    constexpr unsigned int SWIZZLE_BITS_1 = 4;
    constexpr unsigned int SWIZZLE_MASK_2 = 0b1100;
@ -436,6 +390,11 @@ __device__ __forceinline__ void tileMemcpySwizzleStore(
        dst_float4[dst_index] =  src_reg[i];
        thread_row += ROW_STEP;
    }
+#else
+    GGML_UNUSED(src_reg);
+    GGML_UNUSED(dst);    
+    NO_DEVICE_CODE;
+#endif    
 }

 __device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
@ -450,8 +409,6 @@ __device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
    return address;
 }

-// #endif
-
 // constexpr unsigned int int_log2(unsigned int x)
 // {
 //     unsigned int result = 0;
--- a/tests/test-conv2d-implicit.cpp
+++ b/tests/test-conv2d-implicit.cpp
@ -339,20 +339,20 @@ int main(void)
 {
    ggml_time_init();
    std::vector<std::tuple<int, int, int, int>> configs = {
-        // std::make_tuple(64,64,48,64),
-        // std::make_tuple(320,320,104,152),
-        // std::make_tuple(640,640,52,76),
-        // std::make_tuple(640,640,104,152),
-        // std::make_tuple(960,320,104,152),
-        std::make_tuple(128,1280,26,38),
-        // std::make_tuple(1280,640,52,76),
-        // std::make_tuple(1920,1280,26,38),
-        // std::make_tuple(2560,1280,26,38),
-        // std::make_tuple(512,512,104,152),
-        // std::make_tuple(512,512,208,304),
-        // std::make_tuple(512,256,416,608),
-        // std::make_tuple(256,128,832,1216),
-        // std::make_tuple(256,256,832,1216),
+        std::make_tuple(64,64,48,64),
+        std::make_tuple(320,320,104,152),
+        std::make_tuple(640,640,52,76),
+        std::make_tuple(640,640,104,152),
+        std::make_tuple(960,320,104,152),
+        std::make_tuple(1280,1280,26,38),
+        std::make_tuple(1280,640,52,76),
+        std::make_tuple(1920,1280,26,38),
+        std::make_tuple(2560,1280,26,38),
+        std::make_tuple(512,512,104,152),
+        std::make_tuple(512,512,208,304),
+        std::make_tuple(512,256,416,608),
+        std::make_tuple(256,128,832,1216),
+        std::make_tuple(256,256,832,1216),
        // std::make_tuple(320,256,1024,1920)
    };

@ -375,7 +375,7 @@ int main(void)
       

        struct ggml_cgraph * gf_res_0 = NULL;    
-        int iterations = 0;
+        int iterations = 20;

        double run_time0;
        std::vector<float> conv2d_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);
@ -437,15 +437,15 @@ int main(void)


        // for(int i = 0; i < ggml_nelements(wino_res); i++) {
-        for(int i = 0; i < 26*38; i++) {
-            float diff = fabs(conv2d_data[i] - wino_data[i]);
-            // if(diff > 1.e-4) {
-                  printf("(%f, %f, %f, %d) \n",
-                  conv2d_data[i],
-                  wino_data[i], diff, i);
-                // break;
-            // }
-        }
+        // for(int i = 0; i < 26*38; i++) {
+        //     float diff = fabs(conv2d_data[i] - wino_data[i]);
+        //     // if(diff > 1.e-4) {
+        //           printf("(%f, %f, %f, %d) \n",
+        //           conv2d_data[i],
+        //           wino_data[i], diff, i);
+        //         // break;
+        //     // }
+        // }

        ggml_free(model.ctx);
        ggml_backend_buffer_free(model.buffer);