diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index d21e13d5ea..902220b74f 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -914,7 +914,9 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, int s = 0; int r = 0; while (block_k < num_block_tiles_k){ + #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE asm volatile("cp.async.wait_group %0;\n" ::"n"(0)); + #endif __syncthreads(); // moves to the next tile