diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index 602bc37a0e..80a406e2c5 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -3,6 +3,7 @@
 #include "ggml.h"
 #include "common.cuh"
 #include "convert.cuh"
+#include "cp-async.cuh"
 
 #include "conv2d-implicit.cuh"
 
@@ -365,7 +366,7 @@ __device__ __forceinline__ void ldmatrix_a(
     unsigned int logical_offset = (threadIdx.x % 32) * smem_stride;
     unsigned int swizzled_offset = logical_offset ^ ((logical_offset & 0b10000000) >> 4);
     swizzled_offset = swizzled_offset ^ ((swizzled_offset & 0b1100000) >> 2);
-    uint32_t src_addr = cvta_to_shared_u32(src + swizzled_offset);
+    uint32_t src_addr = ggml_cuda_cvta_generic_to_shared(src + swizzled_offset);
     constexpr unsigned int smem_stride_ = smem_stride * sizeof(half); // convert stride to bytes
 
     // 0
@@ -633,7 +634,7 @@ __device__ __forceinline__ void ldmatrix_b(
     unsigned int logical_offset = (threadIdx.x % 32) * smem_stride;
     unsigned int swizzled_offset = logical_offset ^ ((logical_offset & 0b10000000) >> 4);
     swizzled_offset = swizzled_offset ^ ((swizzled_offset & 0b1100000) >> 2);
-    uint32_t src_addr = cvta_to_shared_u32(src + swizzled_offset);
+    uint32_t src_addr = ggml_cuda_cvta_generic_to_shared(src + swizzled_offset);
     constexpr unsigned int smem_stride_ = smem_stride * sizeof(half); // convert stride to bytes
 
     // 0
diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cuh b/ggml/src/ggml-cuda/conv2d-implicit.cuh
index 6df8478a47..aeaa158d72 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh
@@ -592,18 +592,6 @@ __device__ __forceinline__ void tileMemcpySwizzleStore(
 #endif
 }
 
-__device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
-    uint32_t address;
-    asm("{\n\t"
-        " .reg .u64 u64addr;\n\t"
-        " cvta.to.shared.u64 u64addr, %1;\n\t"
-        " cvt.u32.u64 %0, u64addr;\n\t"
-        "}"
-        : "=r"(address)
-        : "l"(pointer));
-    return address;
-}
-
 template
 __device__ __forceinline__ void loadFilter(const T * __restrict__ kernel,
diff --git a/ggml/src/ggml-cuda/cp-async.cuh b/ggml/src/ggml-cuda/cp-async.cuh
index 63d0c482ff..91011234b2 100644
--- a/ggml/src/ggml-cuda/cp-async.cuh
+++ b/ggml/src/ggml-cuda/cp-async.cuh
@@ -3,7 +3,7 @@
 
 #include "common.cuh"
 
-static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
+static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(const void * generic_ptr) {
 #ifdef CP_ASYNC_AVAILABLE
     return __cvta_generic_to_shared(generic_ptr);
 #else
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 7664e9aec9..c254461215 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -40,6 +40,7 @@
 #include
 #include
 #include
+#include
 
 #ifdef __EMSCRIPTEN__
 # define N_THREADS 1
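For context: the patch drops the local inline-PTX helper `cvta_to_shared_u32` from conv2d-implicit.cuh and routes both `ldmatrix_a` and `ldmatrix_b` through the existing `ggml_cuda_cvta_generic_to_shared()` wrapper in cp-async.cuh, whose parameter is constified so it accepts the `const half *` sources. The sketch below is a minimal standalone illustration (not part of the patch) of why the two routes are interchangeable: both produce the 32-bit shared-state-space address that `ldmatrix`/`cp.async` expect. The PTX body is copied from the removed helper; `cvta_via_ptx`, `cvta_via_intrinsic`, and `demo_kernel` are hypothetical names, and the intrinsic path assumes a target where `__cvta_generic_to_shared` is available (the wrapper guards this with `CP_ASYNC_AVAILABLE`).

```cuda
#include <cstdint>
#include <cuda_fp16.h>

// Removed helper: generic -> shared conversion via explicit PTX,
// truncating the 64-bit shared address to 32 bits.
__device__ __forceinline__ uint32_t cvta_via_ptx(const void * ptr) {
    uint32_t address;
    asm("{\n\t"
        " .reg .u64 u64addr;\n\t"
        " cvta.to.shared.u64 u64addr, %1;\n\t"
        " cvt.u32.u64 %0, u64addr;\n\t"
        "}"
        : "=r"(address)
        : "l"(ptr));
    return address;
}

// Path used by ggml_cuda_cvta_generic_to_shared() when cp.async is available:
// the compiler intrinsic performs the same conversion; narrowing to 32 bits is
// safe because shared-memory addresses fit in the low 32 bits.
__device__ __forceinline__ uint32_t cvta_via_intrinsic(const void * ptr) {
    return static_cast<uint32_t>(__cvta_generic_to_shared(ptr));
}

__global__ void demo_kernel() {
    __shared__ half tile[128];
    // Both conversions yield the same shared-state-space address for the
    // per-thread pointer handed to ldmatrix/cp.async.
    uint32_t a = cvta_via_ptx(tile + threadIdx.x);
    uint32_t b = cvta_via_intrinsic(tile + threadIdx.x);
    (void) a;
    (void) b;
}
```

Centralizing the conversion in cp-async.cuh avoids keeping a second hand-written PTX copy in the implicit-conv kernel, and the `const void *` signature change lets the wrapper be called on read-only shared-memory tiles without a cast.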