use existing cvta_generic_to_shared

This commit is contained in:
bssrdf 2026-01-27 10:02:15 -05:00
parent 1c139e33df
commit 0c571feee1
4 changed files with 5 additions and 15 deletions

View File

@ -3,6 +3,7 @@
#include "ggml.h"
#include "common.cuh"
#include "convert.cuh"
#include "cp-async.cuh"
#include "conv2d-implicit.cuh"
@ -365,7 +366,7 @@ __device__ __forceinline__ void ldmatrix_a(
unsigned int logical_offset = (threadIdx.x % 32) * smem_stride;
unsigned int swizzled_offset = logical_offset ^ ((logical_offset & 0b10000000) >> 4);
swizzled_offset = swizzled_offset ^ ((swizzled_offset & 0b1100000) >> 2);
uint32_t src_addr = cvta_to_shared_u32(src + swizzled_offset);
uint32_t src_addr = ggml_cuda_cvta_generic_to_shared(src + swizzled_offset);
constexpr unsigned int smem_stride_ = smem_stride * sizeof(half); // convert stride to bytes
// 0
@ -633,7 +634,7 @@ __device__ __forceinline__ void ldmatrix_b(
unsigned int logical_offset = (threadIdx.x % 32) * smem_stride;
unsigned int swizzled_offset = logical_offset ^ ((logical_offset & 0b10000000) >> 4);
swizzled_offset = swizzled_offset ^ ((swizzled_offset & 0b1100000) >> 2);
uint32_t src_addr = cvta_to_shared_u32(src + swizzled_offset);
uint32_t src_addr = ggml_cuda_cvta_generic_to_shared(src + swizzled_offset);
constexpr unsigned int smem_stride_ = smem_stride * sizeof(half); // convert stride to bytes
// 0

View File

@ -592,18 +592,6 @@ __device__ __forceinline__ void tileMemcpySwizzleStore(
#endif
}
__device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
uint32_t address;
asm("{\n\t"
" .reg .u64 u64addr;\n\t"
" cvta.to.shared.u64 u64addr, %1;\n\t"
" cvt.u32.u64 %0, u64addr;\n\t"
"}"
: "=r"(address)
: "l"(pointer));
return address;
}
template<typename T, const int BN, const int rowStrideA, const int layout,
const bool vec_load, const int ksplit, const int PAD>
__device__ __forceinline__ void loadFilter(const T * __restrict__ kernel,

View File

@ -3,7 +3,7 @@
#include "common.cuh"
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(const void * generic_ptr) {
#ifdef CP_ASYNC_AVAILABLE
return __cvta_generic_to_shared(generic_ptr);
#else

View File

@ -40,6 +40,7 @@
#include <thread>
#include <vector>
#include <unordered_map>
#include <map>
#ifdef __EMSCRIPTEN__
# define N_THREADS 1