use existing cvta_generic_to_shared
This commit is contained in:
parent
1c139e33df
commit
0c571feee1
|
|
@ -3,6 +3,7 @@
|
|||
#include "ggml.h"
|
||||
#include "common.cuh"
|
||||
#include "convert.cuh"
|
||||
#include "cp-async.cuh"
|
||||
#include "conv2d-implicit.cuh"
|
||||
|
||||
|
||||
|
|
@ -365,7 +366,7 @@ __device__ __forceinline__ void ldmatrix_a(
|
|||
unsigned int logical_offset = (threadIdx.x % 32) * smem_stride;
|
||||
unsigned int swizzled_offset = logical_offset ^ ((logical_offset & 0b10000000) >> 4);
|
||||
swizzled_offset = swizzled_offset ^ ((swizzled_offset & 0b1100000) >> 2);
|
||||
uint32_t src_addr = cvta_to_shared_u32(src + swizzled_offset);
|
||||
uint32_t src_addr = ggml_cuda_cvta_generic_to_shared(src + swizzled_offset);
|
||||
constexpr unsigned int smem_stride_ = smem_stride * sizeof(half); // convert stride to bytes
|
||||
|
||||
// 0
|
||||
|
|
@ -633,7 +634,7 @@ __device__ __forceinline__ void ldmatrix_b(
|
|||
unsigned int logical_offset = (threadIdx.x % 32) * smem_stride;
|
||||
unsigned int swizzled_offset = logical_offset ^ ((logical_offset & 0b10000000) >> 4);
|
||||
swizzled_offset = swizzled_offset ^ ((swizzled_offset & 0b1100000) >> 2);
|
||||
uint32_t src_addr = cvta_to_shared_u32(src + swizzled_offset);
|
||||
uint32_t src_addr = ggml_cuda_cvta_generic_to_shared(src + swizzled_offset);
|
||||
constexpr unsigned int smem_stride_ = smem_stride * sizeof(half); // convert stride to bytes
|
||||
|
||||
// 0
|
||||
|
|
|
|||
|
|
@ -592,18 +592,6 @@ __device__ __forceinline__ void tileMemcpySwizzleStore(
|
|||
#endif
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint32_t cvta_to_shared_u32(const void *pointer) {
|
||||
uint32_t address;
|
||||
asm("{\n\t"
|
||||
" .reg .u64 u64addr;\n\t"
|
||||
" cvta.to.shared.u64 u64addr, %1;\n\t"
|
||||
" cvt.u32.u64 %0, u64addr;\n\t"
|
||||
"}"
|
||||
: "=r"(address)
|
||||
: "l"(pointer));
|
||||
return address;
|
||||
}
|
||||
|
||||
template<typename T, const int BN, const int rowStrideA, const int layout,
|
||||
const bool vec_load, const int ksplit, const int PAD>
|
||||
__device__ __forceinline__ void loadFilter(const T * __restrict__ kernel,
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
#include "common.cuh"
|
||||
|
||||
|
||||
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
|
||||
static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(const void * generic_ptr) {
|
||||
#ifdef CP_ASYNC_AVAILABLE
|
||||
return __cvta_generic_to_shared(generic_ptr);
|
||||
#else
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@
|
|||
#include <thread>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
|
||||
#ifdef __EMSCRIPTEN__
|
||||
# define N_THREADS 1
|
||||
|
|
|
|||
Loading…
Reference in New Issue