Added a specialization of the CUDA copy op for transposed tensors
This commit is contained in:
parent
30990788e8
commit
cc327f5224
|
|
@ -37,6 +37,48 @@ static __global__ void cpy_flt(const char * cx, char * cdst_direct, const int ne
|
|||
cpy_1(cx + x_offset, cdst + dst_offset);
|
||||
}
|
||||
|
||||
|
||||
// Tiled transpose copy for contiguous same-type tensors.
// Treats the buffer as nmat = ne/(ne00*ne01) row-major ne01 x ne00 matrices and
// writes each one transposed (ne00 x ne01) to dst.
// Launch: grid = (ceil(ne00/TILE_DIM), ceil(ne01/TILE_DIM), ceil(nmat/BLOCK_NM)),
//         block = (TILE_DIM, BLOCK_ROWS, 1).
// The nb*/ne1* arguments are kept for signature parity with cpy_flt; only
// ne, ne00 and ne01 are used here.
template <typename T>
static __global__ void cpy_flt_transpose(const char * cx, char * cdst_direct, const int ne,
                                         const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                         const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
                                         const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {

    // When captured into a CUDA graph the destination is resolved through the
    // indirection table so the graph can be updated without re-capture.
    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct;

    const T * src = reinterpret_cast<const T *>(cx);
    T       * dst = reinterpret_cast<T *>(cdst);

    const int64_t nmat = ne / (ne00 * ne01); // number of matrices to transpose
    const int64_t n    = ne00 * ne01;        // elements per matrix

    const int x  = blockIdx.x * TILE_DIM + threadIdx.x; // column in src
    const int y  = blockIdx.y * TILE_DIM + threadIdx.y; // row in src
    const int tx = blockIdx.y * TILE_DIM + threadIdx.x; // column in dst (transposed block offset)
    const int ty = blockIdx.x * TILE_DIM + threadIdx.y; // row in dst

    // +1 padding on the inner dimension avoids shared-memory bank conflicts
    // on the column-wise reads below.
    __shared__ T tile[TILE_DIM][TILE_DIM + 1];

    for (int i = 0; i < BLOCK_NM; ++i) {
        const int64_t imat = (int64_t) blockIdx.z * BLOCK_NM + i;
        if (imat >= nmat) {
            break; // uniform across the block: imat depends only on blockIdx.z and i
        }

        // Load one TILE_DIM x TILE_DIM tile, guarding the partial tiles at the
        // right/bottom edges of the matrix.
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
            if (x < ne00 && y + j < ne01) {
                tile[threadIdx.y + j][threadIdx.x] = src[imat*n + (int64_t)(y + j)*ne00 + x];
            }
        }
        __syncthreads();

        // Store the tile transposed; the destination matrix is ne01 wide.
        for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
            if (tx < ne01 && ty + j < ne00) {
                dst[imat*n + (int64_t)(ty + j)*ne01 + tx] = tile[threadIdx.x][threadIdx.y + j];
            }
        }
        // Barrier before the next iteration overwrites the tile.
        __syncthreads();
    }
}
|
||||
|
||||
static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
|
||||
float * cdstf = (float *)(cdsti);
|
||||
|
||||
|
|
@ -143,10 +185,25 @@ static void ggml_cpy_flt_cuda(
|
|||
const char * cx, char * cdst, const int ne,
|
||||
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
|
||||
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
|
||||
|
||||
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
|
||||
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
if constexpr (std::is_same_v<src_t, half> && std::is_same_v<dst_t, half> ||
|
||||
std::is_same_v<src_t, float> && std::is_same_v<dst_t, float>
|
||||
){
|
||||
if (ne00 == ne11 && ne01 = ne10 && nb00 == nb11 && nb10 == nb01){ //transpose
|
||||
dim3 dimGrid( (ne00 + TILE_DIM - 1) / TILE_DIM,
|
||||
(ne01 + TILE_DIM - 1) / TILE_DIM,
|
||||
(ne/(ne00*ne01) + BLOCK_NM - 1) / BLOCK_NM );
|
||||
dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
|
||||
cpy_flt_transpose<cpy_1_flt<dst_t><<<dimGrid, dimBlock, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
} else{ // other
|
||||
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
} else{
|
||||
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
|
||||
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cpy_f32_q8_0_cuda(
|
||||
|
|
|
|||
|
|
@ -2,6 +2,11 @@
|
|||
|
||||
// Threads per block for the generic element-wise copy kernel.
#define CUDA_CPY_BLOCK_SIZE 64

// Tiled-transpose kernel configuration: each block moves TILE_DIM x TILE_DIM
// tiles using TILE_DIM x BLOCK_ROWS threads, and handles up to BLOCK_NM
// matrices along the z grid dimension.
const int TILE_DIM = 32;
const int BLOCK_ROWS = 8;
const int BLOCK_NM = 8;

void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1, bool disable_indirection = false);

void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
|
|
|||
Loading…
Reference in New Issue