broken for some test cases

parent 64ead3fd4f
commit 9cbc099493
@@ -779,6 +779,33 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,
                 WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
 }
 
+template<const int BM, const int BN, const int BK,
+        const int WM, const int WN, const int WK, const int ksplit,
+        const unsigned int ThreadsM, const unsigned int ThreadsN,
+        const int NUM_THREADS>
+static void launch_conv2d_implicit_split_kernel(ggml_backend_cuda_context & ctx, const half *X_H, const half *K_H, float *Y_D,
+        const unsigned int BlocksM, const unsigned int BlocksN,
+        const unsigned int shmem_bytes,
+        const param_t P, cudaStream_t st){
+
+    int id = ggml_cuda_get_device();
+
+    ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), ksplit * P.k * P.Oh * P.Ow * P.n);
+    cudaFuncSetAttribute(conv2d_implicit_kernel<BM, BN, BK, WM, WN, WK, ksplit, NUM_THREADS>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
+    dim3 gridDim(BlocksN, BlocksM, ksplit);
+    dim3 blockDim(ThreadsN, ThreadsM);
+
+    conv2d_implicit_kernel<BM, BN, BK,
+        WM, WN, WK, ksplit, NUM_THREADS><<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_H, Y_H.get(), P);
+
+    const unsigned int nrows = P.n * P.k * P.Oh * P.Ow;
+    const unsigned int blockx = (nrows + 511) / 512;
+    const dim3 block_nums(blockx, 1, 1);
+    const dim3 block_dims(512, 1, 1);
+    reduce_f32<half, float><<<block_nums, block_dims, 0, st>>>(Y_H.get(), Y_D, nrows, ksplit);
+}
+
 static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D, float * Y_D, int cc, const param_t P, cudaStream_t st) {
 
     if (GGML_CUDA_CC_IS_NVIDIA(cc) && turing_mma_available(cc) && P.c % 8 == 0 && (P.r > 1 || P.s > 1)) {
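Note (not part of the diff): the helper above implements a split-K scheme. gridDim.z = ksplit slices the reduction dimension, each z-slice writes a partial half-precision result into Y_H, and reduce_f32 then folds the ksplit partial planes into the float output Y_D. reduce_f32 itself is not shown in this commit; below is a minimal sketch of such a reduction, assuming a [ksplit][nrows] layout for the partials (the real kernel in this tree may use a different layout or signature), with one thread per output element, matching the 512-thread launch above.

#include <cstddef>
#include <cuda_fp16.h>

// Sketch only: sums `ksplit` partial planes of length `nrows` into dst.
// Assumes src is laid out as [ksplit][nrows].
template <typename src_t, typename dst_t>
static __global__ void reduce_partials(const src_t * src, dst_t * dst,
                                       const unsigned int nrows, const int ksplit) {
    const unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= nrows) {
        return;
    }
    float sum = 0.0f;
    for (int s = 0; s < ksplit; ++s) {
        sum += (float) src[(size_t) s * nrows + row]; // one partial per gridDim.z slice
    }
    dst[row] = (dst_t) sum;
}

Launched as reduce_partials<half, float><<<(nrows + 511) / 512, 512, 0, st>>>(Y_H, Y_D, nrows, ksplit), this mirrors the block_nums/block_dims arithmetic in the helper.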
@@ -829,39 +856,67 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
         const unsigned int shmem_bytes = (BM_dim * BK_dim + BK_dim * BN_dim) * 2 * sizeof(half);
 
         const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
-        const unsigned int ksplit = 8;
-        if (BlocksM * BlocksN < nsm && P.c >= 8 * ksplit && (P.c * P.r * P.s) % (8*ksplit) == 0) {
-            ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), ksplit * P.k * P.Oh * P.Ow * P.n);
-
-            cudaFuncSetAttribute(conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, ksplit, NumThreads>,
-                cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
-            dim3 gridDim(BlocksN, BlocksM, ksplit);
-            dim3 blockDim(ThreadsN, ThreadsM);
-
-            conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim,
-                WM_dim, WN_dim, WK_dim, ksplit, NumThreads>
-                <<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_H, Y_H.get(), P);
-
-            const unsigned int nrows = P.n * P.k * P.Oh * P.Ow;
-            const unsigned int blockx = (nrows + 511) / 512;
-            const dim3 block_nums(blockx, 1, 1);
-            const dim3 block_dims(512, 1, 1);
-            reduce_f32<half, float><<<block_nums, block_dims, 0, st>>>(Y_H.get(), Y_D, nrows, ksplit);
-
-        } else {
-            ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), P.k * P.Oh * P.Ow * P.n);
-
-            cudaFuncSetAttribute(conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, 0, NumThreads>,
-                cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
-            dim3 gridDim(BlocksN, BlocksM);
-            dim3 blockDim(ThreadsN, ThreadsM);
-
-            conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim,
-                WM_dim, WN_dim, WK_dim, 0, NumThreads>
-                <<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_H, Y_H.get(), P);
-            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-            to_fp32_cuda(Y_H.get(), Y_D, P.k * P.Oh * P.Ow * P.n, st);
-        }
+        // const unsigned int ksplit = 6;
+        // if (BlocksM * BlocksN < nsm && P.c >= 8 * ksplit && (P.c * P.r * P.s) % (8*ksplit) == 0) {
+        printf("split factor info = %d, %d, %d \n", BlocksM, BlocksN, nsm / (BlocksM * BlocksN));
+        if (BlocksM * BlocksN < nsm && nsm / (BlocksM * BlocksN) <= 8 ){
+
+            int ks = nsm / (BlocksM * BlocksN);
+            printf("split factor init = %d \n", ks);
+            int j;
+            bool can_split = false;
+            for (j = ks; j >= 2; j--){
+                if ((P.c * P.r * P.s) % (8*j) == 0){
+                    can_split = true;
+                    break;
+                }
+            }
+            if(can_split){
+                printf("split factor = %d \n", j);
+                if (j == 2) {
+                    const unsigned int ksplit = 2;
+                    launch_conv2d_implicit_split_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, ksplit,
+                        ThreadsM, ThreadsN, NumThreads>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 3) {
+                    const unsigned int ksplit = 3;
+                    launch_conv2d_implicit_split_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, ksplit,
+                        ThreadsM, ThreadsN, NumThreads>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 4) {
+                    const unsigned int ksplit = 4;
+                    launch_conv2d_implicit_split_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, ksplit,
+                        ThreadsM, ThreadsN, NumThreads>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 5) {
+                    const unsigned int ksplit = 5;
+                    launch_conv2d_implicit_split_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, ksplit,
+                        ThreadsM, ThreadsN, NumThreads>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 6) {
+                    const unsigned int ksplit = 6;
+                    launch_conv2d_implicit_split_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, ksplit,
+                        ThreadsM, ThreadsN, NumThreads>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 7) {
+                    const unsigned int ksplit = 7;
+                    launch_conv2d_implicit_split_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, ksplit,
+                        ThreadsM, ThreadsN, NumThreads>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 8) {
+                    const unsigned int ksplit = 8;
+                    launch_conv2d_implicit_split_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, ksplit,
+                        ThreadsM, ThreadsN, NumThreads>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                }
+                return;
+            }
         }
+        ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), P.k * P.Oh * P.Ow * P.n);
+
+        cudaFuncSetAttribute(conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, 0, NumThreads>,
+            cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
+        dim3 gridDim(BlocksN, BlocksM);
+        dim3 blockDim(ThreadsN, ThreadsM);
+
+        conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim,
+            WM_dim, WN_dim, WK_dim, 0, NumThreads>
+            <<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_H, Y_H.get(), P);
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(Y_H.get(), Y_D, P.k * P.Oh * P.Ow * P.n, st);
     } else{
         conv2d_implicit_cuda<half, 1>(X_D, K_D, Y_D, P, st);
     }
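Note (not part of the diff): the j == 2 … j == 8 ladder exists because ksplit is a template parameter of conv2d_implicit_kernel and must be a compile-time constant, so every runtime value needs its own instantiation. The host-side selection itself is plain integer arithmetic; here is a standalone sketch (hypothetical names) that mirrors the conditions in this hunk.

#include <cstdio>

// Sketch: mirrors the split-factor search above; returns 0 when the
// unsplit fallback path should be taken.
static int pick_split_factor(int nsm, int blocks, int c, int r, int s) {
    if (blocks >= nsm || nsm / blocks > 8) {
        return 0; // grid already fills the SMs, or the split would exceed 8
    }
    for (int j = nsm / blocks; j >= 2; --j) {
        if ((c * r * s) % (8 * j) == 0) {
            return j; // largest j such that 8*j divides the reduction length
        }
    }
    return 0;
}

int main() {
    // e.g. 68 SMs and a 12-block grid: ks = 68 / 12 = 5, and
    // 640 * 3 * 3 = 5760 is divisible by 8 * 5 = 40, so j = 5 is picked.
    std::printf("split factor = %d\n", pick_split_factor(68, 12, 640, 3, 3));
    return 0;
}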
@@ -293,42 +293,38 @@ std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr
 }
 
-int main(void)
-{
-    ggml_time_init();
-
-    double time_iter0 = 0.0, time_iter1 = 0.0;
-    std::vector<std::tuple<int, int, int, int, int, int>> configs = {
-    std::make_tuple(64,64,48,64,3,3),
-    std::make_tuple(320,320,104,152,3,3),
-    std::make_tuple(640,640,52,76,3,3),
-    std::make_tuple(640,640,104,152,3,3),
-    std::make_tuple(960,320,104,152,3,3),
-    std::make_tuple(1280,1280,26,38,3,3),
-    std::make_tuple(4,320,96,128,3,3),
-    std::make_tuple(320,4,96,128,3,3),
-    std::make_tuple(4,320,64,96,3,3),
-    std::make_tuple(320,4,64,96,3,3),
-    std::make_tuple(640,640,96,128,3,3),
-    std::make_tuple(1280,1280,26,38,1,1),
-    std::make_tuple(256,128,768,1024,3,3),
-    std::make_tuple(128,3,768,1024,3,3),
-    std::make_tuple(256,128,768,1024,1,1),
-    std::make_tuple(512,256,384,512,1,1),
-    std::make_tuple(1280,640,52,76,3,3),
-    std::make_tuple(1920,1280,26,38,3,3),
-    std::make_tuple(2560,1280,26,38,3,3),
-    std::make_tuple(320,1280,26,38,3,3),
-    std::make_tuple(512,512,104,152,3,3),
-    std::make_tuple(512,512,208,304,3,3),
-    std::make_tuple(512,256,416,608,3,3),
-    std::make_tuple(256,128,832,1216,3,3),
-    std::make_tuple(256,256,832,1216,3,3),
-    std::make_tuple(32,64,58,58,3,3)
+static std::vector<std::tuple<int, int, int, int, int, int>> configs = {
+    // std::make_tuple(64,64,48,64,3,3),
+    // std::make_tuple(320,320,104,152,3,3),
+    // std::make_tuple(640,640,52,76,3,3),
+    // std::make_tuple(640,640,104,152,3,3),
+    // std::make_tuple(960,320,104,152,3,3),
+    // std::make_tuple(1280,1280,26,38,3,3),
+    std::make_tuple(1920,640,32,32,3,3)
+    // std::make_tuple(4,320,96,128,3,3),
+    // std::make_tuple(320,4,96,128,3,3),
+    // std::make_tuple(4,320,64,96,3,3),
+    // std::make_tuple(320,4,64,96,3,3),
+    // std::make_tuple(640,640,96,128,3,3),
+    // std::make_tuple(1280,1280,26,38,1,1),
+    // std::make_tuple(256,128,768,1024,3,3),
+    // std::make_tuple(128,3,768,1024,3,3),
+    // std::make_tuple(256,128,768,1024,1,1),
+    // std::make_tuple(512,256,384,512,1,1),
+    // std::make_tuple(1280,640,52,76,3,3),
+    // std::make_tuple(1920,1280,26,38,3,3),
+    // std::make_tuple(2560,1280,26,38,3,3),
+    // std::make_tuple(320,1280,26,38,3,3),
+    // std::make_tuple(512,512,104,152,3,3),
+    // std::make_tuple(512,512,208,304,3,3),
+    // std::make_tuple(512,256,416,608,3,3),
+    // std::make_tuple(256,128,832,1216,3,3),
+    // std::make_tuple(256,256,832,1216,3,3),
+    // std::make_tuple(32,64,58,58,3,3)
     // std::make_tuple(320,256,1024,1920)
 };
-std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_512 = {
+
+static std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_512 = {
     //512x512
     std::make_tuple(4,320,64,64,3,3),
     std::make_tuple(320,320,64,64,3,3),
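Note (not part of the diff): only std::make_tuple(1920,640,32,32,3,3) is left enabled, presumably one of the shapes the commit title refers to. If the tuple fields map to (Cin, Cout, W, H, KW, KH) — an assumption, load_model is not shown in this diff — the split path's staging cost for this shape follows directly from the pool_alloc in the new helper, as this sketch works out.

#include <cstdio>

// Sketch: size of the Y_H staging buffer allocated by the split path,
// ksplit * k * Oh * Ow * n half-precision values.
// (Cout = 640, 32x32 same-padded output, batch 1 are all assumptions.)
static size_t staging_bytes(int ksplit, int k, int Oh, int Ow, int n) {
    return (size_t) ksplit * k * Oh * Ow * n * 2; // sizeof(half) == 2
}

int main() {
    // ksplit = 8, 640 output channels, 32x32 output, batch 1 -> 10.0 MiB
    std::printf("%.1f MiB\n", staging_bytes(8, 640, 32, 32, 1) / (1024.0 * 1024.0));
    return 0;
}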
@@ -434,7 +430,7 @@ int main(void)
     std::make_tuple(320,4,64,64,3,3)
 };
 
-std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_768 = {
+static std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_768 = {
     //768x768
     std::make_tuple(4,320,96,96,3,3),
     std::make_tuple(320,320,96,96,3,3),
@@ -540,7 +536,7 @@ int main(void)
     std::make_tuple(320,4,96,96,3,3),
 };
 
-std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_1024 = {
+static std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_1024 = {
     //1024x1024
     std::make_tuple(4,320,128,128,3,3),
     std::make_tuple(320,320,128,128,3,3),
@@ -646,10 +642,17 @@ int main(void)
     std::make_tuple(320,4,128,128,3,3)
 };
 
+
+int main(void)
+{
+    ggml_time_init();
+
+    double time_iter0 = 0.0, time_iter1 = 0.0;
+
     int k = 0;
 
-    for (auto c : configs_sdxl_512){
-    // for (auto c : configs){
+    // for (auto c : configs_sdxl_512){
+    for (auto c : configs){
         test_model model;
         load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c),
                    std::get<3>(c), std::get<4>(c), std::get<5>(c), true);