diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index 33f5ac23a7..1133626d14 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -779,6 +779,33 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,
                WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<gridDim, blockDim, 0, st>>>(X_D, K_D, Y_D, P);
     }
 
+template <unsigned int ksplit>
+static void launch_conv2d_implicit_split_kernel(ggml_backend_cuda_context & ctx, const half *X_H, const half *K_H, float *Y_D,
+                                                const unsigned int BlocksM, const unsigned int BlocksN,
+                                                const unsigned int shmem_bytes,
+                                                const param_t P, cudaStream_t st){
+
+    int id = ggml_cuda_get_device();
+
+    ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), ksplit * P.k * P.Oh * P.Ow * P.n);
+    cudaFuncSetAttribute(conv2d_implicit_kernel<...>,
+                         cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
+    dim3 gridDim(BlocksN, BlocksM, ksplit);
+    dim3 blockDim(ThreadsN, ThreadsM);
+
+    conv2d_implicit_kernel<...><<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_H, Y_H.get(), P);
+
+    const unsigned int nrows = P.n * P.k * P.Oh * P.Ow;
+    const unsigned int blockx = (nrows + 511) / 512;
+    const dim3 block_nums(blockx, 1, 1);
+    const dim3 block_dims(512, 1, 1);
+    reduce_f32<<<block_nums, block_dims, 0, st>>>(Y_H.get(), Y_D, nrows, ksplit);
+}
+
 static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const float * X_D, const half * K_D, float * Y_D,
                                      int cc, const param_t P, cudaStream_t st) {
     if (GGML_CUDA_CC_IS_NVIDIA(cc) && turing_mma_available(cc) && P.c % 8 == 0 && (P.r > 1 || P.s > 1)) {
@@ -829,39 +856,67 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
         const unsigned int shmem_bytes = (BM_dim * BK_dim + BK_dim * BN_dim) * 2 * sizeof(half);
         const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
-        const unsigned int ksplit = 8;
-        if (BlocksM * BlocksN < nsm && P.c >= 8 * ksplit && (P.c * P.r * P.s) % (8*ksplit) == 0) {
-            ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), ksplit * P.k * P.Oh * P.Ow * P.n);
+        // const unsigned int ksplit = 6;
+        // if (BlocksM * BlocksN < nsm && P.c >= 8 * ksplit && (P.c * P.r * P.s) % (8*ksplit) == 0) {
+        printf("split factor info = %d, %d, %d \n", BlocksM, BlocksN, nsm / (BlocksM * BlocksN));
+        if (BlocksM * BlocksN < nsm && nsm / (BlocksM * BlocksN) <= 8 ){
-            cudaFuncSetAttribute(conv2d_implicit_kernel<...>,
-                                 cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
-            dim3 gridDim(BlocksN, BlocksM, ksplit);
-            dim3 blockDim(ThreadsN, ThreadsM);
-
-            conv2d_implicit_kernel<...>
-                <<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_H, Y_H.get(), P);
-
-            const unsigned int nrows = P.n * P.k * P.Oh * P.Ow;
-            const unsigned int blockx = (nrows + 511) / 512;
-            const dim3 block_nums(blockx, 1, 1);
-            const dim3 block_dims(512, 1, 1);
-            reduce_f32<<<block_nums, block_dims, 0, st>>>(Y_H.get(), Y_D, nrows, ksplit);
-
-        } else {
-            ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), P.k * P.Oh * P.Ow * P.n);
-
-            cudaFuncSetAttribute(conv2d_implicit_kernel<...>,
-                                 cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
-            dim3 gridDim(BlocksN, BlocksM);
-            dim3 blockDim(ThreadsN, ThreadsM);
-
-            conv2d_implicit_kernel<...>
-                <<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_H, Y_H.get(), P);
-            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-            to_fp32_cuda(Y_H.get(), Y_D, P.k * P.Oh * P.Ow * P.n, st);
+            int ks = nsm / (BlocksM * BlocksN);
+            printf("split factor init = %d \n", ks);
+            int j;
+            bool can_split = false;
+            for (j = ks; j >= 2; j--){
+                if ((P.c * P.r * P.s) % (8*j) == 0){
+                    can_split = true;
+                    break;
+                }
+            }
+            if(can_split){
+                printf("split factor = %d \n", j);
+                if (j == 2) {
+                    const unsigned int ksplit = 2;
+                    launch_conv2d_implicit_split_kernel<ksplit>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 3) {
+                    const unsigned int ksplit = 3;
+                    launch_conv2d_implicit_split_kernel<ksplit>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 4) {
+                    const unsigned int ksplit = 4;
+                    launch_conv2d_implicit_split_kernel<ksplit>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 5) {
+                    const unsigned int ksplit = 5;
+                    launch_conv2d_implicit_split_kernel<ksplit>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 6) {
+                    const unsigned int ksplit = 6;
+                    launch_conv2d_implicit_split_kernel<ksplit>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 7) {
+                    const unsigned int ksplit = 7;
+                    launch_conv2d_implicit_split_kernel<ksplit>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                } else if (j == 8) {
+                    const unsigned int ksplit = 8;
+                    launch_conv2d_implicit_split_kernel<ksplit>(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st);
+                }
+                return;
+            }
         }
+        ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), P.k * P.Oh * P.Ow * P.n);
+
+        cudaFuncSetAttribute(conv2d_implicit_kernel<...>,
+                             cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // set shared memory limit to 64KB which is maximum for sm_75
+        dim3 gridDim(BlocksN, BlocksM);
+        dim3 blockDim(ThreadsN, ThreadsM);
+
+        conv2d_implicit_kernel<...>
+            <<<gridDim, blockDim, shmem_bytes, st>>>(X_H, K_H, Y_H.get(), P);
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(Y_H.get(), Y_D, P.k * P.Oh * P.Ow * P.n, st);
     } else{
         conv2d_implicit_cuda(X_D, K_D, Y_D, P, st);
     }
diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp
index 57edc02474..5af7da0a91 100644
--- a/tests/test-conv2d.cpp
+++ b/tests/test-conv2d.cpp
@@ -293,42 +293,38 @@ std::vector<float> compute_graph(const test_model & model, ggml_gallocr_t allocr
 }
 
-
-int main(void)
-{
-    ggml_time_init();
-
-    double time_iter0 = 0.0, time_iter1 = 0.0;
-    std::vector<std::tuple<int, int, int, int, int, int>> configs = {
-        std::make_tuple(64,64,48,64,3,3),
-        std::make_tuple(320,320,104,152,3,3),
-        std::make_tuple(640,640,52,76,3,3),
-        std::make_tuple(640,640,104,152,3,3),
-        std::make_tuple(960,320,104,152,3,3),
-        std::make_tuple(1280,1280,26,38,3,3),
-        std::make_tuple(4,320,96,128,3,3),
-        std::make_tuple(320,4,96,128,3,3),
-        std::make_tuple(4,320,64,96,3,3),
-        std::make_tuple(320,4,64,96,3,3),
-        std::make_tuple(640,640,96,128,3,3),
-        std::make_tuple(1280,1280,26,38,1,1),
-        std::make_tuple(256,128,768,1024,3,3),
-        std::make_tuple(128,3,768,1024,3,3),
-        std::make_tuple(256,128,768,1024,1,1),
-        std::make_tuple(512,256,384,512,1,1),
-        std::make_tuple(1280,640,52,76,3,3),
-        std::make_tuple(1920,1280,26,38,3,3),
-        std::make_tuple(2560,1280,26,38,3,3),
-        std::make_tuple(320,1280,26,38,3,3),
-        std::make_tuple(512,512,104,152,3,3),
-        std::make_tuple(512,512,208,304,3,3),
-        std::make_tuple(512,256,416,608,3,3),
-        std::make_tuple(256,128,832,1216,3,3),
-        std::make_tuple(256,256,832,1216,3,3),
-        std::make_tuple(32,64,58,58,3,3)
+static std::vector<std::tuple<int, int, int, int, int, int>> configs = {
+    // std::make_tuple(64,64,48,64,3,3),
+    // std::make_tuple(320,320,104,152,3,3),
+    // std::make_tuple(640,640,52,76,3,3),
+    // std::make_tuple(640,640,104,152,3,3),
+    // std::make_tuple(960,320,104,152,3,3),
+    // std::make_tuple(1280,1280,26,38,3,3),
+    std::make_tuple(1920,640,32,32,3,3)
+    // std::make_tuple(4,320,96,128,3,3),
+    // std::make_tuple(320,4,96,128,3,3),
+    // std::make_tuple(4,320,64,96,3,3),
+    // std::make_tuple(320,4,64,96,3,3),
+    // std::make_tuple(640,640,96,128,3,3),
+    // std::make_tuple(1280,1280,26,38,1,1),
+    // std::make_tuple(256,128,768,1024,3,3),
+    // std::make_tuple(128,3,768,1024,3,3),
+    // std::make_tuple(256,128,768,1024,1,1),
+    // std::make_tuple(512,256,384,512,1,1),
+    // std::make_tuple(1280,640,52,76,3,3),
+    // std::make_tuple(1920,1280,26,38,3,3),
+    // std::make_tuple(2560,1280,26,38,3,3),
+    // std::make_tuple(320,1280,26,38,3,3),
+    // std::make_tuple(512,512,104,152,3,3),
+    // std::make_tuple(512,512,208,304,3,3),
+    // std::make_tuple(512,256,416,608,3,3),
+    // std::make_tuple(256,128,832,1216,3,3),
+    // std::make_tuple(256,256,832,1216,3,3),
+    // std::make_tuple(32,64,58,58,3,3)
     // std::make_tuple(320,256,1024,1920)
 };
-    std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_512 = {
+
+static std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_512 = {
     //512x512
     std::make_tuple(4,320,64,64,3,3),
     std::make_tuple(320,320,64,64,3,3),
@@ -434,7 +430,7 @@ int main(void)
     std::make_tuple(320,4,64,64,3,3)
 };
 
-    std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_768 = {
+static std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_768 = {
     //768x768
     std::make_tuple(4,320,96,96,3,3),
     std::make_tuple(320,320,96,96,3,3),
@@ -540,7 +536,7 @@ int main(void)
     std::make_tuple(320,4,96,96,3,3),
 };
 
-    std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_1024 = {
+static std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_1024 = {
     //1024x1024
     std::make_tuple(4,320,128,128,3,3),
    std::make_tuple(320,320,128,128,3,3),
@@ -646,10 +642,17 @@ int main(void)
     std::make_tuple(320,4,128,128,3,3)
 };
 
+
+int main(void)
+{
+    ggml_time_init();
+
+    double time_iter0 = 0.0, time_iter1 = 0.0;
+
     int k = 0;
-    for (auto c : configs_sdxl_512){
-    // for (auto c : configs){
+    // for (auto c : configs_sdxl_512){
+    for (auto c : configs){
         test_model model;
         load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c), std::get<3>(c), std::get<4>(c), std::get<5>(c), true);
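
The split-K scheme the patch generalizes is easier to see stripped of the ggml machinery: each gridDim.z slice of the first kernel accumulates a disjoint chunk of the reduction dimension into its own partial buffer, a second kernel sums the slices, and because the split factor is a template parameter, the runtime value has to be mapped onto a compile-time constant — which is what the j == 2 … j == 8 ladder above does. The following is a minimal sketch of that structure only: the names (partial_dot, reduce_partials, launch_split) are invented for illustration and the workload is a plain dot product, not ggml's implicit-GEMM convolution.

#include <cstdio>
#include <cuda_runtime.h>

// First pass: each gridDim.z slice accumulates its chunk of the K dimension.
template <int ksplit>
__global__ void partial_dot(const float * a, const float * b, float * partial, int n, int k) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;  // output element
    if (row >= n) {
        return;
    }
    const int chunk = k / ksplit;          // caller guarantees k % ksplit == 0
    const int start = blockIdx.z * chunk;  // this slice's K range
    float acc = 0.0f;
    for (int i = start; i < start + chunk; ++i) {
        acc += a[row * k + i] * b[i];
    }
    partial[blockIdx.z * n + row] = acc;   // one partial vector per slice
}

// Second pass: sum the per-slice partials into the final output,
// playing the role reduce_f32 plays in the patch.
__global__ void reduce_partials(const float * partial, float * out, int nrows, int nslices) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= nrows) {
        return;
    }
    float acc = 0.0f;
    for (int s = 0; s < nslices; ++s) {
        acc += partial[s * nrows + row];
    }
    out[row] = acc;
}

// Map the runtime split factor onto the compile-time template parameter,
// as the if/else ladder over j does in the patch.
static void launch_split(const float * a, const float * b, float * partial, float * out,
                         int n, int k, int split, cudaStream_t st) {
    const dim3 block(256);
    const dim3 grid((n + 255) / 256, 1, split);
    switch (split) {
        case 2: partial_dot<2><<<grid, block, 0, st>>>(a, b, partial, n, k); break;
        case 4: partial_dot<4><<<grid, block, 0, st>>>(a, b, partial, n, k); break;
        case 8: partial_dot<8><<<grid, block, 0, st>>>(a, b, partial, n, k); break;
        default: return; // this sketch only supports factors 2, 4 and 8
    }
    reduce_partials<<<dim3((n + 255) / 256), block, 0, st>>>(partial, out, n, split);
}

int main() {
    const int n = 1024, k = 64, split = 4;
    float *a, *b, *partial, *out;
    cudaMallocManaged(&a, n * k * sizeof(float));
    cudaMallocManaged(&b, k * sizeof(float));
    cudaMallocManaged(&partial, split * n * sizeof(float));
    cudaMallocManaged(&out, n * sizeof(float));
    for (int i = 0; i < n * k; ++i) { a[i] = 1.0f; }
    for (int i = 0; i < k; ++i)     { b[i] = 1.0f; }
    launch_split(a, b, partial, out, n, k, split, 0);
    cudaDeviceSynchronize();
    printf("out[0] = %.1f (expected %d)\n", out[0], k);
    cudaFree(a); cudaFree(b); cudaFree(partial); cudaFree(out);
    return 0;
}

A switch is used here for the runtime-to-compile-time dispatch; it is equivalent to the patch's if/else chain, just more compact. In the real patch the partial buffer holds half-precision values from the tensor-core kernel and the factor is chosen from the SM count, but the two-pass control flow is the same.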