diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index 3a84935582..2fd244389d 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -677,14 +677,12 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, uint32_t (&reg_)[2] = reinterpret_cast<uint32_t(&)[2]>(acc_register_[mma_m][mma_n]); uint idx = output_sts_addr + mma_m * MMA_M * BN / 2 + (mma_n - i * mma_tiles_per_warp_n/2) * MMA_N; - uint idx8 = idx + 8 * BN / 2; idx = idx ^ ((idx & 0b110000000000) >> 9); idx = idx ^ ((idx & 0b1110000000) >> 4); uint32_t* dst_ptr = reinterpret_cast<uint32_t*>(&smemoutput[idx]); dst_ptr[0] = reg_[0]; - idx8 = idx8 ^ ((idx8 & 0b110000000000) >> 9); - idx8 = idx8 ^ ((idx8 & 0b1110000000) >> 4); - dst_ptr = reinterpret_cast<uint32_t*>(&smemoutput[idx8]); + idx = (idx + 8 * BN / 2 ) ^ 0b010; + dst_ptr = reinterpret_cast<uint32_t*>(&smemoutput[idx]); dst_ptr[0] = reg_[1]; } } diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index 75778b6e30..720ddbf269 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -300,353 +300,355 @@ int main(void) double time_iter0 = 0.0, time_iter1 = 0.0; std::vector> configs = { - // std::make_tuple(64,64,48,64,3,3), - // std::make_tuple(320,320,104,152,3,3), - // std::make_tuple(640,640,52,76,3,3), - // std::make_tuple(640,640,104,152,3,3), - // std::make_tuple(960,320,104,152,3,3), + std::make_tuple(64,64,48,64,3,3), + std::make_tuple(320,320,104,152,3,3), + std::make_tuple(640,640,52,76,3,3), + std::make_tuple(640,640,104,152,3,3), + std::make_tuple(960,320,104,152,3,3), std::make_tuple(1280,1280,26,38,3,3), - // std::make_tuple(4,320,96,128,3,3), - // std::make_tuple(320,4,96,128,3,3), - // std::make_tuple(4,320,64,96,3,3), - // std::make_tuple(320,4,64,96,3,3), - // std::make_tuple(640,640,96,128,3,3), - // std::make_tuple(1280,1280,26,38,1,1), - // std::make_tuple(256,128,768,1024,3,3), - // std::make_tuple(128,3,768,1024,3,3), - // 
std::make_tuple(256,128,768,1024,1,1), - // std::make_tuple(512,256,384,512,1,1), - // std::make_tuple(1280,640,52,76,3,3), - // std::make_tuple(1920,1280,26,38,3,3), - // std::make_tuple(2560,1280,26,38,3,3), - // std::make_tuple(320,1280,26,38,3,3), - // std::make_tuple(512,512,104,152,3,3), - // std::make_tuple(512,512,208,304,3,3), - // std::make_tuple(512,256,416,608,3,3), - // std::make_tuple(256,128,832,1216,3,3), - // std::make_tuple(256,256,832,1216,3,3), + std::make_tuple(4,320,96,128,3,3), + std::make_tuple(320,4,96,128,3,3), + std::make_tuple(4,320,64,96,3,3), + std::make_tuple(320,4,64,96,3,3), + std::make_tuple(640,640,96,128,3,3), + std::make_tuple(1280,1280,26,38,1,1), + std::make_tuple(256,128,768,1024,3,3), + std::make_tuple(128,3,768,1024,3,3), + std::make_tuple(256,128,768,1024,1,1), + std::make_tuple(512,256,384,512,1,1), + std::make_tuple(1280,640,52,76,3,3), + std::make_tuple(1920,1280,26,38,3,3), + std::make_tuple(2560,1280,26,38,3,3), + std::make_tuple(320,1280,26,38,3,3), + std::make_tuple(512,512,104,152,3,3), + std::make_tuple(512,512,208,304,3,3), + std::make_tuple(512,256,416,608,3,3), + std::make_tuple(256,128,832,1216,3,3), + std::make_tuple(256,256,832,1216,3,3), // std::make_tuple(320,256,1024,1920) - // std::make_tuple(32,64,58,58,3,3) - + std::make_tuple(32,64,58,58,3,3) + }; + std::vector> configs_sdxl_512 = { //512x512 - // std::make_tuple(4,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(320,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(320,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(640,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(640,1280,16,16,3,3), - 
// std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(2560,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(2560,1280,16,16,3,3), - // std::make_tuple(2560,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(2560,1280,16,16,3,3), - // std::make_tuple(1920,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1920,1280,16,16,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1920,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(1920,640,32,32,3,3), - // std::make_tuple(1280,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(1280,640,32,32,3,3), - // std::make_tuple(960,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(960,640,32,32,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(960,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(960,320,64,64,3,3), - // std::make_tuple(640,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(640,320,64,64,3,3), - // std::make_tuple(640,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(640,320,64,64,3,3), - // std::make_tuple(320,4,64,64,3,3), - // std::make_tuple(4,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(320,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(320,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - 
// std::make_tuple(640,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(640,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(2560,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(2560,1280,16,16,3,3), - // std::make_tuple(2560,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(2560,1280,16,16,3,3), - // std::make_tuple(1920,1280,16,16,3,3), - // std::make_tuple(1280,1280,16,16,3,3), - // std::make_tuple(1920,1280,16,16,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1920,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(1920,640,32,32,3,3), - // std::make_tuple(1280,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(1280,640,32,32,3,3), - // std::make_tuple(960,640,32,32,3,3), - // std::make_tuple(640,640,32,32,3,3), - // std::make_tuple(960,640,32,32,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(960,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(960,320,64,64,3,3), - // std::make_tuple(640,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(640,320,64,64,3,3), - // std::make_tuple(640,320,64,64,3,3), - // std::make_tuple(320,320,64,64,3,3), - // std::make_tuple(640,320,64,64,3,3), - // std::make_tuple(320,4,64,64,3,3), + std::make_tuple(4,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(320,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(320,640,32,32,3,3), + 
std::make_tuple(640,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(640,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(640,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(2560,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(2560,1280,16,16,3,3), + std::make_tuple(2560,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(2560,1280,16,16,3,3), + std::make_tuple(1920,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1920,1280,16,16,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1920,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(1920,640,32,32,3,3), + std::make_tuple(1280,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(1280,640,32,32,3,3), + std::make_tuple(960,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(960,640,32,32,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(960,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(960,320,64,64,3,3), + std::make_tuple(640,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(640,320,64,64,3,3), + std::make_tuple(640,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(640,320,64,64,3,3), + std::make_tuple(320,4,64,64,3,3), + std::make_tuple(4,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(320,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(320,640,32,32,3,3), + 
std::make_tuple(640,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(640,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(640,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(2560,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(2560,1280,16,16,3,3), + std::make_tuple(2560,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(2560,1280,16,16,3,3), + std::make_tuple(1920,1280,16,16,3,3), + std::make_tuple(1280,1280,16,16,3,3), + std::make_tuple(1920,1280,16,16,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1920,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(1920,640,32,32,3,3), + std::make_tuple(1280,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(1280,640,32,32,3,3), + std::make_tuple(960,640,32,32,3,3), + std::make_tuple(640,640,32,32,3,3), + std::make_tuple(960,640,32,32,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(960,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(960,320,64,64,3,3), + std::make_tuple(640,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(640,320,64,64,3,3), + std::make_tuple(640,320,64,64,3,3), + std::make_tuple(320,320,64,64,3,3), + std::make_tuple(640,320,64,64,3,3), + std::make_tuple(320,4,64,64,3,3) + }; + std::vector> configs_sdxl_768 = { //768x768 - // std::make_tuple(4,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(320,640,48,48,3,3), - // 
std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(320,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(640,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(640,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(2560,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(2560,1280,24,24,3,3), - // std::make_tuple(2560,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(2560,1280,24,24,3,3), - // std::make_tuple(1920,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1920,1280,24,24,3,3), - // std::make_tuple(1280,1280,48,48,3,3), - // std::make_tuple(1920,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(1920,640,48,48,3,3), - // std::make_tuple(1280,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(1280,640,48,48,3,3), - // std::make_tuple(960,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(960,640,48,48,3,3), - // std::make_tuple(640,640,96,96,3,3), - // std::make_tuple(960,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(960,320,96,96,3,3), - // std::make_tuple(640,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(640,320,96,96,3,3), - // std::make_tuple(640,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(640,320,96,96,3,3), - // std::make_tuple(320,4,96,96,3,3), - // std::make_tuple(4,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), 
- // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(320,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(320,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(640,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(640,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(2560,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(2560,1280,24,24,3,3), - // std::make_tuple(2560,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(2560,1280,24,24,3,3), - // std::make_tuple(1920,1280,24,24,3,3), - // std::make_tuple(1280,1280,24,24,3,3), - // std::make_tuple(1920,1280,24,24,3,3), - // std::make_tuple(1280,1280,48,48,3,3), - // std::make_tuple(1920,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(1920,640,48,48,3,3), - // std::make_tuple(1280,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(1280,640,48,48,3,3), - // std::make_tuple(960,640,48,48,3,3), - // std::make_tuple(640,640,48,48,3,3), - // std::make_tuple(960,640,48,48,3,3), - // std::make_tuple(640,640,96,96,3,3), - // std::make_tuple(960,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(960,320,96,96,3,3), - // std::make_tuple(640,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(640,320,96,96,3,3), - // std::make_tuple(640,320,96,96,3,3), - // std::make_tuple(320,320,96,96,3,3), - // std::make_tuple(640,320,96,96,3,3), - // std::make_tuple(320,4,96,96,3,3), - + 
std::make_tuple(4,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(320,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(320,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(640,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(640,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(2560,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(2560,1280,24,24,3,3), + std::make_tuple(2560,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(2560,1280,24,24,3,3), + std::make_tuple(1920,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1920,1280,24,24,3,3), + std::make_tuple(1280,1280,48,48,3,3), + std::make_tuple(1920,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(1920,640,48,48,3,3), + std::make_tuple(1280,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(1280,640,48,48,3,3), + std::make_tuple(960,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(960,640,48,48,3,3), + std::make_tuple(640,640,96,96,3,3), + std::make_tuple(960,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(960,320,96,96,3,3), + std::make_tuple(640,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(640,320,96,96,3,3), + std::make_tuple(640,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(640,320,96,96,3,3), + std::make_tuple(320,4,96,96,3,3), + 
std::make_tuple(4,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(320,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(320,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(640,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(640,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(2560,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(2560,1280,24,24,3,3), + std::make_tuple(2560,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(2560,1280,24,24,3,3), + std::make_tuple(1920,1280,24,24,3,3), + std::make_tuple(1280,1280,24,24,3,3), + std::make_tuple(1920,1280,24,24,3,3), + std::make_tuple(1280,1280,48,48,3,3), + std::make_tuple(1920,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(1920,640,48,48,3,3), + std::make_tuple(1280,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(1280,640,48,48,3,3), + std::make_tuple(960,640,48,48,3,3), + std::make_tuple(640,640,48,48,3,3), + std::make_tuple(960,640,48,48,3,3), + std::make_tuple(640,640,96,96,3,3), + std::make_tuple(960,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(960,320,96,96,3,3), + std::make_tuple(640,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(640,320,96,96,3,3), + std::make_tuple(640,320,96,96,3,3), + std::make_tuple(320,320,96,96,3,3), + std::make_tuple(640,320,96,96,3,3), + std::make_tuple(320,4,96,96,3,3), + }; + std::vector> 
configs_sdxl_1024 = { //1024x1024 - // std::make_tuple(4,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(320,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(320,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(640,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(640,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(2560,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(2560,1280,32,32,3,3), - // std::make_tuple(2560,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(2560,1280,32,32,3,3), - // std::make_tuple(1920,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1920,1280,32,32,3,3), - // std::make_tuple(1280,1280,64,64,3,3), - // std::make_tuple(1920,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(1920,640,64,64,3,3), - // std::make_tuple(1280,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(1280,640,64,64,3,3), - // std::make_tuple(960,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(960,640,64,64,3,3), - // std::make_tuple(640,640,128,128,3,3), - // std::make_tuple(960,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(960,320,128,128,3,3), - // std::make_tuple(640,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // 
std::make_tuple(640,320,128,128,3,3), - // std::make_tuple(640,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(640,320,128,128,3,3), - // std::make_tuple(320,4,128,128,3,3), - // std::make_tuple(4,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(320,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(320,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(640,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(640,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(2560,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(2560,1280,32,32,3,3), - // std::make_tuple(2560,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(2560,1280,32,32,3,3), - // std::make_tuple(1920,1280,32,32,3,3), - // std::make_tuple(1280,1280,32,32,3,3), - // std::make_tuple(1920,1280,32,32,3,3), - // std::make_tuple(1280,1280,64,64,3,3), - // std::make_tuple(1920,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(1920,640,64,64,3,3), - // std::make_tuple(1280,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(1280,640,64,64,3,3), - // std::make_tuple(960,640,64,64,3,3), - // std::make_tuple(640,640,64,64,3,3), - // std::make_tuple(960,640,64,64,3,3), - // std::make_tuple(640,640,128,128,3,3), - // std::make_tuple(960,320,128,128,3,3), - // 
std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(960,320,128,128,3,3), - // std::make_tuple(640,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(640,320,128,128,3,3), - // std::make_tuple(640,320,128,128,3,3), - // std::make_tuple(320,320,128,128,3,3), - // std::make_tuple(640,320,128,128,3,3), - // std::make_tuple(320,4,128,128,3,3), - - + std::make_tuple(4,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(320,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(320,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(640,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(640,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(2560,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(2560,1280,32,32,3,3), + std::make_tuple(2560,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(2560,1280,32,32,3,3), + std::make_tuple(1920,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1920,1280,32,32,3,3), + std::make_tuple(1280,1280,64,64,3,3), + std::make_tuple(1920,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(1920,640,64,64,3,3), + std::make_tuple(1280,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(1280,640,64,64,3,3), + std::make_tuple(960,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(960,640,64,64,3,3), + std::make_tuple(640,640,128,128,3,3), + 
std::make_tuple(960,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(960,320,128,128,3,3), + std::make_tuple(640,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(640,320,128,128,3,3), + std::make_tuple(640,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(640,320,128,128,3,3), + std::make_tuple(320,4,128,128,3,3), + std::make_tuple(4,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(320,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(320,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(640,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(640,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(2560,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(2560,1280,32,32,3,3), + std::make_tuple(2560,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(2560,1280,32,32,3,3), + std::make_tuple(1920,1280,32,32,3,3), + std::make_tuple(1280,1280,32,32,3,3), + std::make_tuple(1920,1280,32,32,3,3), + std::make_tuple(1280,1280,64,64,3,3), + std::make_tuple(1920,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(1920,640,64,64,3,3), + std::make_tuple(1280,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(1280,640,64,64,3,3), + std::make_tuple(960,640,64,64,3,3), + std::make_tuple(640,640,64,64,3,3), + std::make_tuple(960,640,64,64,3,3), + 
std::make_tuple(640,640,128,128,3,3), + std::make_tuple(960,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(960,320,128,128,3,3), + std::make_tuple(640,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(640,320,128,128,3,3), + std::make_tuple(640,320,128,128,3,3), + std::make_tuple(320,320,128,128,3,3), + std::make_tuple(640,320,128,128,3,3), + std::make_tuple(320,4,128,128,3,3) }; int k = 0; - for (auto c : configs){ + for (auto c : configs_sdxl_1024){ test_model model; load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c), std::get<3>(c), std::get<4>(c), std::get<5>(c), true); @@ -663,7 +665,7 @@ int main(void) // fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f); - int iterations = 0; + int iterations = 20; double run_time0; std::vector im2col_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);