Output shared-memory stores are now bank-conflict free, and performance gets a small boost too

This commit is contained in:
bssrdf 2025-11-07 22:11:21 -05:00
parent 949eca4cba
commit 8809af79a8
2 changed files with 341 additions and 341 deletions

View File

@ -677,14 +677,12 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
uint32_t (&reg_)[2] = reinterpret_cast<uint32_t(&)[2]>(acc_register_[mma_m][mma_n]);
uint idx = output_sts_addr +
mma_m * MMA_M * BN / 2 + (mma_n - i * mma_tiles_per_warp_n/2) * MMA_N;
uint idx8 = idx + 8 * BN / 2;
idx = idx ^ ((idx & 0b110000000000) >> 9);
idx = idx ^ ((idx & 0b1110000000) >> 4);
uint32_t* dst_ptr = reinterpret_cast<uint32_t*>(&smemoutput[idx]);
dst_ptr[0] = reg_[0];
idx8 = idx8 ^ ((idx8 & 0b110000000000) >> 9);
idx8 = idx8 ^ ((idx8 & 0b1110000000) >> 4);
dst_ptr = reinterpret_cast<uint32_t*>(&smemoutput[idx8]);
idx = (idx + 8 * BN / 2 ) ^ 0b010;
dst_ptr = reinterpret_cast<uint32_t*>(&smemoutput[idx]);
dst_ptr[0] = reg_[1];
}
}

View File

@ -300,353 +300,355 @@ int main(void)
double time_iter0 = 0.0, time_iter1 = 0.0;
std::vector<std::tuple<int, int, int, int, int, int>> configs = {
// std::make_tuple(64,64,48,64,3,3),
// std::make_tuple(320,320,104,152,3,3),
// std::make_tuple(640,640,52,76,3,3),
// std::make_tuple(640,640,104,152,3,3),
// std::make_tuple(960,320,104,152,3,3),
std::make_tuple(64,64,48,64,3,3),
std::make_tuple(320,320,104,152,3,3),
std::make_tuple(640,640,52,76,3,3),
std::make_tuple(640,640,104,152,3,3),
std::make_tuple(960,320,104,152,3,3),
std::make_tuple(1280,1280,26,38,3,3),
// std::make_tuple(4,320,96,128,3,3),
// std::make_tuple(320,4,96,128,3,3),
// std::make_tuple(4,320,64,96,3,3),
// std::make_tuple(320,4,64,96,3,3),
// std::make_tuple(640,640,96,128,3,3),
// std::make_tuple(1280,1280,26,38,1,1),
// std::make_tuple(256,128,768,1024,3,3),
// std::make_tuple(128,3,768,1024,3,3),
// std::make_tuple(256,128,768,1024,1,1),
// std::make_tuple(512,256,384,512,1,1),
// std::make_tuple(1280,640,52,76,3,3),
// std::make_tuple(1920,1280,26,38,3,3),
// std::make_tuple(2560,1280,26,38,3,3),
// std::make_tuple(320,1280,26,38,3,3),
// std::make_tuple(512,512,104,152,3,3),
// std::make_tuple(512,512,208,304,3,3),
// std::make_tuple(512,256,416,608,3,3),
// std::make_tuple(256,128,832,1216,3,3),
// std::make_tuple(256,256,832,1216,3,3),
std::make_tuple(4,320,96,128,3,3),
std::make_tuple(320,4,96,128,3,3),
std::make_tuple(4,320,64,96,3,3),
std::make_tuple(320,4,64,96,3,3),
std::make_tuple(640,640,96,128,3,3),
std::make_tuple(1280,1280,26,38,1,1),
std::make_tuple(256,128,768,1024,3,3),
std::make_tuple(128,3,768,1024,3,3),
std::make_tuple(256,128,768,1024,1,1),
std::make_tuple(512,256,384,512,1,1),
std::make_tuple(1280,640,52,76,3,3),
std::make_tuple(1920,1280,26,38,3,3),
std::make_tuple(2560,1280,26,38,3,3),
std::make_tuple(320,1280,26,38,3,3),
std::make_tuple(512,512,104,152,3,3),
std::make_tuple(512,512,208,304,3,3),
std::make_tuple(512,256,416,608,3,3),
std::make_tuple(256,128,832,1216,3,3),
std::make_tuple(256,256,832,1216,3,3),
// std::make_tuple(320,256,1024,1920)
// std::make_tuple(32,64,58,58,3,3)
std::make_tuple(32,64,58,58,3,3)
};
std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_512 = {
//512x512
// std::make_tuple(4,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(320,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(320,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(640,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(640,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(2560,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(2560,1280,16,16,3,3),
// std::make_tuple(2560,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(2560,1280,16,16,3,3),
// std::make_tuple(1920,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1920,1280,16,16,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1920,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(1920,640,32,32,3,3),
// std::make_tuple(1280,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(1280,640,32,32,3,3),
// std::make_tuple(960,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(960,640,32,32,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(960,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(960,320,64,64,3,3),
// std::make_tuple(640,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(640,320,64,64,3,3),
// std::make_tuple(640,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(640,320,64,64,3,3),
// std::make_tuple(320,4,64,64,3,3),
// std::make_tuple(4,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(320,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(320,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(640,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(640,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(2560,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(2560,1280,16,16,3,3),
// std::make_tuple(2560,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(2560,1280,16,16,3,3),
// std::make_tuple(1920,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1920,1280,16,16,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1920,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(1920,640,32,32,3,3),
// std::make_tuple(1280,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(1280,640,32,32,3,3),
// std::make_tuple(960,640,32,32,3,3),
// std::make_tuple(640,640,32,32,3,3),
// std::make_tuple(960,640,32,32,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(960,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(960,320,64,64,3,3),
// std::make_tuple(640,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(640,320,64,64,3,3),
// std::make_tuple(640,320,64,64,3,3),
// std::make_tuple(320,320,64,64,3,3),
// std::make_tuple(640,320,64,64,3,3),
// std::make_tuple(320,4,64,64,3,3),
std::make_tuple(4,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(320,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(320,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(640,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(640,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(2560,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(2560,1280,16,16,3,3),
std::make_tuple(2560,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(2560,1280,16,16,3,3),
std::make_tuple(1920,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1920,1280,16,16,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1920,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(1920,640,32,32,3,3),
std::make_tuple(1280,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(1280,640,32,32,3,3),
std::make_tuple(960,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(960,640,32,32,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(960,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(960,320,64,64,3,3),
std::make_tuple(640,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(640,320,64,64,3,3),
std::make_tuple(640,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(640,320,64,64,3,3),
std::make_tuple(320,4,64,64,3,3),
std::make_tuple(4,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(320,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(320,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(640,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(640,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(2560,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(2560,1280,16,16,3,3),
std::make_tuple(2560,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(2560,1280,16,16,3,3),
std::make_tuple(1920,1280,16,16,3,3),
std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(1920,1280,16,16,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1920,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(1920,640,32,32,3,3),
std::make_tuple(1280,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(1280,640,32,32,3,3),
std::make_tuple(960,640,32,32,3,3),
std::make_tuple(640,640,32,32,3,3),
std::make_tuple(960,640,32,32,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(960,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(960,320,64,64,3,3),
std::make_tuple(640,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(640,320,64,64,3,3),
std::make_tuple(640,320,64,64,3,3),
std::make_tuple(320,320,64,64,3,3),
std::make_tuple(640,320,64,64,3,3),
std::make_tuple(320,4,64,64,3,3)
};
std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_768 = {
//768x768
// std::make_tuple(4,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(320,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(320,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(640,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(640,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(2560,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(2560,1280,24,24,3,3),
// std::make_tuple(2560,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(2560,1280,24,24,3,3),
// std::make_tuple(1920,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1920,1280,24,24,3,3),
// std::make_tuple(1280,1280,48,48,3,3),
// std::make_tuple(1920,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(1920,640,48,48,3,3),
// std::make_tuple(1280,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(1280,640,48,48,3,3),
// std::make_tuple(960,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(960,640,48,48,3,3),
// std::make_tuple(640,640,96,96,3,3),
// std::make_tuple(960,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(960,320,96,96,3,3),
// std::make_tuple(640,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(640,320,96,96,3,3),
// std::make_tuple(640,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(640,320,96,96,3,3),
// std::make_tuple(320,4,96,96,3,3),
// std::make_tuple(4,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(320,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(320,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(640,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(640,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(2560,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(2560,1280,24,24,3,3),
// std::make_tuple(2560,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(2560,1280,24,24,3,3),
// std::make_tuple(1920,1280,24,24,3,3),
// std::make_tuple(1280,1280,24,24,3,3),
// std::make_tuple(1920,1280,24,24,3,3),
// std::make_tuple(1280,1280,48,48,3,3),
// std::make_tuple(1920,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(1920,640,48,48,3,3),
// std::make_tuple(1280,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(1280,640,48,48,3,3),
// std::make_tuple(960,640,48,48,3,3),
// std::make_tuple(640,640,48,48,3,3),
// std::make_tuple(960,640,48,48,3,3),
// std::make_tuple(640,640,96,96,3,3),
// std::make_tuple(960,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(960,320,96,96,3,3),
// std::make_tuple(640,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(640,320,96,96,3,3),
// std::make_tuple(640,320,96,96,3,3),
// std::make_tuple(320,320,96,96,3,3),
// std::make_tuple(640,320,96,96,3,3),
// std::make_tuple(320,4,96,96,3,3),
std::make_tuple(4,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(320,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(320,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(640,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(640,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(2560,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(2560,1280,24,24,3,3),
std::make_tuple(2560,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(2560,1280,24,24,3,3),
std::make_tuple(1920,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1920,1280,24,24,3,3),
std::make_tuple(1280,1280,48,48,3,3),
std::make_tuple(1920,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(1920,640,48,48,3,3),
std::make_tuple(1280,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(1280,640,48,48,3,3),
std::make_tuple(960,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(960,640,48,48,3,3),
std::make_tuple(640,640,96,96,3,3),
std::make_tuple(960,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(960,320,96,96,3,3),
std::make_tuple(640,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(640,320,96,96,3,3),
std::make_tuple(640,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(640,320,96,96,3,3),
std::make_tuple(320,4,96,96,3,3),
std::make_tuple(4,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(320,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(320,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(640,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(640,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(2560,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(2560,1280,24,24,3,3),
std::make_tuple(2560,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(2560,1280,24,24,3,3),
std::make_tuple(1920,1280,24,24,3,3),
std::make_tuple(1280,1280,24,24,3,3),
std::make_tuple(1920,1280,24,24,3,3),
std::make_tuple(1280,1280,48,48,3,3),
std::make_tuple(1920,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(1920,640,48,48,3,3),
std::make_tuple(1280,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(1280,640,48,48,3,3),
std::make_tuple(960,640,48,48,3,3),
std::make_tuple(640,640,48,48,3,3),
std::make_tuple(960,640,48,48,3,3),
std::make_tuple(640,640,96,96,3,3),
std::make_tuple(960,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(960,320,96,96,3,3),
std::make_tuple(640,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(640,320,96,96,3,3),
std::make_tuple(640,320,96,96,3,3),
std::make_tuple(320,320,96,96,3,3),
std::make_tuple(640,320,96,96,3,3),
std::make_tuple(320,4,96,96,3,3),
};
std::vector<std::tuple<int, int, int, int, int, int>> configs_sdxl_1024 = {
//1024x1024
// std::make_tuple(4,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(320,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(320,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(640,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(640,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(2560,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(2560,1280,32,32,3,3),
// std::make_tuple(2560,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(2560,1280,32,32,3,3),
// std::make_tuple(1920,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1920,1280,32,32,3,3),
// std::make_tuple(1280,1280,64,64,3,3),
// std::make_tuple(1920,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(1920,640,64,64,3,3),
// std::make_tuple(1280,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(1280,640,64,64,3,3),
// std::make_tuple(960,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(960,640,64,64,3,3),
// std::make_tuple(640,640,128,128,3,3),
// std::make_tuple(960,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(960,320,128,128,3,3),
// std::make_tuple(640,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(640,320,128,128,3,3),
// std::make_tuple(640,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(640,320,128,128,3,3),
// std::make_tuple(320,4,128,128,3,3),
// std::make_tuple(4,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(320,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(320,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(640,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(640,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(2560,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(2560,1280,32,32,3,3),
// std::make_tuple(2560,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(2560,1280,32,32,3,3),
// std::make_tuple(1920,1280,32,32,3,3),
// std::make_tuple(1280,1280,32,32,3,3),
// std::make_tuple(1920,1280,32,32,3,3),
// std::make_tuple(1280,1280,64,64,3,3),
// std::make_tuple(1920,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(1920,640,64,64,3,3),
// std::make_tuple(1280,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(1280,640,64,64,3,3),
// std::make_tuple(960,640,64,64,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(960,640,64,64,3,3),
// std::make_tuple(640,640,128,128,3,3),
// std::make_tuple(960,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(960,320,128,128,3,3),
// std::make_tuple(640,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(640,320,128,128,3,3),
// std::make_tuple(640,320,128,128,3,3),
// std::make_tuple(320,320,128,128,3,3),
// std::make_tuple(640,320,128,128,3,3),
// std::make_tuple(320,4,128,128,3,3),
std::make_tuple(4,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(320,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(320,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(640,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(640,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(2560,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(2560,1280,32,32,3,3),
std::make_tuple(2560,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(2560,1280,32,32,3,3),
std::make_tuple(1920,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1920,1280,32,32,3,3),
std::make_tuple(1280,1280,64,64,3,3),
std::make_tuple(1920,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(1920,640,64,64,3,3),
std::make_tuple(1280,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(1280,640,64,64,3,3),
std::make_tuple(960,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(960,640,64,64,3,3),
std::make_tuple(640,640,128,128,3,3),
std::make_tuple(960,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(960,320,128,128,3,3),
std::make_tuple(640,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(640,320,128,128,3,3),
std::make_tuple(640,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(640,320,128,128,3,3),
std::make_tuple(320,4,128,128,3,3),
std::make_tuple(4,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(320,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(320,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(640,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(640,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(2560,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(2560,1280,32,32,3,3),
std::make_tuple(2560,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(2560,1280,32,32,3,3),
std::make_tuple(1920,1280,32,32,3,3),
std::make_tuple(1280,1280,32,32,3,3),
std::make_tuple(1920,1280,32,32,3,3),
std::make_tuple(1280,1280,64,64,3,3),
std::make_tuple(1920,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(1920,640,64,64,3,3),
std::make_tuple(1280,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(1280,640,64,64,3,3),
std::make_tuple(960,640,64,64,3,3),
std::make_tuple(640,640,64,64,3,3),
std::make_tuple(960,640,64,64,3,3),
std::make_tuple(640,640,128,128,3,3),
std::make_tuple(960,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(960,320,128,128,3,3),
std::make_tuple(640,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(640,320,128,128,3,3),
std::make_tuple(640,320,128,128,3,3),
std::make_tuple(320,320,128,128,3,3),
std::make_tuple(640,320,128,128,3,3),
std::make_tuple(320,4,128,128,3,3)
};
int k = 0;
for (auto c : configs){
for (auto c : configs_sdxl_1024){
test_model model;
load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c),
std::get<3>(c), std::get<4>(c), std::get<5>(c), true);
@ -663,7 +665,7 @@ int main(void)
// fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
int iterations = 0;
int iterations = 20;
double run_time0;
std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0);