Add TODOs to and adjust heuristics of row-wise soft_max in CUDA
Heuristics were selected based on the following numbers: ``` -- Before Backend 1/2: CUDA0 Device description: NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition Device memory: 97250 MB (96691 MB free) SOFT_MAX(type=f32,ne=[4096,4096,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 2236 runs - 450.34 us/run - 655360 kB/run - 1401.20 GB/s SOFT_MAX(type=f32,ne=[12888,256,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 17748 runs - 56.80 us/run - 128880 kB/run - 2168.19 GB/s SOFT_MAX(type=f32,ne=[77,4096,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 57204 runs - 18.35 us/run - 12320 kB/run - 640.57 GB/s SOFT_MAX(type=f32,ne=[1024,1024,10,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 9840 runs - 102.46 us/run - 81920 kB/run - 763.45 GB/s SOFT_MAX(type=f32,ne=[77,1024,10,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98064 runs - 10.25 us/run - 6160 kB/run - 573.43 GB/s SOFT_MAX(type=f32,ne=[256,256,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98310 runs - 10.25 us/run - 10240 kB/run - 953.20 GB/s SOFT_MAX(type=f32,ne=[64,64,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 172011 runs - 5.99 us/run - 640 kB/run - 101.84 GB/s SOFT_MAX(type=f32,ne=[77,64,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 172011 runs - 5.97 us/run - 770 kB/run - 123.02 GB/s SOFT_MAX(type=f32,ne=[8192,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 172011 runs - 6.00 us/run - 64 kB/run - 10.16 GB/s SOFT_MAX(type=f32,ne=[8192,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 163820 runs - 6.12 us/run - 256 kB/run - 39.91 GB/s SOFT_MAX(type=f32,ne=[8192,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 147438 runs - 6.88 us/run - 1024 kB/run - 141.92 GB/s SOFT_MAX(type=f32,ne=[16384,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 122865 runs - 8.20 us/run - 128 kB/run - 14.89 GB/s SOFT_MAX(type=f32,ne=[16384,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 114674 runs - 8.87 us/run - 512 kB/run - 55.06 GB/s SOFT_MAX(type=f32,ne=[16384,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98292 runs - 10.24 us/run - 2048 kB/run - 190.82 GB/s SOFT_MAX(type=f32,ne=[32768,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 49146 runs - 21.37 us/run - 256 kB/run - 11.43 GB/s SOFT_MAX(type=f32,ne=[32768,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 49146 runs - 22.54 us/run - 1024 kB/run - 43.33 GB/s SOFT_MAX(type=f32,ne=[32768,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 49146 runs - 23.92 us/run - 4096 kB/run - 163.32 GB/s SOFT_MAX(type=f32,ne=[65536,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 32764 runs - 38.94 us/run - 512 kB/run - 12.54 GB/s SOFT_MAX(type=f32,ne=[65536,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 24573 runs - 41.94 us/run - 2048 kB/run - 46.57 GB/s SOFT_MAX(type=f32,ne=[65536,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 24582 runs - 43.09 us/run - 8192 kB/run - 181.32 GB/s SOFT_MAX(type=f32,ne=[131072,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 16382 runs - 74.56 us/run - 1024 kB/run - 13.10 GB/s SOFT_MAX(type=f32,ne=[131072,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 16382 runs - 79.85 us/run - 4096 kB/run - 48.92 GB/s SOFT_MAX(type=f32,ne=[131072,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 12294 runs - 82.41 us/run - 16384 kB/run - 189.64 GB/s SOFT_MAX(type=f32,ne=[262144,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 8191 runs - 145.16 us/run - 2048 kB/run - 13.46 GB/s SOFT_MAX(type=f32,ne=[262144,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 8194 runs - 155.46 us/run - 8192 kB/run - 50.26 GB/s SOFT_MAX(type=f32,ne=[262144,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 7175 runs - 160.70 us/run - 32768 kB/run - 194.56 GB/s SOFT_MAX(type=f32,ne=[524288,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 8191 runs - 285.81 us/run - 4096 kB/run - 13.67 GB/s SOFT_MAX(type=f32,ne=[524288,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 4098 runs - 306.91 us/run - 16384 kB/run - 50.92 GB/s SOFT_MAX(type=f32,ne=[524288,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 3591 runs - 317.06 us/run - 65536 kB/run - 197.32 GB/s -- After Backend 1/2: CUDA0 Device description: NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition Device memory: 97250 MB (96691 MB free) SOFT_MAX(type=f32,ne=[4096,4096,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 2236 runs - 450.67 us/run - 655360 kB/run - 1400.15 GB/s SOFT_MAX(type=f32,ne=[12888,256,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 17748 runs - 56.97 us/run - 128880 kB/run - 2161.50 GB/s SOFT_MAX(type=f32,ne=[77,4096,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 57204 runs - 18.35 us/run - 12320 kB/run - 640.36 GB/s SOFT_MAX(type=f32,ne=[1024,1024,10,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 9840 runs - 102.46 us/run - 81920 kB/run - 763.42 GB/s SOFT_MAX(type=f32,ne=[77,1024,10,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98064 runs - 10.25 us/run - 6160 kB/run - 573.43 GB/s SOFT_MAX(type=f32,ne=[256,256,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98310 runs - 10.25 us/run - 10240 kB/run - 953.21 GB/s SOFT_MAX(type=f32,ne=[64,64,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 147438 runs - 7.00 us/run - 640 kB/run - 87.26 GB/s SOFT_MAX(type=f32,ne=[77,64,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 147438 runs - 6.99 us/run - 770 kB/run - 105.05 GB/s SOFT_MAX(type=f32,ne=[8192,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 172011 runs - 6.02 us/run - 64 kB/run - 10.13 GB/s SOFT_MAX(type=f32,ne=[8192,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 163820 runs - 6.12 us/run - 256 kB/run - 39.87 GB/s SOFT_MAX(type=f32,ne=[8192,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 147438 runs - 6.91 us/run - 1024 kB/run - 141.40 GB/s SOFT_MAX(type=f32,ne=[16384,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 122865 runs - 8.20 us/run - 128 kB/run - 14.89 GB/s SOFT_MAX(type=f32,ne=[16384,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 114674 runs - 8.79 us/run - 512 kB/run - 55.54 GB/s SOFT_MAX(type=f32,ne=[16384,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98292 runs - 10.24 us/run - 2048 kB/run - 190.82 GB/s SOFT_MAX(type=f32,ne=[32768,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 131056 runs - 8.11 us/run - 256 kB/run - 30.12 GB/s SOFT_MAX(type=f32,ne=[32768,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 49146 runs - 22.54 us/run - 1024 kB/run - 43.33 GB/s SOFT_MAX(type=f32,ne=[32768,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 49146 runs - 23.32 us/run - 4096 kB/run - 167.50 GB/s SOFT_MAX(type=f32,ne=[65536,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 122865 runs - 8.19 us/run - 512 kB/run - 59.63 GB/s SOFT_MAX(type=f32,ne=[65536,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 40955 runs - 24.59 us/run - 2048 kB/run - 79.43 GB/s SOFT_MAX(type=f32,ne=[65536,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 24582 runs - 43.21 us/run - 8192 kB/run - 180.84 GB/s SOFT_MAX(type=f32,ne=[131072,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 122865 runs - 8.19 us/run - 1024 kB/run - 119.25 GB/s SOFT_MAX(type=f32,ne=[131072,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 40955 runs - 24.59 us/run - 4096 kB/run - 158.87 GB/s SOFT_MAX(type=f32,ne=[131072,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 12294 runs - 82.37 us/run - 16384 kB/run - 189.74 GB/s SOFT_MAX(type=f32,ne=[262144,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 122865 runs - 8.20 us/run - 2048 kB/run - 238.28 GB/s SOFT_MAX(type=f32,ne=[262144,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 36873 runs - 28.66 us/run - 8192 kB/run - 272.61 GB/s SOFT_MAX(type=f32,ne=[262144,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 9225 runs - 108.51 us/run - 32768 kB/run - 288.13 GB/s SOFT_MAX(type=f32,ne=[524288,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98292 runs - 10.24 us/run - 4096 kB/run - 381.65 GB/s SOFT_MAX(type=f32,ne=[524288,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 32784 runs - 31.74 us/run - 16384 kB/run - 492.43 GB/s SOFT_MAX(type=f32,ne=[524288,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 8721 runs - 121.20 us/run - 65536 kB/run - 516.19 GB/s ```
This commit is contained in:
parent
a84dfd3e10
commit
886c3668b5
|
|
@ -209,6 +209,7 @@ static __device__ float two_stage_warp_reduce_sum(float val) {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: Template to allow keeping ncols in registers if they fit
|
||||
static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __restrict__ x,
|
||||
float * __restrict__ dst,
|
||||
float * __restrict__ tmp_vals,
|
||||
|
|
@ -404,7 +405,9 @@ static void soft_max_f32_cuda(const float * x,
|
|||
launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(x, mask, sinks, dst, params, stream, block_dims, block_nums, nbytes_shared);
|
||||
} else {
|
||||
// Parallelize across SMs for top-p/dist-smapling
|
||||
if (ncols_x > 10000 && mask == nullptr && sinks == nullptr && params.scale == 1.0f && params.max_bias == 0.0f) {
|
||||
// The heuristic for parallelizing rows across SMs vs parallelizing single row & looping over all rows was done on the basis of a B6000 GPU and
|
||||
// Can be adapted further for lower-SM-count GPUs, though keeping data in registers should be implemented first as that is the optimal solution.
|
||||
if (ncols_x / (params.ne01 * params.ne02 * params.ne03) > 8192 && mask == nullptr && sinks == nullptr && params.scale == 1.0f && params.max_bias == 0.0f) {
|
||||
if (ggml_cuda_info().devices[id].supports_cooperative_launch) {
|
||||
ggml_cuda_pool_alloc<float> tmp_alloc(ctx.pool(), ggml_cuda_info().devices[id].nsm * sizeof(float));
|
||||
|
||||
|
|
|
|||
|
|
@ -8113,6 +8113,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
|
|||
}
|
||||
}
|
||||
|
||||
for (int col: {8192, 16384, 32768, 65536, 131072, 262144, 524288}) {
|
||||
for (int rows: {1, 4, 16}){
|
||||
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {col, rows, 1, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
|
||||
}
|
||||
}
|
||||
|
||||
test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, false));
|
||||
test_cases.emplace_back(new test_conv_2d_dw({512, 512, 256, 1}, {3, 3, 1, 256}, 1, 1, 1, true));
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue