Add TODOs to and adjust heuristics of row-wise soft_max in CUDA
Heuristics were selected based on the following numbers:
```
-- Before
Backend 1/2: CUDA0
Device description: NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition
Device memory: 97250 MB (96691 MB free)
SOFT_MAX(type=f32,ne=[4096,4096,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 2236 runs - 450.34 us/run - 655360 kB/run - 1401.20 GB/s
SOFT_MAX(type=f32,ne=[12888,256,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 17748 runs - 56.80 us/run - 128880 kB/run - 2168.19 GB/s
SOFT_MAX(type=f32,ne=[77,4096,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 57204 runs - 18.35 us/run - 12320 kB/run - 640.57 GB/s
SOFT_MAX(type=f32,ne=[1024,1024,10,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 9840 runs - 102.46 us/run - 81920 kB/run - 763.45 GB/s
SOFT_MAX(type=f32,ne=[77,1024,10,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98064 runs - 10.25 us/run - 6160 kB/run - 573.43 GB/s
SOFT_MAX(type=f32,ne=[256,256,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98310 runs - 10.25 us/run - 10240 kB/run - 953.20 GB/s
SOFT_MAX(type=f32,ne=[64,64,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 172011 runs - 5.99 us/run - 640 kB/run - 101.84 GB/s
SOFT_MAX(type=f32,ne=[77,64,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 172011 runs - 5.97 us/run - 770 kB/run - 123.02 GB/s
SOFT_MAX(type=f32,ne=[8192,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 172011 runs - 6.00 us/run - 64 kB/run - 10.16 GB/s
SOFT_MAX(type=f32,ne=[8192,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 163820 runs - 6.12 us/run - 256 kB/run - 39.91 GB/s
SOFT_MAX(type=f32,ne=[8192,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 147438 runs - 6.88 us/run - 1024 kB/run - 141.92 GB/s
SOFT_MAX(type=f32,ne=[16384,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 122865 runs - 8.20 us/run - 128 kB/run - 14.89 GB/s
SOFT_MAX(type=f32,ne=[16384,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 114674 runs - 8.87 us/run - 512 kB/run - 55.06 GB/s
SOFT_MAX(type=f32,ne=[16384,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98292 runs - 10.24 us/run - 2048 kB/run - 190.82 GB/s
SOFT_MAX(type=f32,ne=[32768,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 49146 runs - 21.37 us/run - 256 kB/run - 11.43 GB/s
SOFT_MAX(type=f32,ne=[32768,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 49146 runs - 22.54 us/run - 1024 kB/run - 43.33 GB/s
SOFT_MAX(type=f32,ne=[32768,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 49146 runs - 23.92 us/run - 4096 kB/run - 163.32 GB/s
SOFT_MAX(type=f32,ne=[65536,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 32764 runs - 38.94 us/run - 512 kB/run - 12.54 GB/s
SOFT_MAX(type=f32,ne=[65536,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 24573 runs - 41.94 us/run - 2048 kB/run - 46.57 GB/s
SOFT_MAX(type=f32,ne=[65536,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 24582 runs - 43.09 us/run - 8192 kB/run - 181.32 GB/s
SOFT_MAX(type=f32,ne=[131072,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 16382 runs - 74.56 us/run - 1024 kB/run - 13.10 GB/s
SOFT_MAX(type=f32,ne=[131072,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 16382 runs - 79.85 us/run - 4096 kB/run - 48.92 GB/s
SOFT_MAX(type=f32,ne=[131072,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 12294 runs - 82.41 us/run - 16384 kB/run - 189.64 GB/s
SOFT_MAX(type=f32,ne=[262144,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 8191 runs - 145.16 us/run - 2048 kB/run - 13.46 GB/s
SOFT_MAX(type=f32,ne=[262144,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 8194 runs - 155.46 us/run - 8192 kB/run - 50.26 GB/s
SOFT_MAX(type=f32,ne=[262144,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 7175 runs - 160.70 us/run - 32768 kB/run - 194.56 GB/s
SOFT_MAX(type=f32,ne=[524288,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 8191 runs - 285.81 us/run - 4096 kB/run - 13.67 GB/s
SOFT_MAX(type=f32,ne=[524288,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 4098 runs - 306.91 us/run - 16384 kB/run - 50.92 GB/s
SOFT_MAX(type=f32,ne=[524288,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 3591 runs - 317.06 us/run - 65536 kB/run - 197.32 GB/s
-- After
Backend 1/2: CUDA0
Device description: NVIDIA RTX PRO 6000 Blackwell Max-Q Workstation Edition
Device memory: 97250 MB (96691 MB free)
SOFT_MAX(type=f32,ne=[4096,4096,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 2236 runs - 450.67 us/run - 655360 kB/run - 1400.15 GB/s
SOFT_MAX(type=f32,ne=[12888,256,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 17748 runs - 56.97 us/run - 128880 kB/run - 2161.50 GB/s
SOFT_MAX(type=f32,ne=[77,4096,5,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 57204 runs - 18.35 us/run - 12320 kB/run - 640.36 GB/s
SOFT_MAX(type=f32,ne=[1024,1024,10,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 9840 runs - 102.46 us/run - 81920 kB/run - 763.42 GB/s
SOFT_MAX(type=f32,ne=[77,1024,10,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98064 runs - 10.25 us/run - 6160 kB/run - 573.43 GB/s
SOFT_MAX(type=f32,ne=[256,256,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98310 runs - 10.25 us/run - 10240 kB/run - 953.21 GB/s
SOFT_MAX(type=f32,ne=[64,64,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 147438 runs - 7.00 us/run - 640 kB/run - 87.26 GB/s
SOFT_MAX(type=f32,ne=[77,64,20,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 147438 runs - 6.99 us/run - 770 kB/run - 105.05 GB/s
SOFT_MAX(type=f32,ne=[8192,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 172011 runs - 6.02 us/run - 64 kB/run - 10.13 GB/s
SOFT_MAX(type=f32,ne=[8192,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 163820 runs - 6.12 us/run - 256 kB/run - 39.87 GB/s
SOFT_MAX(type=f32,ne=[8192,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 147438 runs - 6.91 us/run - 1024 kB/run - 141.40 GB/s
SOFT_MAX(type=f32,ne=[16384,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 122865 runs - 8.20 us/run - 128 kB/run - 14.89 GB/s
SOFT_MAX(type=f32,ne=[16384,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 114674 runs - 8.79 us/run - 512 kB/run - 55.54 GB/s
SOFT_MAX(type=f32,ne=[16384,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98292 runs - 10.24 us/run - 2048 kB/run - 190.82 GB/s
SOFT_MAX(type=f32,ne=[32768,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 131056 runs - 8.11 us/run - 256 kB/run - 30.12 GB/s
SOFT_MAX(type=f32,ne=[32768,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 49146 runs - 22.54 us/run - 1024 kB/run - 43.33 GB/s
SOFT_MAX(type=f32,ne=[32768,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 49146 runs - 23.32 us/run - 4096 kB/run - 167.50 GB/s
SOFT_MAX(type=f32,ne=[65536,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 122865 runs - 8.19 us/run - 512 kB/run - 59.63 GB/s
SOFT_MAX(type=f32,ne=[65536,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 40955 runs - 24.59 us/run - 2048 kB/run - 79.43 GB/s
SOFT_MAX(type=f32,ne=[65536,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 24582 runs - 43.21 us/run - 8192 kB/run - 180.84 GB/s
SOFT_MAX(type=f32,ne=[131072,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 122865 runs - 8.19 us/run - 1024 kB/run - 119.25 GB/s
SOFT_MAX(type=f32,ne=[131072,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 40955 runs - 24.59 us/run - 4096 kB/run - 158.87 GB/s
SOFT_MAX(type=f32,ne=[131072,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 12294 runs - 82.37 us/run - 16384 kB/run - 189.74 GB/s
SOFT_MAX(type=f32,ne=[262144,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 122865 runs - 8.20 us/run - 2048 kB/run - 238.28 GB/s
SOFT_MAX(type=f32,ne=[262144,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 36873 runs - 28.66 us/run - 8192 kB/run - 272.61 GB/s
SOFT_MAX(type=f32,ne=[262144,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 9225 runs - 108.51 us/run - 32768 kB/run - 288.13 GB/s
SOFT_MAX(type=f32,ne=[524288,1,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 98292 runs - 10.24 us/run - 4096 kB/run - 381.65 GB/s
SOFT_MAX(type=f32,ne=[524288,4,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 32784 runs - 31.74 us/run - 16384 kB/run - 492.43 GB/s
SOFT_MAX(type=f32,ne=[524288,16,1,1],mask=0,sinks=0,m_prec=f32,nr23=[1,1],scale=1.000000,max_bias=0.000000,inplace=0): 8721 runs - 121.20 us/run - 65536 kB/run - 516.19 GB/s
```