From 07b809bbc0d517f068f63a66e181dedb8367579e Mon Sep 17 00:00:00 2001
From: Oliver Simons
Date: Fri, 12 Dec 2025 15:07:28 +0100
Subject: [PATCH] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Johannes Gäßler
---
 ggml/src/ggml-cuda/cumsum.cu  | 12 ++++++------
 ggml/src/ggml-cuda/softmax.cu | 10 ++++++++--
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cuda/cumsum.cu b/ggml/src/ggml-cuda/cumsum.cu
index 27e185bc58..1463bfa4f0 100644
--- a/ggml/src/ggml-cuda/cumsum.cu
+++ b/ggml/src/ggml-cuda/cumsum.cu
@@ -160,11 +160,11 @@ static void cumsum_cub(ggml_cuda_pool & pool,
 
     // Query how much temp storage CUDA UnBound (CUB) needs
     cub::DeviceScan::InclusiveSum(nullptr,   // d_temp_storage (null = just query size)
-                                  tmp_size, // reference to size (will be set by CUB)
-                                  src, // input pointer
-                                  dst, // output pointer
-                                  ne, // number of elements
-                                  stream // CUDA stream to use
+                                  tmp_size,  // reference to size (will be set by CUB)
+                                  src,       // input pointer
+                                  dst,       // output pointer
+                                  ne,        // number of elements
+                                  stream     // CUDA stream to use
     );
 
     ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
@@ -190,7 +190,7 @@ static void cumsum_cuda(
 
     if (is_contiguous) {
         use_cub = true;
-        int64_t nrows = ne01 * ne02 * ne03;
+        const int64_t nrows = ne01 * ne02 * ne03;
         // TODO: Compare with DeviceSegmentedScan::InclusiveSegmentedSum for nrows > 1 once InclusiveSegmentedSum is released
         // Heuristics were determined as part of https://github.com/ggml-org/llama.cpp/pull/17004
         if (((nrows == 1) && (ne00 > 1024)) || (ne00 / nrows > 4096)) {
diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu
index b2d7336af5..4dffb1c168 100644
--- a/ggml/src/ggml-cuda/softmax.cu
+++ b/ggml/src/ggml-cuda/softmax.cu
@@ -232,10 +232,12 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __restrict__ x,
 
     // Compute thread-local max
     for (int col = col_start; col < p.ncols;) {
+#pragma unroll
         for (int i = 0; i < n_elem_per_thread; i++) {
             const int idx = col + i * step_size;
             local_vals[i] = idx < p.ncols ? x[idx] : -INFINITY;
         }
+#pragma unroll
         for (int i = 0; i < n_elem_per_thread; i++) {
             local_max = fmaxf(local_max, local_vals[i]);
         }
@@ -263,10 +265,12 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __restrict__ x,
     // Compute softmax dividends, accumulate divisor
     float tmp_expf = 0.0f;
     for (int col = col_start; col < p.ncols;) {
+#pragma unroll
         for (int i = 0; i < n_elem_per_thread; i++) {
             const int idx = col + i * step_size;
             local_vals[i] = idx < p.ncols ? x[idx] : -INFINITY;
         }
+#pragma unroll
         for (int i = 0; i < n_elem_per_thread; i++) {
             const int idx = col + i * step_size;
             if (idx < p.ncols) {
@@ -297,10 +301,12 @@ static __device__ void soft_max_f32_parallelize_cols_single_row(const float * __restrict__ x,
 
     // Divide dividend by global sum + store data
    for (int col = col_start; col < p.ncols;) {
+#pragma unroll
         for (int i = 0; i < n_elem_per_thread; i++) {
             const int idx = col + i * step_size;
             local_vals[i] = idx < p.ncols ? dst[idx] : -INFINITY;
         }
+#pragma unroll
         for (int i = 0; i < n_elem_per_thread; i++) {
             const int idx = col + i * step_size;
             if (idx < p.ncols) {
@@ -367,7 +373,7 @@ static void launch_soft_max_kernels(const float * x, const T * mask, const float
         soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>(x, mask, sinks, dst, p);
 }
 
-static __global__ void soft_max_f32_parallelize_cols(const float * __restrict__ x,
+__launch_bounds__(8*WARP_SIZE, 1) static __global__ void soft_max_f32_parallelize_cols(const float * __restrict__ x,
                                                      float * __restrict__ dst,
                                                      float * __restrict__ tmp_vals,
                                                      const soft_max_params p)
@@ -408,7 +414,7 @@ static void soft_max_f32_cuda(const float * x,
     if (nbytes_shared <= smpbo) {
         launch_soft_max_kernels<32, 64, 128, 256, 512, 1024, 2048, 4096>(x, mask, sinks, dst, params, stream, block_dims, block_nums, nbytes_shared);
     } else {
-        // Parallelize across SMs for top-p/dist-smapling
+        // Parallelize across SMs for top-p/dist-sampling
         // The heuristic for parallelizing rows across SMs vs parallelizing single row & looping over all rows was done on the basis of a B6000 GPU and
         // Can be adapted further for lower-SM-count GPUs, though keeping data in registers should be implemented first as that is the optimal solution.
         if (ggml_cuda_info().devices[id].supports_cooperative_launch && ncols_x / (params.ne01 * params.ne02 * params.ne03) > 8192 && mask == nullptr && sinks == nullptr && params.scale == 1.0f && params.max_bias == 0.0f) {
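
The two-phase call into cub::DeviceScan::InclusiveSum touched in cumsum.cu is the standard CUB idiom: every device-wide CUB algorithm is invoked once with a null d_temp_storage pointer, which only writes the required scratch size into tmp_size, and then a second time with the allocated buffer to perform the actual scan. Below is a minimal standalone sketch of that pattern, using plain cudaMalloc in place of ggml's pool allocator; array contents and sizes are arbitrary example values, not taken from the patch.

    // two_phase_scan.cu -- compile with: nvcc two_phase_scan.cu
    #include <cub/cub.cuh>
    #include <cstdio>

    int main() {
        const int   ne        = 8;  // number of elements (arbitrary example size)
        const float h_src[ne] = {1, 2, 3, 4, 5, 6, 7, 8};

        float *d_src, *d_dst;
        cudaMalloc(&d_src, ne * sizeof(float));
        cudaMalloc(&d_dst, ne * sizeof(float));
        cudaMemcpy(d_src, h_src, ne * sizeof(float), cudaMemcpyHostToDevice);

        // Phase 1: null temp-storage pointer = only query the required size.
        void * d_tmp    = nullptr;
        size_t tmp_size = 0;
        cub::DeviceScan::InclusiveSum(d_tmp, tmp_size, d_src, d_dst, ne);

        // Phase 2: allocate the requested scratch space, then run the scan.
        cudaMalloc(&d_tmp, tmp_size);
        cub::DeviceScan::InclusiveSum(d_tmp, tmp_size, d_src, d_dst, ne);

        float h_dst[ne];
        cudaMemcpy(h_dst, d_dst, ne * sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < ne; i++) {
            printf("%g ", h_dst[i]);  // expected: 1 3 6 10 15 21 28 36
        }
        printf("\n");

        cudaFree(d_src);
        cudaFree(d_dst);
        cudaFree(d_tmp);
        return 0;
    }

ggml's pool allocator plays the role of the second cudaMalloc here, with the advantage that the scratch buffer is recycled across calls instead of being allocated and freed each time.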
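
The kernel-side changes in softmax.cu are likewise standard CUDA attributes: #pragma unroll in front of a loop whose trip count (n_elem_per_thread) is a compile-time constant asks nvcc to fully unroll it, and __launch_bounds__(max_threads_per_block, min_blocks_per_multiprocessor) lets the compiler budget registers for at most 8*WARP_SIZE = 256 threads while guaranteeing residency for only one block per SM. A tiny self-contained illustration follows, built around a hypothetical scale_kernel that is not part of the patch.

    // launch_bounds_demo.cu -- compile with: nvcc launch_bounds_demo.cu
    #define WARP_SIZE 32

    // Register budget assumes at most 8*WARP_SIZE threads per block;
    // the compiler only has to keep 1 block resident per SM.
    __launch_bounds__(8*WARP_SIZE, 1) static __global__ void scale_kernel(float * data, int n) {
        constexpr int n_elem_per_thread = 4;
        const int base = (blockIdx.x * blockDim.x + threadIdx.x) * n_elem_per_thread;
    #pragma unroll  // trip count is a compile-time constant, so this unrolls fully
        for (int i = 0; i < n_elem_per_thread; i++) {
            const int idx = base + i;
            if (idx < n) {
                data[idx] *= 2.0f;
            }
        }
    }

    int main() {
        const int n = 1024;
        float * d_data;
        cudaMalloc(&d_data, n * sizeof(float));
        cudaMemset(d_data, 0, n * sizeof(float));
        // One block of 8*WARP_SIZE threads covers 256 * 4 = 1024 elements.
        scale_kernel<<<1, 8*WARP_SIZE>>>(d_data, n);
        cudaDeviceSynchronize();
        cudaFree(d_data);
        return 0;
    }

Passing 1 as the second __launch_bounds__ argument trades occupancy for a larger per-thread register budget, which suits a kernel that parallelizes a single row across the whole GPU, as the comment in soft_max_f32_cuda about keeping data in registers suggests.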