cuda: fix race condition in cumsum (#18448)

* ggml-cuda: fix race condition in cumsum * remove unnecessary __syncthreads()
2025-12-29 14:07:17 +08:00 · 2025-12-29 14:07:17 +08:00 · 5fa66c6e67
parent 382808c14b
commit 5fa66c6e67
1 changed file with 5 additions and 3 deletions
--- a/ggml/src/ggml-cuda/cumsum.cu
+++ b/ggml/src/ggml-cuda/cumsum.cu
@ -61,7 +61,7 @@ static __global__ void cumsum_cub_kernel(

        // Add offset to each item and store
        T thread_offset = thread_prefix - thread_sum + block_carry;
-        #pragma unroll
+#pragma unroll
        for (int i = 0; i < UNROLL_FACTOR; i++) {
            int64_t idx = start + tid * UNROLL_FACTOR + i;
            if (idx < ne00) {
@ -69,11 +69,12 @@ static __global__ void cumsum_cub_kernel(
            }
        }

+        __syncthreads();
+
        // Update carry for next tile
        if (tid == 0) {
            block_carry += block_total;
        }
-        __syncthreads();
    }
 #else
    NO_DEVICE_CODE;
@ -175,11 +176,12 @@ static __global__ void cumsum_kernel(
            }
        }

+        __syncthreads();
+
        // Update carry for next chunk
        if (tid == 0) {
            *s_carry += *s_chunk_total;
        }
-        __syncthreads();
    }
 }