From 67d3b8e84d3915f01cc7fa50a28cfbe3a4d9c974 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Mon, 17 Nov 2025 16:03:04 +0100
Subject: [PATCH] ggml : add initial cumsum implementation for CUDA

---
 ggml/src/ggml-cuda/cumsum.cu    | 69 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/cumsum.cuh   |  5 +++
 ggml/src/ggml-cuda/ggml-cuda.cu |  5 +++
 3 files changed, 79 insertions(+)
 create mode 100644 ggml/src/ggml-cuda/cumsum.cu
 create mode 100644 ggml/src/ggml-cuda/cumsum.cuh

diff --git a/ggml/src/ggml-cuda/cumsum.cu b/ggml/src/ggml-cuda/cumsum.cu
new file mode 100644
index 0000000000..041dc7cdb5
--- /dev/null
+++ b/ggml/src/ggml-cuda/cumsum.cu
@@ -0,0 +1,69 @@
+#include "cumsum.cuh"
+
+#ifdef GGML_CUDA_USE_CUB
+#include <cub/cub.cuh>
+using namespace cub;
+#endif // GGML_CUDA_USE_CUB
+
+#include <cstdint>
+
+__global__ void cumsum_f32_kernel(const float * x, float * dst, int64_t n) {
+    // TODO: this is a naive implementation just for getting something working.
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        dst[0] = x[0];
+        for (int64_t i = 1; i < n; i++) {
+            dst[i] = dst[i-1] + x[i];
+        }
+    }
+}
+
+void cumsum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
+#ifdef GGML_CUDA_USE_CUB
+    size_t tmp_size = 0;
+
+    // Query how much temp storage CUDA UnBound (CUB) needs
+    cub::DeviceScan::InclusiveSum(
+        nullptr,  // d_temp_storage (null = just query size)
+        tmp_size, // reference to size (will be set by CUB)
+        x,        // input pointer
+        dst,      // output pointer
+        ne,       // number of elements
+        stream    // CUDA stream to use
+    );
+
+    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
+
+    // Perform the inclusive scan
+    cub::DeviceScan::InclusiveSum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
+
+#else
+    GGML_UNUSED(pool);
+    cumsum_f32_kernel<<<1, 1, 0, stream>>>(x, dst, ne);
+#endif // GGML_CUDA_USE_CUB
+}
+
+void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+
+    const int64_t ne0 = src0->ne[0]; // row length (cumsum computed along this dimension)
+    const int64_t ne1 = src0->ne[1];
+    const int64_t ne2 = src0->ne[2];
+    const int64_t ne3 = src0->ne[3];
+    const int64_t nrows = ne1 * ne2 * ne3; // total number of rows
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t stream = ctx.stream();
+
+    for (int64_t i = 0; i < nrows; i++) {
+        const float * src_row = src0_d + i * ne0;
+        float * dst_row = dst_d + i * ne0;
+        cumsum_f32_cuda(pool, src_row, dst_row, ne0, stream);
+    }
+}
diff --git a/ggml/src/ggml-cuda/cumsum.cuh b/ggml/src/ggml-cuda/cumsum.cuh
new file mode 100644
index 0000000000..7fca7e1456
--- /dev/null
+++ b/ggml/src/ggml-cuda/cumsum.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+void cumsum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
+
+void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 7d792e60cf..8648e31413 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -19,6 +19,7 @@
 #include "ggml-cuda/count-equal.cuh"
 #include "ggml-cuda/cpy.cuh"
 #include "ggml-cuda/cross-entropy-loss.cuh"
+#include "ggml-cuda/cumsum.cuh"
 #include "ggml-cuda/diagmask.cuh"
"ggml-cuda/fattn.cuh" #include "ggml-cuda/getrows.cuh" @@ -2678,6 +2679,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_SUM: ggml_cuda_op_sum(ctx, dst); break; + case GGML_OP_CUMSUM: + ggml_cuda_op_cumsum(ctx, dst); + break; case GGML_OP_SUM_ROWS: ggml_cuda_op_sum_rows(ctx, dst); break; @@ -4119,6 +4123,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_POOL_2D: case GGML_OP_ACC: return true; + case GGML_OP_CUMSUM: case GGML_OP_SUM: return ggml_is_contiguous_rows(op->src[0]); case GGML_OP_ARGSORT: