ggml : add initial cumsum implementation for CUDA

Daniel Bevenius 2025-11-17 16:03:04 +01:00
parent a3eb847d24
commit 67d3b8e84d
3 changed files with 79 additions and 0 deletions

ggml-cuda/cumsum.cu (new file)

@@ -0,0 +1,69 @@
#include "cumsum.cuh"
#ifdef GGML_CUDA_USE_CUB
#include <cub/cub.cuh>
using namespace cub;
#endif // GGML_CUDA_USE_CUB
#include <cstdint>
__global__ void cumsum_f32_kernel(const float * x, float * dst, int64_t n) {
// TODO: this is a naive implementation just for getting something working.
if (threadIdx.x == 0 && blockIdx.x == 0) {
dst[0] = x[0];
for (int64_t i = 1; i < n; i++) {
dst[i] = dst[i-1] + x[i];
}
}
}
void cumsum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
#ifdef GGML_CUDA_USE_CUB
    size_t tmp_size = 0;

    // First call with a null d_temp_storage only queries how much temporary
    // storage CUDA UnBound (CUB) needs and writes it into tmp_size.
    cub::DeviceScan::InclusiveSum(
        nullptr,  // d_temp_storage (null = just query the size)
        tmp_size, // reference to the size (set by CUB)
        x,        // input pointer
        dst,      // output pointer
        ne,       // number of elements
        stream    // CUDA stream to use
    );

    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);

    // Second call with real temporary storage performs the inclusive scan.
    cub::DeviceScan::InclusiveSum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
#else
    GGML_UNUSED(pool);
    cumsum_f32_kernel<<<1, 1, 0, stream>>>(x, dst, ne);
#endif // GGML_CUDA_USE_CUB
}
void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguously_allocated(src0));

    const float * src0_d = (const float *) src0->data;
    float       * dst_d  = (float       *) dst->data;

    const int64_t ne0 = src0->ne[0]; // row length (cumsum is computed along this dimension)
    const int64_t ne1 = src0->ne[1];
    const int64_t ne2 = src0->ne[2];
    const int64_t ne3 = src0->ne[3];

    const int64_t nrows = ne1*ne2*ne3; // total number of rows

    ggml_cuda_pool & pool   = ctx.pool();
    cudaStream_t     stream = ctx.stream();

    // Scan each row independently; rows are contiguous in memory (asserted above).
    for (int64_t i = 0; i < nrows; i++) {
        const float * src_row = src0_d + i*ne0;
        float       * dst_row = dst_d  + i*ne0;
        cumsum_f32_cuda(pool, src_row, dst_row, ne0, stream);
    }
}
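
For reference, the two-call CUB pattern above can be exercised outside of ggml. The following is a minimal standalone sketch (not part of this commit; the file name and build line are assumptions) that queries the scratch size, allocates it, runs the scan, and checks the result against the serial definition:

// cub_scan_demo.cu (hypothetical) -- build: nvcc -O2 cub_scan_demo.cu -o cub_scan_demo
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int n = 1024;
    std::vector<float> h_x(n, 1.0f); // all ones -> cumsum should be 1, 2, 3, ...
    std::vector<float> h_y(n, 0.0f);

    float * d_x = nullptr;
    float * d_y = nullptr;
    cudaMalloc(&d_x, n*sizeof(float));
    cudaMalloc(&d_y, n*sizeof(float));
    cudaMemcpy(d_x, h_x.data(), n*sizeof(float), cudaMemcpyHostToDevice);

    // pass 1: null temp storage -> CUB only reports the required scratch size
    size_t tmp_size = 0;
    cub::DeviceScan::InclusiveSum(nullptr, tmp_size, d_x, d_y, n);

    void * d_tmp = nullptr;
    cudaMalloc(&d_tmp, tmp_size);

    // pass 2: same call with real scratch -> performs the inclusive scan
    cub::DeviceScan::InclusiveSum(d_tmp, tmp_size, d_x, d_y, n);

    cudaMemcpy(h_y.data(), d_y, n*sizeof(float), cudaMemcpyDeviceToHost);

    bool ok = true;
    for (int i = 0; i < n; i++) {
        ok = ok && h_y[i] == (float)(i + 1);
    }
    printf("inclusive scan %s\n", ok ? "OK" : "FAILED");

    cudaFree(d_tmp);
    cudaFree(d_y);
    cudaFree(d_x);
    return ok ? 0 : 1;
}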

ggml-cuda/cumsum.cuh (new file)

@@ -0,0 +1,5 @@
#include "common.cuh"

void cumsum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);

void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
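
When testing this backend path, a scalar CPU reference is handy. Below is a minimal sketch (a hypothetical helper, not part of this commit) that mirrors the per-row semantics of ggml_cuda_op_cumsum:

// naive CPU reference: inclusive prefix sum along each contiguous row
static void cumsum_f32_ref(const float * x, float * dst, int64_t ne0, int64_t nrows) {
    for (int64_t r = 0; r < nrows; r++) {
        float acc = 0.0f;
        for (int64_t i = 0; i < ne0; i++) {
            acc += x[r*ne0 + i];
            dst[r*ne0 + i] = acc; // dst[i] = x[0] + ... + x[i] within the row
        }
    }
}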

ggml-cuda/ggml-cuda.cu

@@ -19,6 +19,7 @@
#include "ggml-cuda/count-equal.cuh"
#include "ggml-cuda/cpy.cuh"
#include "ggml-cuda/cross-entropy-loss.cuh"
#include "ggml-cuda/cumsum.cuh"
#include "ggml-cuda/diagmask.cuh"
#include "ggml-cuda/fattn.cuh"
#include "ggml-cuda/getrows.cuh"
@@ -2678,6 +2679,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SUM:
            ggml_cuda_op_sum(ctx, dst);
            break;
        case GGML_OP_CUMSUM:
            ggml_cuda_op_cumsum(ctx, dst);
            break;
        case GGML_OP_SUM_ROWS:
            ggml_cuda_op_sum_rows(ctx, dst);
            break;
@@ -4119,6 +4123,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_POOL_2D:
        case GGML_OP_ACC:
            return true;
        case GGML_OP_CUMSUM:
        case GGML_OP_SUM:
            return ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_ARGSORT:
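
With the dispatch and supports_op entries in place, the op is reachable through the regular graph API. A hedged end-to-end sketch (not part of this commit; error handling elided, and it assumes the standard ggml backend headers and a working CUDA device 0):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cuda.h"

int main(void) {
    // small context that only holds tensor/graph metadata (no_alloc = true)
    struct ggml_init_params ip = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_cumsum(ctx, a); // creates a GGML_OP_CUMSUM node

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    ggml_backend_t backend = ggml_backend_cuda_init(0 /* device */);
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    const float x[4] = {1, 2, 3, 4};
    ggml_backend_tensor_set(a, x, 0, sizeof(x));

    ggml_backend_graph_compute(backend, gf); // dispatches to ggml_cuda_op_cumsum

    float y[4];
    ggml_backend_tensor_get(c, y, 0, sizeof(y)); // expected: 1, 3, 6, 10

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}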