From 3eb340ed4b54e9b469a45a01922d5007937eb44f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?= <sszymczy@gmail.com>
Date: Sun, 15 Mar 2026 12:53:03 +0100
Subject: [PATCH] ggml : add CUDA Hadamard transformation implementation
 (borrowed from ik_llama.cpp)

---
 ggml/src/ggml-cuda/ggml-cuda.cu |  7 ++-
 ggml/src/ggml-cuda/hadamard.cu  | 90 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/hadamard.cuh |  3 ++
 3 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 ggml/src/ggml-cuda/hadamard.cu
 create mode 100644 ggml/src/ggml-cuda/hadamard.cuh

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index cda275b8c5..6a091a6d8a 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -61,6 +61,7 @@
 #include "ggml-cuda/tri.cuh"
 #include "ggml-cuda/cumsum.cuh"
 #include "ggml-cuda/fill.cuh"
+#include "ggml-cuda/hadamard.cuh"
 #include "ggml.h"
 
 #include <algorithm>
@@ -2771,6 +2772,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
         case GGML_OP_FILL:
             ggml_cuda_op_fill(ctx, dst);
             break;
+        case GGML_OP_HADAMARD:
+            ggml_cuda_op_hadamard(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -5013,7 +5017,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
         case GGML_OP_DIAG:
         case GGML_OP_SOLVE_TRI:
             return true;
-
+        case GGML_OP_HADAMARD:
+            return (op->ne[0] == 64 || op->ne[0] == 128 || op->ne[0] == 256) && op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32;
         default:
             return false;
     }
diff --git a/ggml/src/ggml-cuda/hadamard.cu b/ggml/src/ggml-cuda/hadamard.cu
new file mode 100644
index 0000000000..5f34d2579d
--- /dev/null
+++ b/ggml/src/ggml-cuda/hadamard.cu
@@ -0,0 +1,90 @@
+// Copyright (C) 2025 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+
+#include "hadamard.cuh"
+
+#include <cstdint>
+
+// Fast Walsh-Hadamard transform of contiguous nh-element sub-blocks along
+// dim 0, scaled by 1/sqrt(nh) (orthonormal convention).
+// Launch config: grid = (nc*ne1, ne2, ne3), block = nh/2 threads
+// (one butterfly per thread); uses nh floats of static shared memory.
+// nh must be a power of two; 64/128/256 are instantiated below.
+template <int nh>
+static __global__ void hadamard_f32(const char * src, char * dst, int ne0,
+        size_t nb01, size_t nb02, size_t nb03, size_t nb1, size_t nb2, size_t nb3) {
+
+    constexpr float ksqrt2 = 0.707106781f; // 1/sqrt(2)
+
+    int nc  = ne0/nh;   // nh-sized sub-blocks per row
+    int ii1 = blockIdx.x;
+    int i1  = ii1 / nc; // row index
+    int ic  = ii1 % nc; // sub-block index within the row
+    int i2  = blockIdx.y;
+    int i3  = blockIdx.z;
+
+    int tid = threadIdx.x;
+
+    const float * x = (const float *)(src + i1*nb01 + i2*nb02 + i3*nb03) + ic*nh;
+    float       * y = (float       *)(dst + i1*nb1  + i2*nb2  + i3*nb3 ) + ic*nh;
+
+    __shared__ float ys[nh];
+
+    // First butterfly stage (h = 1), reading straight from global memory.
+    ys[2*tid+0] = x[2*tid+0] + x[2*tid+1];
+    ys[2*tid+1] = x[2*tid+0] - x[2*tid+1];
+
+    float scale = ksqrt2;
+
+    // Remaining stages: h must double every stage (h <<= 1). h <<= 2 would
+    // skip every other stage (and half the 1/sqrt(2) scale factors), giving
+    // wrong results for every nh > 4.
+#pragma unroll
+    for (int h = 2; h < nh; h <<= 1) {
+        __syncthreads();
+        int ii = tid/h, jj = tid%h;
+        int j = 2*h*ii + jj;
+        float u = ys[j], v = ys[j+h];
+        ys[j+0] = u + v;
+        ys[j+h] = u - v;
+        scale *= ksqrt2;
+    }
+
+    __syncthreads();
+    y[2*tid+0] = ys[2*tid+0] * scale;
+    y[2*tid+1] = ys[2*tid+1] * scale;
+}
+
+// Dispatch to the right kernel instantiation; nh/2 threads per block (one
+// thread per butterfly). Aborts on block sizes that were not instantiated.
+static void hadamard_f32_cuda(int nh, const char * x, char * y, int ne0, int ne1, int ne2, int ne3,
+        size_t nb01, size_t nb02, size_t nb03, size_t nb1, size_t nb2, size_t nb3, cudaStream_t stream) {
+    int nc = ne0/nh;
+    int nrows = nc*ne1;
+    dim3 num_blocks = dim3(nrows, ne2, ne3);
+    switch (nh) {
+        case  64: hadamard_f32< 64><<<num_blocks,  32, 0, stream>>>(x, y, ne0, nb01, nb02, nb03, nb1, nb2, nb3); break;
+        case 128: hadamard_f32<128><<<num_blocks,  64, 0, stream>>>(x, y, ne0, nb01, nb02, nb03, nb1, nb2, nb3); break;
+        case 256: hadamard_f32<256><<<num_blocks, 128, 0, stream>>>(x, y, ne0, nb01, nb02, nb03, nb1, nb2, nb3); break;
+        default: GGML_ABORT("Unsupported Hadamard block size");
+    }
+}
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#include <intrin.h>
+static inline int popcount(uint32_t x) { return __popcnt(x); }
+#else
+static inline int popcount(uint32_t x) { return __builtin_popcount(x); }
+#endif
+
+void ggml_cuda_op_hadamard(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src = dst->src[0];
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_are_same_shape(src, dst));
+
+    int nh = dst->op_params[0]; // Hadamard sub-block size
+    GGML_ASSERT(dst->ne[0]%nh == 0);
+    GGML_ASSERT(nh > 1 && popcount(nh) == 1); // nh must be a power of two
+
+    hadamard_f32_cuda(nh, (const char *)src->data, (char *)dst->data, src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+            src->nb[1], src->nb[2], src->nb[3], dst->nb[1], dst->nb[2], dst->nb[3], ctx.stream());
+}
diff --git a/ggml/src/ggml-cuda/hadamard.cuh b/ggml/src/ggml-cuda/hadamard.cuh
new file mode 100644
index 0000000000..17b3ac9468
--- /dev/null
+++ b/ggml/src/ggml-cuda/hadamard.cuh
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_op_hadamard(ggml_backend_cuda_context & ctx, ggml_tensor * dst);