From 206890897546bd16602c3b79394fd5ea09ef199f Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Wed, 11 Mar 2026 15:52:23 +0100 Subject: [PATCH] Enable GDN also for prefill, move TODO for chunked_GDN --- ggml/src/ggml-cuda/gated_delta_net.cu | 2 +- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu index 7c50b03960..003bdecacd 100644 --- a/ggml/src/ggml-cuda/gated_delta_net.cu +++ b/ggml/src/ggml-cuda/gated_delta_net.cu @@ -145,7 +145,7 @@ static void launch_gated_delta_net( int64_t sb1, int64_t sb2, int64_t sb3, int64_t neqk1, int64_t rq3, float scale, cudaStream_t stream) { - + //TODO: Add chunked kernel for even faster pre-fill constexpr uint32_t warp_size = ggml_cuda_get_physical_warp_size(); const int num_warps = 4; dim3 grid_dims(H, n_seqs, (S_v + num_warps - 1) / num_warps); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 8b9330d633..d709007d38 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -5001,7 +5001,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g #else // KDA is faster using the AR kernel even when n_tokens >= 512 //TODO: Add chunked kernel - return op->src[0]->ne[2] == 1 || op->src[3]->ne[0] == op->src[2]->ne[0]; + return true; #endif // GGML_USE_MUSA case GGML_OP_FLASH_ATTN_EXT: return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);