From 9ef7523ee99e13fe57a3da5cd600c206ed997630 Mon Sep 17 00:00:00 2001
From: uvos <carl@uvos.xyz>
Date: Wed, 11 Mar 2026 06:04:32 +0100
Subject: [PATCH] cuda/hip: fix loop unrolling in ssm-conv (#20369)

---
 ggml/src/ggml-cuda/ssm-conv.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu
index 85e82b5a42..69985cd335 100644
--- a/ggml/src/ggml-cuda/ssm-conv.cu
+++ b/ggml/src/ggml-cuda/ssm-conv.cu
@@ -76,7 +76,7 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
     int row = tid / load_cols;
     int col = tid % load_cols;
 #pragma unroll
-    for (int idx = tid; idx < total_elems; idx += split_d_inner) {
+    for (int idx = 0; idx < total_elems; idx += split_d_inner) {
         if (row < (int)split_d_inner) {
             smem[row * n_cols + col] = x_block[row * stride_x + col];
         }
@@ -84,6 +84,9 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
         col += split_d_inner;
         row += col / load_cols;
         col  = col % load_cols;
+        if (idx >= total_elems - tid - split_d_inner) {
+            break;
+        }
     }
     __syncthreads();