From d2d814c15663fdefeb9adedc683b6f26c9aed9f0 Mon Sep 17 00:00:00 2001
From: bssrdf
Date: Sun, 9 Nov 2025 17:30:08 -0500
Subject: [PATCH] fixed a bug in calculating filter row index

---
 ggml/src/ggml-cuda/conv3d-implicit.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/conv3d-implicit.cuh b/ggml/src/ggml-cuda/conv3d-implicit.cuh
index 9cd7fe4e9b..37449f677e 100644
--- a/ggml/src/ggml-cuda/conv3d-implicit.cuh
+++ b/ggml/src/ggml-cuda/conv3d-implicit.cuh
@@ -104,7 +104,7 @@ __device__ __forceinline__ void tileMemcpySwizzleB(
         dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK_1) >> SWIZZLE_BITS_1);
         dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK_2) >> SWIZZLE_BITS_2);
         // TODO: move some checks outside of loop?
-        if (thread_row < param.k && curR < param.r && curS < param.s && curT < param.t && curC < param.c){
+        if (thread_row + blockIdx.x * TILE_ROWS < param.k && curR < param.r && curS < param.s && curT < param.t && curC < param.c){
             dst_float4[dst_index] = reinterpret_cast<const float4 *>(&src[src_index])[0];
         }else{ // read 4 halves
             dst_float4[dst_index] = make_float4(0.f, 0.f, 0.f, 0.f);
@@ -302,7 +302,7 @@ __device__ __forceinline__ void tileMemcpyLoadB(
     for (unsigned int i = 0; i < NUM_ITERS; i++){
         const unsigned int src_index = thread_row * src_stride + block_k + thread_col * 8;
         // TODO : move some checks outside of the loop
-        if (thread_row < param.k && curR < param.r && curS < param.s && curT < param.t && curC < param.c){
+        if (thread_row + blockIdx.x * TILE_ROWS < param.k && curR < param.r && curS < param.s && curT < param.t && curC < param.c){
             dst_reg[i] = reinterpret_cast<const float4 *>(&src[src_index])[0];
         }else{ // read 4 halves
             dst_reg[i] = make_float4(0.f, 0.f, 0.f, 0.f);