From 76885c769703478cbba8a2e1f2809c144181fae5 Mon Sep 17 00:00:00 2001 From: bssrdf Date: Fri, 7 Nov 2025 17:44:00 -0500 Subject: [PATCH] WIP: debugging --- ggml/src/ggml-cuda/conv2d-implicit.cu | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index 5307e58ed7..b02224fc06 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -677,6 +677,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, uint32_t (®_)[2] = reinterpret_cast(acc_register_[mma_m][mma_n]); uint idx = output_sts_addr + mma_m * MMA_M * BN / 2 + (mma_n - i * mma_tiles_per_warp_n/2) * MMA_N; + idx = idx ^ ((idx & 0b110000000000) >> 9); idx = idx ^ ((idx & 0b1110000000) >> 4); uint32_t* dst_ptr = reinterpret_cast(&smemoutput[idx]); dst_ptr[0] = reg_[0]; @@ -695,19 +696,24 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, const int n = fastdiv(gemm_i, param.OHOW_fastdiv); const int col = fastmodulo(gemm_i, param.OHOW_fastdiv); uint idx = output_lds_addr + subk*2 + j*32*BN/2; + idx = idx ^ ((idx & 0b110000000000) >> 9); idx = idx ^ ((idx & 0b1110000000) >> 4); - uint32_t* dst_ptr = reinterpret_cast(&smemoutput[idx]); + // uint32_t* dst_ptr = reinterpret_cast(&smemoutput[idx]); + uint32_t dst_ptr = *(reinterpret_cast(&smemoutput[idx])); + half (&res_)[2] = reinterpret_cast(dst_ptr); if (n < param.n && row < param.k && col < PQ) { if constexpr (ksplit > 0) { const uint outOffset = z * NKPQ + n * KPQ + row * PQ + col; // output[outOffset] = smemoutput[idx]; - output[outOffset] = reinterpret_cast(dst_ptr)[0]; + // output[outOffset] = reinterpret_cast(dst_ptr)[0]; + output[outOffset] = res_[0]; } else { const uint outOffset = n * KPQ + row * PQ + col; // output[outOffset] = smemoutput[idx]; - output[outOffset] = reinterpret_cast(dst_ptr)[0]; + // output[outOffset] = reinterpret_cast(dst_ptr)[0]; + output[outOffset] = res_[0]; } } if (n < param.n && row+1 < param.k && col < PQ) { @@ -716,11 +722,13 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, n * KPQ + (row+1) * PQ + col; // output[outOffset] = smemoutput[idx]; - output[outOffset] = reinterpret_cast(dst_ptr)[1]; + // output[outOffset] = reinterpret_cast(dst_ptr)[1]; + output[outOffset] = res_[1]; } else { const uint outOffset = n * KPQ + (row+1) * PQ + col; // output[outOffset] = smemoutput[idx]; - output[outOffset] = reinterpret_cast(dst_ptr)[1]; + // output[outOffset] = reinterpret_cast(dst_ptr)[1]; + output[outOffset] = res_[1]; } } }