Fix missing bounds check on the batch dimension (n < param.n) in tileMemcpySwizzleA and tileMemcpyLoadA

This commit is contained in:
bssrdf 2025-11-10 16:52:46 -05:00
parent 5e1352cb60
commit d6d24487c2
1 changed file with 3 additions and 2 deletions

View File

@@ -180,7 +180,8 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
unsigned int dst_index = thread_row * TILE_COLS_VECTORIZED + thread_col; unsigned int dst_index = thread_row * TILE_COLS_VECTORIZED + thread_col;
dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK_1) >> SWIZZLE_BITS_1); dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK_1) >> SWIZZLE_BITS_1);
dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK_2) >> SWIZZLE_BITS_2); dst_index = dst_index ^ ((dst_index & SWIZZLE_MASK_2) >> SWIZZLE_BITS_2);
if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d && curC < param.c && kidx < end_k){ if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d &&
n < param.n && curC < param.c && kidx < end_k){
int inOffsetTmp = curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC; int inOffsetTmp = curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC;
dst_float4[dst_index] = reinterpret_cast<const float4 *>(&src[n * inNOffset + inOffsetTmp])[0]; dst_float4[dst_index] = reinterpret_cast<const float4 *>(&src[n * inNOffset + inOffsetTmp])[0];
} else{ } else{
@@ -249,7 +250,7 @@ __device__ __forceinline__ void tileMemcpyLoadA(
const int curW = posw_ori + curIdx.w * param.dilation0; // input w const int curW = posw_ori + curIdx.w * param.dilation0; // input w
const int curC = curIdx.x; const int curC = curIdx.x;
if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d if (curH >= 0 && curW >= 0 && curD >= 0 && curW < param.w && curH < param.h && curD < param.d
&& curC < param.c && kidx < end_k){ && n < param.n && curC < param.c && kidx < end_k){
int inOffsetTmp = curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC; int inOffsetTmp = curD * inDepthOffset + curH * inChannelOffset + curW * param.c + curC;
dst_reg[i] = reinterpret_cast<const float4 *>(&src[n * inNOffset + inOffsetTmp])[0]; dst_reg[i] = reinterpret_cast<const float4 *>(&src[n * inNOffset + inOffsetTmp])[0];
} else{ } else{