fixed anotehr bug in the special filter transpose NCHW2NHWC

This commit is contained in:
bssrdf 2025-11-16 11:14:27 -05:00
parent bccd869968
commit febee580c8
2 changed files with 51 additions and 51 deletions

View File

@ -82,7 +82,6 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co
} }
} }
//*** broken, has bugs ***
template <typename src_T, typename dst_T, const unsigned int mask, const int rs, const unsigned int blk_c> template <typename src_T, typename dst_T, const unsigned int mask, const int rs, const unsigned int blk_c>
static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, const int ne00, const int ne01){ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, const int ne00, const int ne01){
@ -108,10 +107,9 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co
const unsigned int row = (j * blk + tx) % rs; const unsigned int row = (j * blk + tx) % rs;
const unsigned int col = (j * blk + tx) / rs; const unsigned int col = (j * blk + tx) / rs;
const unsigned int src_index = imat*n + bx * blk_c * rs + j * blk + tx; const unsigned int src_index = imat*n + bx * blk_c * rs + j * blk + tx;
// const unsigned int src_index = imat*n + rs*ne00 + bx * blk_c + j * blk_c + tx;
unsigned int idx = row * blk_c + col; unsigned int idx = row * blk_c + col;
idx = idx ^ ((idx & mask) >> 4); idx = idx ^ ((idx & mask) >> 4);
if (src_index < ne) { if (src_index < ne && tx < blk) {
tile[idx] = src[src_index]; tile[idx] = src[src_index];
} }
} }
@ -122,7 +120,7 @@ static __global__ void NCHW2NHWC(const src_T *src, dst_T * dst, const int ne, co
if(dst_index < ne && tx < blk){ if(dst_index < ne && tx < blk){
unsigned int idx = j*blk_c + tx; unsigned int idx = j*blk_c + tx;
idx = idx ^ ((idx & mask) >> 4); idx = idx ^ ((idx & mask) >> 4);
dst[dst_index] = ggml_cuda_cast<dst_T>(tile[idx]); dst[dst_index] = ggml_cuda_cast<dst_T>(tile[idx]);
} }
} }
} }
@ -1340,46 +1338,47 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
// ggml_cuda_pool_alloc<half> kernel_f16(ctx.pool(id), ne); // ggml_cuda_pool_alloc<half> kernel_f16(ctx.pool(id), ne);
ggml_cuda_pool_alloc<half> kernel_f16(ctx.pool(id)); ggml_cuda_pool_alloc<half> kernel_f16(ctx.pool(id));
if (ne01 > 1){ if (ne01 > 1){
// kernel_f16.alloc(ne); kernel_f16.alloc(ne);
// dim3 dimGrid1((ne00 + CUDA_NCHW_2_NHWC_BLOCK_C - 1) / CUDA_NCHW_2_NHWC_BLOCK_C,
// (ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM, dim3 dimGrid1((ne00 + CUDA_NCHW_2_NHWC_BLOCK_C - 1) / CUDA_NCHW_2_NHWC_BLOCK_C,
// 1) ; (ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM,
// if (ne01 == 25) { 1) ;
// constexpr unsigned int mask = filter_swizzle_mask(25, CUDA_NCHW_2_NHWC_BLOCK_C); if (ne01 == 25) {
// NCHW2NHWC<half, half, mask, 25, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); constexpr unsigned int mask = filter_swizzle_mask(25, CUDA_NCHW_2_NHWC_BLOCK_C);
// } else if (ne01 == 16) { NCHW2NHWC<half, half, mask, 25, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
// constexpr unsigned int mask = filter_swizzle_mask(16, CUDA_NCHW_2_NHWC_BLOCK_C); } else if (ne01 == 16) {
// NCHW2NHWC<half, half, mask, 16, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); constexpr unsigned int mask = filter_swizzle_mask(16, CUDA_NCHW_2_NHWC_BLOCK_C);
// } else if (ne01 == 9) { NCHW2NHWC<half, half, mask, 16, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
// constexpr unsigned int mask = filter_swizzle_mask(9, CUDA_NCHW_2_NHWC_BLOCK_C); } else if (ne01 == 9) {
// NCHW2NHWC<half, half, mask, 9, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); constexpr unsigned int mask = filter_swizzle_mask(9, CUDA_NCHW_2_NHWC_BLOCK_C);
// } else if (ne01 == 8) { NCHW2NHWC<half, half, mask, 9, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
// constexpr unsigned int mask = filter_swizzle_mask(8, CUDA_NCHW_2_NHWC_BLOCK_C); } else if (ne01 == 8) {
// NCHW2NHWC<half, half, mask, 8, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); constexpr unsigned int mask = filter_swizzle_mask(8, CUDA_NCHW_2_NHWC_BLOCK_C);
// } else if (ne01 == 7) { NCHW2NHWC<half, half, mask, 8, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
// constexpr unsigned int mask = filter_swizzle_mask(7, CUDA_NCHW_2_NHWC_BLOCK_C); } else if (ne01 == 7) {
// NCHW2NHWC<half, half, mask, 7, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); constexpr unsigned int mask = filter_swizzle_mask(7, CUDA_NCHW_2_NHWC_BLOCK_C);
// } else if (ne01 == 6) { NCHW2NHWC<half, half, mask, 7, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
// constexpr unsigned int mask = filter_swizzle_mask(6, CUDA_NCHW_2_NHWC_BLOCK_C); } else if (ne01 == 6) {
// NCHW2NHWC<half, half, mask, 6, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); constexpr unsigned int mask = filter_swizzle_mask(6, CUDA_NCHW_2_NHWC_BLOCK_C);
// } else if (ne01 == 5) { NCHW2NHWC<half, half, mask, 6, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
// constexpr unsigned int mask = filter_swizzle_mask(5, CUDA_NCHW_2_NHWC_BLOCK_C); } else if (ne01 == 5) {
// NCHW2NHWC<half, half, mask, 5, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); constexpr unsigned int mask = filter_swizzle_mask(5, CUDA_NCHW_2_NHWC_BLOCK_C);
// } else if (ne01 == 4) { NCHW2NHWC<half, half, mask, 5, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
// constexpr unsigned int mask = filter_swizzle_mask(4, CUDA_NCHW_2_NHWC_BLOCK_C); } else if (ne01 == 4) {
// NCHW2NHWC<half, half, mask, 4, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); constexpr unsigned int mask = filter_swizzle_mask(4, CUDA_NCHW_2_NHWC_BLOCK_C);
// } else if (ne01 == 3) { NCHW2NHWC<half, half, mask, 4, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
// constexpr unsigned int mask = filter_swizzle_mask(3, CUDA_NCHW_2_NHWC_BLOCK_C); } else if (ne01 == 3) {
// NCHW2NHWC<half, half, mask, 3, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); constexpr unsigned int mask = filter_swizzle_mask(3, CUDA_NCHW_2_NHWC_BLOCK_C);
// } else if (ne01 == 2) { NCHW2NHWC<half, half, mask, 3, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
// constexpr unsigned int mask = filter_swizzle_mask(2, CUDA_NCHW_2_NHWC_BLOCK_C); } else if (ne01 == 2) {
// NCHW2NHWC<half, half, mask, 2, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); constexpr unsigned int mask = filter_swizzle_mask(2, CUDA_NCHW_2_NHWC_BLOCK_C);
// } else { NCHW2NHWC<half, half, mask, 2, CUDA_NCHW_2_NHWC_BLOCK_C><<<dimGrid1, CUDA_NCHW_2_NHWC_BLOCK_C, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
} else {
dim3 dimGrid2((ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM, dim3 dimGrid2((ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
(ne00 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM, (ne00 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
(ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM) ; (ne/(ne00*ne01) + CUDA_NCHW_2_NHWC_BLOCK_NM - 1) / CUDA_NCHW_2_NHWC_BLOCK_NM) ;
NCHW2NHWC<half, half><<<dimGrid2, dimBlock, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01); NCHW2NHWC<half, half><<<dimGrid2, dimBlock, 0, st>>>(K_D, kernel_f16.get(), ne, ne00, ne01);
// } }
} }
const half *X_H = input_f16.get(); const half *X_H = input_f16.get();

View File

@ -319,7 +319,7 @@ static std::vector<std::tuple<int, int, int, int, int, int>> configs = {
// std::make_tuple(960,320,104,152,3,3), // std::make_tuple(960,320,104,152,3,3),
// std::make_tuple(1280,1280,26,38,3,3), // std::make_tuple(1280,1280,26,38,3,3),
// std::make_tuple(1920,640,32,32,3,3) // std::make_tuple(1920,640,32,32,3,3)
// std::make_tuple(1280,1280,16,16,3,3), std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(32,12,141,133,3,3), // std::make_tuple(32,12,141,133,3,3),
// std::make_tuple(32,6,141,133,3,3), // std::make_tuple(32,6,141,133,3,3),
// std::make_tuple(32,12,141,121,3,3), // std::make_tuple(32,12,141,121,3,3),
@ -329,7 +329,8 @@ static std::vector<std::tuple<int, int, int, int, int, int>> configs = {
// std::make_tuple(320,12,16,16,3,3), //working // std::make_tuple(320,12,16,16,3,3), //working
// std::make_tuple(256,12,16,16,3,3), //working // std::make_tuple(256,12,16,16,3,3), //working
// std::make_tuple(32,12,16,16,3,3), //not working // std::make_tuple(32,12,16,16,3,3), //not working
std::make_tuple(16,12,16,16,3,3), //not working // std::make_tuple(16,12,16,16,3,3), //not working
// std::make_tuple(32,12,16,16,3,3), //not working
// std::make_tuple(48,12,16,16,3,3), // not working // std::make_tuple(48,12,16,16,3,3), // not working
// std::make_tuple(96,12,16,16,3,3), //not working // std::make_tuple(96,12,16,16,3,3), //not working
// std::make_tuple(64,12,16,16,3,3), //working // std::make_tuple(64,12,16,16,3,3), //working
@ -750,15 +751,15 @@ int main(void)
// int i = 2048; // int i = 2048;
// for(int i = 0; i < ggml_nelements(wino_res); i++) { // for(int i = 0; i < ggml_nelements(wino_res); i++) {
// for(int i = 0; i < 26*38; i++) { // for(int i = 0; i < 26*38; i++) {
for(int i = 0; i < conv2d_data.size(); i++) { // for(int i = 0; i < conv2d_data.size(); i++) {
float diff = fabs(im2col_data[i] - conv2d_data[i]); // float diff = fabs(im2col_data[i] - conv2d_data[i]);
// if(diff > 0.5) { // // if(diff > 0.5) {
printf("(%7.3f, %7.3f, %.2f, %d) \n", // printf("(%7.3f, %7.3f, %.2f, %d) \n",
im2col_data[i], conv2d_data[i], // im2col_data[i], conv2d_data[i],
diff, i); // diff, i);
// break; // // break;
// } // // }
} // }
ggml_free(model.ctx); ggml_free(model.ctx);
ggml_backend_buffer_free(model.buffer); ggml_backend_buffer_free(model.buffer);