From 6c90c20cb1ff375c8b3afb1b0544e088ebb9725c Mon Sep 17 00:00:00 2001 From: bssrdf Date: Fri, 24 Oct 2025 15:33:57 -0400 Subject: [PATCH] WIP: bug fix --- ggml/src/ggml-cuda/conv2d-implicit.cu | 15 +++++++++++++- ggml/src/ggml-cuda/conv2d-implicit.cuh | 6 ++++-- tests/test-conv2d-implicit.cpp | 27 +++++++++++++------------- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index f08e19e9fb..de2cf4aecb 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -1081,7 +1081,11 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, tileMemcpySwizzleB(B_block_gmem, B_block_smem, weightKOffset, param); // construct const pointers to warp tiles for use inside the inner loop - +// if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x ==0 && blockIdx.y ==0){ +// for(int i = 0; i < 32; ++i) +// printf("%.2f,", __half2float(A_block_smem[i])); +// printf("\n"); +// } int offset_direction = 1; @@ -1127,6 +1131,14 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, } } } + if(threadIdx.x == 0 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){ + printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(acc_register_[0][0][0]), __half2float(acc_register_[0][0][1]), + __half2float(acc_register_[0][0][2]), __half2float(acc_register_[0][0][3])); + printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(A_register_[0][0][0]), __half2float(A_register_[0][0][1]), + __half2float(A_register_[0][0][2]), __half2float(A_register_[0][0][3])); + printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(B_register_[0][0][0]), __half2float(B_register_[0][0][1]), + __half2float(B_register_[0][0][2]), __half2float(B_register_[0][0][3])); + } if (block_k != num_block_tiles_k) @@ -1141,6 +1153,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, } } + // reuse smem half *smemoutput = shmem; const uint lane_id = threadIdx.x % WARPSIZE; diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cuh b/ggml/src/ggml-cuda/conv2d-implicit.cuh index 7d966705b8..3ea0461218 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cuh +++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh @@ -159,7 +159,8 @@ __device__ __forceinline__ void tileMemcpySwizzleA( #pragma unroll for (unsigned int i = 0; i < NUM_ITERS; i++) { - unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row; + // unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row; + unsigned int gemm_i = blockIdx.y * TILE_ROWS + thread_row; unsigned int n = fastdiv(gemm_i, param.OHOW_fastdiv); unsigned int npq_res = fastmodulo(gemm_i, param.OHOW_fastdiv); int posh_ori = fastdiv(npq_res, param.OW_fastdiv) * param.u - param.p; @@ -227,7 +228,8 @@ __device__ __forceinline__ void tileMemcpyLoadA( // const unsigned int src_index = thread_row * src_stride_vectorized + thread_col; // dst_reg[i] = src_float4[src_index]; // thread_row += ROW_STEP; - unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row; + // unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row; + unsigned int gemm_i = blockIdx.y * TILE_ROWS + thread_row; unsigned int n = fastdiv(gemm_i, param.OHOW_fastdiv); unsigned int npq_res = fastmodulo(gemm_i, param.OHOW_fastdiv); int posh_ori = fastdiv(npq_res, param.OW_fastdiv) * param.u - param.p; diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp index 3685a10d72..58cd74e7a4 100644 --- a/tests/test-conv2d-implicit.cpp +++ b/tests/test-conv2d-implicit.cpp @@ -48,7 +48,7 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu // Initialize adata std::vector adata(KW * KH * IC * OC); for (int i = 0; i < KW * KH * IC * OC; i++) { - adata[i] = 2.5f; + adata[i] = 2.f; } // Convert adata to fp16 format @@ -344,7 +344,7 @@ int main(void) // std::make_tuple(640,640,52,76), // std::make_tuple(640,640,104,152), // std::make_tuple(960,320,104,152), - std::make_tuple(160,1280,26,38), + std::make_tuple(128,1280,26,38), // std::make_tuple(1280,640,52,76), // std::make_tuple(1920,1280,26,38), // std::make_tuple(2560,1280,26,38), @@ -398,7 +398,8 @@ int main(void) struct ggml_cgraph * gf_res_1 = NULL; double run_time1; - std::vector wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1); + // std::vector wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1); + conv2d_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1); ggml_gallocr_free(allocr); @@ -419,7 +420,7 @@ int main(void) struct ggml_cgraph * gf_res_2 = NULL; double run_time2; - wino_data = compute_graph(model, allocr, build_graph_2, iterations, &run_time2); + std::vector wino_data = compute_graph(model, allocr, build_graph_2, iterations, &run_time2); if(k==0) { @@ -436,15 +437,15 @@ int main(void) // for(int i = 0; i < ggml_nelements(wino_res); i++) { - for(int i = 0; i < 26*38; i++) { - float diff = fabs(conv2d_data[i] - wino_data[i]); - // if(diff > 1.e-4) { - printf("(%f, %f, %f, %d) \n", - conv2d_data[i], - wino_data[i], diff, i); - // break; - // } - } + // for(int i = 0; i < 26*38; i++) { + // float diff = fabs(conv2d_data[i] - wino_data[i]); + // // if(diff > 1.e-4) { + // printf("(%f, %f, %f, %d) \n", + // conv2d_data[i], + // wino_data[i], diff, i); + // // break; + // // } + // } ggml_free(model.ctx); ggml_backend_buffer_free(model.buffer);