diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu
index 902220b74f..ec975194cc 100644
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -913,13 +913,16 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
     //      for (unsigned int block_k = 1; block_k <= num_block_tiles_k; block_k++){
     int s = 0;
     int r = 0;
-    while (block_k < num_block_tiles_k){
-    #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+    while (block_krs < num_block_tiles_krs) {
+
         asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
-    #endif
+#else
+    while (block_k < num_block_tiles_k) {
+#endif
         __syncthreads();
 
-        // moves to the next tile
+        // moves to the next channel block tile
        int next_idx = 0;
        ++s;
        if (s == param.s) {
@@ -954,7 +957,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
        //     break;
 
        // if(thread_idx == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
-       //     printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d %d \n", s, r, block_k, next_idx, block_krs, num_block_tiles_k);
+       //     printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d, %d, %d \n", s, r, block_k, next_idx,
+       //            block_krs, num_block_tiles_k, num_block_tiles_krs);
        // }
 
        // if (block_k != num_block_tiles_k){
@@ -1044,8 +1048,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
     asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
     __syncthreads();
-    half* A_warp_tile = SA2 + A_warp_tile_offset;
-    half* B_warp_tile = SB2 + B_warp_tile_offset;
+    half* A_warp_tile = SA1 + A_warp_tile_offset;
+    half* B_warp_tile = SB1 + B_warp_tile_offset;
     ldmatrix_a(A_warp_tile, A_register_);
     ldmatrix_b(B_warp_tile, B_register_);
     // outer product between mma tiles
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 16861c71c9..7a1fe441ff 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5826,7 +5826,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
     for (uint32_t s0 : { 1, 3 }) {
         for (uint32_t p1 : { 2, 5 }) {
-            for (uint32_t Cin : { 1, 25 }) {
+            for (uint32_t Cin : { 1, 25, 32 }) {
                 for (uint32_t Cout : { 1, 12 }) {
                     for (uint32_t KH : { 1, 2, 3, 11 }) {
                         for (uint32_t KW : { 1, 2, 3, 11 }) {
@@ -5854,6 +5854,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                                               GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
     test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 1 }, { 3, 3, 128, 8},
                                               GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
+    test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 3 }, { 3, 3, 128, 8},
+                                              GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
+
 
     // sycl backend will limit task global_range < MAX_INT
 
diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp
index e3968f28b8..b87a04c858 100644
--- a/tests/test-conv2d.cpp
+++ b/tests/test-conv2d.cpp
@@ -43,7 +43,7 @@ struct ggml_cgraph * build_graph_1(const test_model&);
 void load_model(test_model & model, int ic, int oc, int iw, int ih, int kw = 3, int kh = 3, bool use_gpu = false ) {
     // create data
     int KW = kw, KH = kh, IC = ic, OC = oc;
-    int IW = iw, IH = ih, N = 1;
+    int IW = iw, IH = ih, N = 2;
 
     // srand(time(NULL));
     // printf(" input: IC = %d, OC = %d, IW = %d, IH = %d \n ", IC, OC, IW, IH);
@@ -176,12 +176,19 @@ struct ggml_cgraph * build_graph_0(const test_model& model) {
 
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    int s0 = 1;
-    int s1 = 1;
-    int p0 = 1;
-    int p1 = 1;
-    int d0 = 1;
-    int d1 = 1;
+    // int s0 = 1;
+    // int s1 = 1;
+    // int p0 = 1;
+    // int p1 = 1;
+    // int d0 = 1;
+    // int d1 = 1;
+
+    int s0 = 3;
+    int s1 = 5;
+    int p0 = 5;
+    int p1 = 5;
+    int d0 = 2;
+    int d1 = 4;
 
     // recalculate for avoid fragmentation
     struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
@@ -215,12 +222,21 @@ struct ggml_cgraph * build_graph_1(const test_model& model) {
 
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    int s0 = 1;
-    int s1 = 1;
-    int p0 = 1;
-    int p1 = 1;
-    int d0 = 1;
-    int d1 = 1;
+    // int s0 = 1;
+    // int s1 = 1;
+    // int p0 = 1;
+    // int p1 = 1;
+    // int d0 = 1;
+    // int d1 = 1;
+
+
+    int s0 = 3;
+    int s1 = 5;
+    int p0 = 5;
+    int p1 = 5;
+    int d0 = 2;
+    int d1 = 4;
+
 
     // recalculate for avoid fragmentation
     // struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
@@ -301,7 +317,8 @@ static std::vector<std::tuple<int, int, int, int, int, int>> configs = {
     // std::make_tuple(960,320,104,152,3,3),
     // std::make_tuple(1280,1280,26,38,3,3),
     // std::make_tuple(1920,640,32,32,3,3)
-    std::make_tuple(1280,1280,16,16,3,3),
+    // std::make_tuple(1280,1280,16,16,3,3),
+    std::make_tuple(32,12,141,133,3,3),
     // std::make_tuple(32,8,24,24,3,3),
     // std::make_tuple(640,640,64,64,3,3),
     // std::make_tuple(320,640,32,32,3,3),
@@ -718,12 +735,12 @@ int main(void)
 
     //     for(int i = 0; i < 26*38; i++) {
     for(int i = 0; i < conv2d_data.size(); i++) {
        float diff = fabs(im2col_data[i] - conv2d_data[i]);
-       // if(diff > 0.5) {
+       if(diff > 0.5) {
            printf("(%7.3f, %7.3f, %.2f, %d) \n", im2col_data[i], conv2d_data[i], diff, i);
-       //     break;
-       // }
+           break;
+       }
     }
 
 
     ggml_free(model.ctx);
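
Note on the `SA2`/`SB2` -> `SA1`/`SB1` change: in a two-stage `cp.async` pipeline the buffer the epilogue must read from is fixed by the parity of the number of tiles prefetched, so picking the wrong half of the double buffer replays stale data for the final tile. The sketch below is a deliberately simplified stand-in for that pattern, not the kernel's actual code: a tiled sum instead of the `ldmatrix`/MMA pipeline, plain shared-memory stores instead of `cp.async`, and all names (`double_buffered_sum`, `buf`, `TILE`) are hypothetical.

```cuda
#include <cuda_runtime.h>
#include <cstdio>

#define TILE 128

// Ping-pong double buffering: prefetch tile t+1 into one shared-memory
// buffer while tile t is consumed from the other, then swap.
__global__ void double_buffered_sum(const float * __restrict__ in,
                                    float * __restrict__ out,
                                    int num_tiles) {    // assumes num_tiles >= 1
    __shared__ float buf[2][TILE];                      // buf[0]/buf[1] ~ SA1/SA2

    const int tid = threadIdx.x;

    buf[0][tid] = in[tid];                              // prologue: prefetch tile 0
    __syncthreads();

    float acc = 0.0f;
    int   cur = 0;                                      // buffer holding the current tile
    for (int t = 0; t + 1 < num_tiles; ++t) {
        buf[cur ^ 1][tid] = in[(t + 1) * TILE + tid];   // prefetch the next tile
        acc += buf[cur][tid];                           // consume the current tile
        __syncthreads();                                // prefetch visible before the swap
        cur ^= 1;
    }

    // Epilogue: the last prefetched tile sits in buf[cur], i.e. in buffer
    // (num_tiles - 1) % 2. Reading the other buffer here returns stale
    // data -- the class of bug the SA2 -> SA1 change fixes.
    acc += buf[cur][tid];
    out[tid] = acc;
}

int main() {
    const int num_tiles = 3;   // odd tile count: the last tile lands in buf[0]
    float *in, *out;
    cudaMallocManaged(&in,  num_tiles * TILE * sizeof(float));
    cudaMallocManaged(&out, TILE * sizeof(float));
    for (int i = 0; i < num_tiles * TILE; ++i) in[i] = 1.0f;
    double_buffered_sum<<<1, TILE>>>(in, out, num_tiles);
    cudaDeviceSynchronize();
    printf("out[0] = %.1f (expected %d)\n", out[0], num_tiles);
    cudaFree(in);
    cudaFree(out);
    return 0;
}
```

The same parity argument appears to motivate the loop-bound change: on Ampere+ the loop now runs over `num_block_tiles_krs` (from the naming, presumably the combined channel x kernel-height x kernel-width tile count) rather than `num_block_tiles_k` alone, so the epilogue's buffer choice has to match the parity of that longer iteration count.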