From 6d84cbb5abc2f7f3590c9ec3c5b01496543ec593 Mon Sep 17 00:00:00 2001 From: bssrdf Date: Wed, 3 Sep 2025 15:45:09 -0400 Subject: [PATCH] Fix parameter order in conv2d_implicit and add comprehensive test cases for 2D convolution --- ggml/src/ggml-cuda/conv2d-implicit.cu | 2 +- tests/test-backend-ops.cpp | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index a78720ecc6..4f452ab98b 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -355,7 +355,7 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor * const int64_t total = B * OC * OH * OW; - param_t params = { B, IC, IH, IW, OC, KH, KW, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, OH, OW }; + param_t params = { B, IC, IH, IW, OC, KH, KW, ST_Y, ST_X, PD_Y, PD_X, DL_Y, DL_X, OH, OW }; if (kernel->type == GGML_TYPE_F16) { conv2d_implicit_cuda_f16(X_D, (half *) K_D, Y_D, params, st); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 9ab73434fe..d5e1005d2f 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -5790,6 +5790,30 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() { } } + for (uint32_t s0 : { 1, 3 }) { + for (uint32_t p1 : { 2, 5 }) { + for (uint32_t Cin : { 1, 25 }) { + for (uint32_t Cout : { 1, 12 }) { + for (uint32_t KH : { 1, 2, 3, 11 }) { + for (uint32_t KW : { 1, 2, 3, 11 }) { + for (uint32_t H : { 1, 133 }) { + for (uint32_t W : { 1, 141 }) { + if (calc_conv_output_size(W, KW, s0, p0, d0) > 0 && + calc_conv_output_size(H, KH, s1, p1, d1) > 0) { + for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + test_cases.emplace_back(new test_conv_2d_implicit( + { W, H, Cin, 2 }, { KW, KH, Cin, Cout }, kernel_type, s0, s1, p0, p1, d0, d1, false)); + } + } + } + } + } + } + } + } + } + } + // sycl backend will limit task global_range < MAX_INT // test cases for 2D im2col with large 
input W and H (occurs in stable-diffusion) // however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)