diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index 1bf94476ab..3d086343d7 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -859,7 +859,7 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa // if (BlocksM * BlocksN < nsm && P.c >= 8 * ksplit && (P.c * P.r * P.s) % (8*ksplit) == 0) { if (BlocksM * BlocksN < nsm){ - int ks = nsm / (BlocksM * BlocksN); + int ks = min(12, nsm / (BlocksM * BlocksN)); int j; bool can_split = false; for (j = ks; j >= 2; j--){ @@ -909,7 +909,7 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa const unsigned int ksplit = 11; launch_conv2d_implicit_split_kernel(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st); - } else { + } else if(j == 12) { const unsigned int ksplit = 12; launch_conv2d_implicit_split_kernel(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st); diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index c460ca7d87..c6bdad23eb 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -653,7 +653,7 @@ int main(void) int k = 0; - for (auto c : configs_sdxl_768){ + for (auto c : configs_sdxl_1024){ // for (auto c : configs){ test_model model; load_model(model, std::get<0>(c), std::get<1>(c), std::get<2>(c),