make sure there are enough channels for split-k

2025-11-06 10:21:49 -05:00 · 2025-11-06 10:21:49 -05:00 · 311213d209
parent 09e3a5f07d
commit 311213d209
2 changed files with 3 additions and 3 deletions
--- a/ggml/src/ggml-cuda/conv2d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv2d-implicit.cu
@@ -991,9 +991,8 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
        const unsigned int shmem_bytes = (BM_dim * BK_dim + BK_dim * BN_dim) * 2 * sizeof(half);

        const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
-
-        if (BlocksM * BlocksN < nsm) {
-            const unsigned int ksplit = 8;
+        const unsigned int ksplit = 8;
+        if (BlocksM * BlocksN < nsm && P.c > 8 * ksplit) {
            ggml_cuda_pool_alloc<half> Y_H(ctx.pool(id), ksplit * P.k * P.Oh * P.Ow * P.n);

            cudaFuncSetAttribute(conv2d_implicit_kernel<BM_dim, BN_dim, BK_dim, WM_dim, WN_dim, WK_dim, ksplit, NumThreads>,
--- a/tests/test-conv2d.cpp
+++ b/tests/test-conv2d.cpp
@@ -324,6 +324,7 @@ int main(void)
        std::make_tuple(256,128,832,1216,3,3),
        std::make_tuple(256,256,832,1216,3,3),
        // std::make_tuple(320,256,1024,1920)
+        // std::make_tuple(32,64,58,58,3,3)
    };

    int k = 0;