From 3308ccef918833ad60a3217961a94b393c4ca562 Mon Sep 17 00:00:00 2001
From: bssrdf <merlintiger@hotmail.com>
Date: Sun, 2 Nov 2025 17:30:36 -0500
Subject: [PATCH] conv3d WIP: enabled tensor core path

---
 ggml/src/ggml-cuda/conv3d-implicit.cu |  8 ++++----
 tests/test-conv3d.cpp                 | 16 ++++++++--------
 2 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/ggml/src/ggml-cuda/conv3d-implicit.cu b/ggml/src/ggml-cuda/conv3d-implicit.cu
index 4f01dfea8d..76f887972a 100644
--- a/ggml/src/ggml-cuda/conv3d-implicit.cu
+++ b/ggml/src/ggml-cuda/conv3d-implicit.cu
@@ -1007,9 +1007,9 @@ static void conv3d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
 
         int id = ggml_cuda_get_device();
 
-        int64_t ne = P.c * P.h * P.w * P.n;
+        int64_t ne = P.c * P.d * P.h * P.w * P.n;
         int64_t ne00 = P.c;
-        int64_t ne01 = P.h * P.w;
+        int64_t ne01 = P.h * P.w * P.d;
         ggml_cuda_pool_alloc<half> input_f16(ctx.pool(id), ne);
 
         dim3 dimGrid( (ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
@@ -1018,8 +1018,8 @@ static void conv3d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
         dim3 dimBlock(CUDA_NCHW_2_NHWC_TILE_DIM,CUDA_NCHW_2_NHWC_BLOCK_ROWS, 1);
         NCHW2NHWC<float, half><<<dimGrid, dimBlock, 0, st>>>(X_D, input_f16.get(), ne, ne00, ne01);
 
-        ne = P.c * P.r * P.s * P.k;
-        ne01 = P.r * P.s;
+        ne = P.c * P.r * P.s * P.t * P.k;
+        ne01 = P.r * P.s * P.t;
         ggml_cuda_pool_alloc<half> kernel_f16(ctx.pool(id), ne);
         dim3 dimGrid1((ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
                       (ne00 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
diff --git a/tests/test-conv3d.cpp b/tests/test-conv3d.cpp
index 8b19f05c39..53e37efd31 100644
--- a/tests/test-conv3d.cpp
+++ b/tests/test-conv3d.cpp
@@ -323,13 +323,13 @@ int main(void)
         // std::make_tuple(960,320,104,152,3,3),
         // std::make_tuple(1280,1280,26,38,3,3),
         std::make_tuple(320,1280,26,38,8,3,3,3),
-        // std::make_tuple(1280,1280,26,38,8,3,3,3),
-        // std::make_tuple(320,1280,52,76,8,3,3,3),
-        // std::make_tuple(1280,1280,52,76,8,3,3,3),
-        // std::make_tuple(320,1280,104,152,8,3,3,3),
-        // std::make_tuple(1280,1280,104,152,8,3,3,3),
-        // std::make_tuple(320,1280,208,304,4,3,3,3),
-        // std::make_tuple(640,1280,208,304,4,3,3,3),
+        std::make_tuple(1280,1280,26,38,8,3,3,3),
+        std::make_tuple(320,1280,52,76,8,3,3,3),
+        std::make_tuple(1280,1280,52,76,8,3,3,3),
+        std::make_tuple(320,1280,104,152,8,3,3,3),
+        std::make_tuple(1280,1280,104,152,8,3,3,3),
+        std::make_tuple(320,1280,208,304,4,3,3,3),
+        std::make_tuple(640,1280,208,304,4,3,3,3),
         // std::make_tuple(1280,1280,26,38,1,1),
         // std::make_tuple(256,128,768,1024,3,3),
         // std::make_tuple(128,3,768,1024,3,3),
@@ -367,7 +367,7 @@ int main(void)
 
 
         struct ggml_cgraph * gf_res_0 = NULL;    
-        int iterations = 0;
+        int iterations = 20;
 
         double run_time0;
         std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations,