From 3308ccef918833ad60a3217961a94b393c4ca562 Mon Sep 17 00:00:00 2001 From: bssrdf Date: Sun, 2 Nov 2025 17:30:36 -0500 Subject: [PATCH] conv3d WIP: enabled tensor core path --- ggml/src/ggml-cuda/conv3d-implicit.cu | 8 ++++---- tests/test-conv3d.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-cuda/conv3d-implicit.cu b/ggml/src/ggml-cuda/conv3d-implicit.cu index 4f01dfea8d..76f887972a 100644 --- a/ggml/src/ggml-cuda/conv3d-implicit.cu +++ b/ggml/src/ggml-cuda/conv3d-implicit.cu @@ -1007,9 +1007,9 @@ static void conv3d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa int id = ggml_cuda_get_device(); - int64_t ne = P.c * P.h * P.w * P.n; + int64_t ne = P.c * P.d * P.h * P.w * P.n; int64_t ne00 = P.c; - int64_t ne01 = P.h * P.w; + int64_t ne01 = P.h * P.w * P.d; ggml_cuda_pool_alloc input_f16(ctx.pool(id), ne); dim3 dimGrid( (ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM, @@ -1018,8 +1018,8 @@ static void conv3d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa dim3 dimBlock(CUDA_NCHW_2_NHWC_TILE_DIM,CUDA_NCHW_2_NHWC_BLOCK_ROWS, 1); NCHW2NHWC<<>>(X_D, input_f16.get(), ne, ne00, ne01); - ne = P.c * P.r * P.s * P.k; - ne01 = P.r * P.s; + ne = P.c * P.r * P.s * P.t * P.k; + ne01 = P.r * P.s * P.t; ggml_cuda_pool_alloc kernel_f16(ctx.pool(id), ne); dim3 dimGrid1((ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM, (ne00 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM, diff --git a/tests/test-conv3d.cpp b/tests/test-conv3d.cpp index 8b19f05c39..53e37efd31 100644 --- a/tests/test-conv3d.cpp +++ b/tests/test-conv3d.cpp @@ -323,13 +323,13 @@ int main(void) // std::make_tuple(960,320,104,152,3,3), // std::make_tuple(1280,1280,26,38,3,3), std::make_tuple(320,1280,26,38,8,3,3,3), - // std::make_tuple(1280,1280,26,38,8,3,3,3), - // std::make_tuple(320,1280,52,76,8,3,3,3), - // std::make_tuple(1280,1280,52,76,8,3,3,3), - // std::make_tuple(320,1280,104,152,8,3,3,3), - // std::make_tuple(1280,1280,104,152,8,3,3,3), - // std::make_tuple(320,1280,208,304,4,3,3,3), - // std::make_tuple(640,1280,208,304,4,3,3,3), + std::make_tuple(1280,1280,26,38,8,3,3,3), + std::make_tuple(320,1280,52,76,8,3,3,3), + std::make_tuple(1280,1280,52,76,8,3,3,3), + std::make_tuple(320,1280,104,152,8,3,3,3), + std::make_tuple(1280,1280,104,152,8,3,3,3), + std::make_tuple(320,1280,208,304,4,3,3,3), + std::make_tuple(640,1280,208,304,4,3,3,3), // std::make_tuple(1280,1280,26,38,1,1), // std::make_tuple(256,128,768,1024,3,3), // std::make_tuple(128,3,768,1024,3,3), @@ -367,7 +367,7 @@ int main(void) struct ggml_cgraph * gf_res_0 = NULL; - int iterations = 0; + int iterations = 20; double run_time0; std::vector im2col_data = compute_graph(model, allocr, build_graph_0, iterations,