conv3d WIP: enabled tensor core path

This commit is contained in:
bssrdf 2025-11-02 17:30:36 -05:00
parent 3f5c5045da
commit 3308ccef91
2 changed files with 12 additions and 12 deletions

View File

@@ -1007,9 +1007,9 @@ static void conv3d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
int id = ggml_cuda_get_device();
-int64_t ne = P.c * P.h * P.w * P.n;
+int64_t ne = P.c * P.d * P.h * P.w * P.n;
int64_t ne00 = P.c;
-int64_t ne01 = P.h * P.w;
+int64_t ne01 = P.h * P.w * P.d;
ggml_cuda_pool_alloc<half> input_f16(ctx.pool(id), ne);
dim3 dimGrid( (ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
@@ -1018,8 +1018,8 @@ static void conv3d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa
dim3 dimBlock(CUDA_NCHW_2_NHWC_TILE_DIM,CUDA_NCHW_2_NHWC_BLOCK_ROWS, 1);
NCHW2NHWC<float, half><<<dimGrid, dimBlock, 0, st>>>(X_D, input_f16.get(), ne, ne00, ne01);
-ne = P.c * P.r * P.s * P.k;
-ne01 = P.r * P.s;
+ne = P.c * P.r * P.s * P.t * P.k;
+ne01 = P.r * P.s * P.t;
ggml_cuda_pool_alloc<half> kernel_f16(ctx.pool(id), ne);
dim3 dimGrid1((ne01 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,
(ne00 + CUDA_NCHW_2_NHWC_TILE_DIM - 1) / CUDA_NCHW_2_NHWC_TILE_DIM,

View File

@@ -323,13 +323,13 @@ int main(void)
// std::make_tuple(960,320,104,152,3,3),
// std::make_tuple(1280,1280,26,38,3,3),
std::make_tuple(320,1280,26,38,8,3,3,3),
-// std::make_tuple(1280,1280,26,38,8,3,3,3),
-// std::make_tuple(320,1280,52,76,8,3,3,3),
-// std::make_tuple(1280,1280,52,76,8,3,3,3),
-// std::make_tuple(320,1280,104,152,8,3,3,3),
-// std::make_tuple(1280,1280,104,152,8,3,3,3),
-// std::make_tuple(320,1280,208,304,4,3,3,3),
-// std::make_tuple(640,1280,208,304,4,3,3,3),
+std::make_tuple(1280,1280,26,38,8,3,3,3),
+std::make_tuple(320,1280,52,76,8,3,3,3),
+std::make_tuple(1280,1280,52,76,8,3,3,3),
+std::make_tuple(320,1280,104,152,8,3,3,3),
+std::make_tuple(1280,1280,104,152,8,3,3,3),
+std::make_tuple(320,1280,208,304,4,3,3,3),
+std::make_tuple(640,1280,208,304,4,3,3,3),
// std::make_tuple(1280,1280,26,38,1,1),
// std::make_tuple(256,128,768,1024,3,3),
// std::make_tuple(128,3,768,1024,3,3),
@@ -367,7 +367,7 @@ int main(void)
struct ggml_cgraph * gf_res_0 = NULL;
-int iterations = 0;
+int iterations = 20;
double run_time0;
std::vector<float> im2col_data = compute_graph(model, allocr, build_graph_0, iterations,