add support for both NCHW and NHWC layouts

This commit is contained in:
bssrdf 2025-10-14 14:24:35 -04:00
parent 3e2f722d11
commit b70cca2ea3
6 changed files with 118 additions and 44 deletions

View File

@ -1992,7 +1992,8 @@ extern "C" {
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
int d1,
int layout); // tensor layout: 0 = NHWC, 1 = NCHW
GGML_API struct ggml_tensor * ggml_conv_3d_direct(

View File

@ -731,15 +731,15 @@ static __global__ void conv2d_implicit_kernel(const float * __restrict__ input,
conv_shapes[][NUM_VARIANTS]: ne_input=[192,256,512,1], ne_kernel=[3,3,512,512]
*/
constexpr static int conv_shapes[][NUM_VARIANTS] = {
{ 128, 128, 128 }, // BM
{ 256, 128, 256 }, // BN
{ 8, 8, 8 }, // BK
{ 128, 64, 32 }, // WM
{ 32, 32 , 256 }, // WN
{ 2, 2, 1 }, // WNITER
{ 8, 4, 8 }, // TM
{ 8, 4, 4 }, // TN
{ 256, 256, 128} // NUM_THREADS
{ 128, 128, 128, 256 }, // BM
{ 256, 128, 256, 128 }, // BN
{ 8, 8, 8, 8 }, // BK
{ 128, 64, 32, 128 }, // WM
{ 32, 32 , 256, 32 }, // WN
{ 2, 2, 1, 1 }, // WNITER
{ 8, 4, 4, 4 }, // TM
{ 8, 4, 8, 8 }, // TN
{ 256, 256, 128, 256} // NUM_THREADS
};
template <typename T, unsigned int CONV_SHAPE>
@ -763,16 +763,29 @@ static void conv2d_implicit_cuda(const float * X_D, const T * K_D, float * Y_D,
dim3 thblock(NUM_THREADS, thready, threadz);
dim3 grid(blockx, blocky, blockz);
// int smem_size = 24 * 1024;
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
if(P.c % 4 == 0){
if(P.layout == 0)
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
WNITER, TM, TN, NUM_THREADS, 0, true, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
else if(P.layout == 1)
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
} else{
if(P.layout == 0)
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
WNITER, TM, TN, NUM_THREADS, 0, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
else if(P.layout == 1)
conv2d_implicit_kernel<T, BM, BN, BK, WM, WN,
WNITER, TM, TN, NUM_THREADS, 1, false, 0><<<grid, thblock, 0, st>>>(X_D, K_D, Y_D, P);
}
}
static void conv2d_implicit_cuda_f16(const float * X_D, const half * K_D, float * Y_D, const param_t P, cudaStream_t st) {
conv2d_implicit_cuda<half, 0>(X_D, K_D, Y_D, P, st);
conv2d_implicit_cuda<half, 3>(X_D, K_D, Y_D, P, st);
}
static void conv2d_implicit_cuda_f32(const float * X_D, const float * K_D, float * Y_D, const param_t P, cudaStream_t st) {
conv2d_implicit_cuda<float, 0>(X_D, K_D, Y_D, P, st);
conv2d_implicit_cuda<float, 3>(X_D, K_D, Y_D, P, st);
}
void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@ -785,8 +798,6 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
GGML_ASSERT(ggml_is_contiguous(kernel));
GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32);
// same number of input channels
GGML_ASSERT(input->ne[2] == kernel->ne[2]);
cudaStream_t st = ctx.stream();
@ -797,17 +808,30 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
const int PD_Y = p[3]; // padding_y
const int DL_X = p[4]; // dilation_x
const int DL_Y = p[5]; // dilation_y
const int LT = p[6]; // layout
GGML_ASSERT(LT == 0 || LT == 1);
// same number of input channels
GGML_ASSERT(LT == 0 ? input->ne[0] == kernel->ne[0] : input->ne[2] == kernel->ne[2]);
// No cwhn
GGML_ASSERT(p[6] == false);
GGML_ASSERT(p[7] == false);
const int IW = input->ne[0]; // input_w
const int IH = input->ne[1]; // input_h
// const int IW = input->ne[0]; // input_w
// const int IH = input->ne[1]; // input_h
// const int OW = dst->ne[0]; // output_w
// const int OH = dst->ne[1]; // output_h
// const int KW = kernel->ne[0]; // kernel_w
// const int KH = kernel->ne[1]; // kernel_h
// const int IC = input->ne[2]; // input_channels
const int IW = input->ne[LT == 0 ? 1 : 0]; // input_w
const int IH = input->ne[LT == 0 ? 2 : 1]; // input_h
const int OW = dst->ne[0]; // output_w
const int OH = dst->ne[1]; // output_h
const int KW = kernel->ne[0]; // kernel_w
const int KH = kernel->ne[1]; // kernel_h
const int IC = input->ne[2]; // input_channels
const int KW = kernel->ne[LT == 0 ? 1 : 0]; // kernel_w
const int KH = kernel->ne[LT == 0 ? 2 : 1]; // kernel_h
const int IC = input->ne[LT == 0 ? 0: 2]; // input_channels
const int OC = kernel->ne[3]; // output_channels
const int B = input->ne[3]; // n_batches
@ -819,7 +843,7 @@ void ggml_cuda_op_conv2d_implicit(ggml_backend_cuda_context & ctx, ggml_tensor *
params.C_fastdiv = init_fastdiv_values(IC);
params.RS_fastdiv = init_fastdiv_values(KW*KH);
params.S_fastdiv = init_fastdiv_values(KW);
params.nchw = false;
params.layout = LT;
if (kernel->type == GGML_TYPE_F16) {
conv2d_implicit_cuda_f16(X_D, (half *) K_D, Y_D, params, st);

View File

@ -17,7 +17,7 @@ typedef struct{
unsigned int d_w; //dilation width
unsigned int Oh; //output height
unsigned int Ow; //output width
bool nchw;
unsigned int layout;
uint3 SC_fastdiv;
uint3 OW_fastdiv;
uint3 C_fastdiv;

View File

@ -4584,7 +4584,9 @@ struct ggml_tensor * ggml_conv_2d_implicitgemm(
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1) {// dilation dimension 1
int d1,
// tensor layout: 0 = NHWC, 1 = NCHW
int layout) {
GGML_ASSERT(a->ne[2] == b->ne[2]);
//GGML_ASSERT(a->type == b->type);
@ -4603,10 +4605,20 @@ struct ggml_tensor * ggml_conv_2d_implicitgemm(
ggml_set_op_params_i32(result, 3, p1);
ggml_set_op_params_i32(result, 4, d0);
ggml_set_op_params_i32(result, 5, d1);
ggml_set_op_params_i32(result, 6, layout);
struct ggml_tensor *ap, *bp;
if(layout == 0){
ap = ggml_cont(ctx, ggml_permute(ctx, a, 1, 2, 0, 3));
bp = ggml_cont(ctx, ggml_permute(ctx, b, 1, 2, 0, 3));
} else{
ap = a;
bp = b;
}
result->op = GGML_OP_CONV_2D_IMPLICIT;
result->src[0] = a;
result->src[1] = b;
result->src[0] = ap;
result->src[1] = bp;
return result;
}

View File

@ -4268,7 +4268,7 @@ struct test_conv_2d_implicit : public test_case {
}
ggml_tensor * out =
ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1);
ggml_conv_2d_implicitgemm(ctx, kernel, input, stride0, stride1, padding0, padding1, dilation0, dilation1, cwhn?0:1);
ggml_set_name(out, "out");
return out;
}
@ -6788,6 +6788,45 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, false));
}
for (auto act_case : cases_sd) {
GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
test_cases.emplace_back(new test_conv_2d_implicit(
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, false));
}
for (auto act_case : cases_sd) {
GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
test_cases.emplace_back(new test_conv_2d_implicit(
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
GGML_TYPE_F16, 1, 1, p0, p1, 1, 1, true));
}
for (auto act_case : cases_sd) {
GGML_ASSERT(act_case[idx_sd["kw"]] == 3 || act_case[idx_sd["kw"]] == 1);
GGML_ASSERT(act_case[idx_sd["kh"]] == 3 || act_case[idx_sd["kh"]] == 1);
uint32_t p0 = act_case[idx_sd["kw"]] == 3 ? 1 : 0;
uint32_t p1 = act_case[idx_sd["kh"]] == 3 ? 1 : 0;
test_cases.emplace_back(new test_conv_2d_implicit(
{ act_case[idx_sd["iw"]], act_case[idx_sd["ih"]], act_case[idx_sd["Cin"]], act_case[idx_sd["B"]] },
{ act_case[idx_sd["kw"]], act_case[idx_sd["kh"]], act_case[idx_sd["Cin"]], act_case[idx_sd["Cout"]] },
GGML_TYPE_F32, 1, 1, p0, p1, 1, 1, true));
}
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1}));
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));

View File

@ -262,7 +262,7 @@ struct ggml_cgraph * build_graph_2(const test_model& model) {
// printf("conv2d: (%zu, %zu, %zu, %zu) \n", ne[0], ne[1], ne[2], ne[3]);
struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
struct ggml_tensor* wino_res = ggml_conv_2d_implicitgemm(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1, 0);
// struct ggml_tensor* wino_res = ggml_conv_2d_direct(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
ggml_set_name(wino_res, "wino_res");
ggml_build_forward_expand(gf, wino_res);
@ -339,20 +339,20 @@ int main(void)
{
ggml_time_init();
std::vector<std::tuple<int, int, int, int>> configs = {
// std::make_tuple(64,64,48,64),
// std::make_tuple(320,320,104,152),
// std::make_tuple(640,640,52,76),
// std::make_tuple(640,640,104,152),
// std::make_tuple(960,320,104,152),
// std::make_tuple(1280,1280,26,38),
// std::make_tuple(1280,640,52,76),
// std::make_tuple(1920,1280,26,38),
// std::make_tuple(2560,1280,26,38),
// std::make_tuple(512,512,104,152),
// std::make_tuple(512,512,208,304),
std::make_tuple(64,64,48,64),
std::make_tuple(320,320,104,152),
std::make_tuple(640,640,52,76),
std::make_tuple(640,640,104,152),
std::make_tuple(960,320,104,152),
std::make_tuple(1280,1280,26,38),
std::make_tuple(1280,640,52,76),
std::make_tuple(1920,1280,26,38),
std::make_tuple(2560,1280,26,38),
std::make_tuple(512,512,104,152),
std::make_tuple(512,512,208,304),
std::make_tuple(512,256,416,608),
// std::make_tuple(256,128,832,1216),
// std::make_tuple(256,256,832,1216),
std::make_tuple(256,128,832,1216),
std::make_tuple(256,256,832,1216),
// std::make_tuple(320,256,1024,1920)
};
@ -453,8 +453,6 @@ int main(void)
}
// printf("\nPerforming test:\n");
return 0;
}