This commit is contained in:
bssrdf 2025-11-15 09:58:23 -05:00
parent e10b495dd2
commit e489dd2773
3 changed files with 49 additions and 25 deletions

View File

@ -913,13 +913,16 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
// for (unsigned int block_k = 1; block_k <= num_block_tiles_k; block_k++){
int s = 0;
int r = 0;
while (block_k < num_block_tiles_k){
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
while (block_krs < num_block_tiles_krs) {
asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
#endif
#else
while (block_k < num_block_tiles_k) {
#endif
__syncthreads();
// moves to the next tile
// moves to the next channel block tile
int next_idx = 0;
++s;
if (s == param.s) {
@ -954,7 +957,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
// break;
// if(thread_idx == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
// printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d %d \n", s, r, block_k, next_idx, block_krs, num_block_tiles_k);
// printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d, %d, %d \n", s, r, block_k, next_idx,
// block_krs, num_block_tiles_k, num_block_tiles_krs);
// }
// if (block_k != num_block_tiles_k){
@ -1044,8 +1048,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
__syncthreads();
half* A_warp_tile = SA2 + A_warp_tile_offset;
half* B_warp_tile = SB2 + B_warp_tile_offset;
half* A_warp_tile = SA1 + A_warp_tile_offset;
half* B_warp_tile = SB1 + B_warp_tile_offset;
ldmatrix_a<mma_tiles_per_warp_m, mma_tiles_per_warp_k, BK>(A_warp_tile, A_register_);
ldmatrix_b<mma_tiles_per_warp_k, mma_tiles_per_warp_n, BK>(B_warp_tile, B_register_);
// outer product between mma tiles

View File

@ -5826,7 +5826,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
for (uint32_t s0 : { 1, 3 }) {
for (uint32_t p1 : { 2, 5 }) {
for (uint32_t Cin : { 1, 25 }) {
for (uint32_t Cin : { 1, 25, 32 }) {
for (uint32_t Cout : { 1, 12 }) {
for (uint32_t KH : { 1, 2, 3, 11 }) {
for (uint32_t KW : { 1, 2, 3, 11 }) {
@ -5854,6 +5854,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 1 }, { 3, 3, 128, 8},
GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 3 }, { 3, 3, 128, 8},
GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
// sycl backend will limit task global_range < MAX_INT

View File

@ -43,7 +43,7 @@ struct ggml_cgraph * build_graph_1(const test_model&);
void load_model(test_model & model, int ic, int oc, int iw, int ih, int kw = 3, int kh = 3, bool use_gpu = false ) {
// create data
int KW = kw, KH = kh, IC = ic, OC = oc;
int IW = iw, IH = ih, N = 1;
int IW = iw, IH = ih, N = 2;
// srand(time(NULL));
// printf(" input: IC = %d, OC = %d, IW = %d, IH = %d \n ", IC, OC, IW, IH);
@ -176,12 +176,19 @@ struct ggml_cgraph * build_graph_0(const test_model& model) {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
int s0 = 1;
int s1 = 1;
int p0 = 1;
int p1 = 1;
int d0 = 1;
int d1 = 1;
// int s0 = 1;
// int s1 = 1;
// int p0 = 1;
// int p1 = 1;
// int d0 = 1;
// int d1 = 1;
int s0 = 3;
int s1 = 5;
int p0 = 5;
int p1 = 5;
int d0 = 2;
int d1 = 4;
// recalculate for avoid fragmentation
struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
@ -215,12 +222,21 @@ struct ggml_cgraph * build_graph_1(const test_model& model) {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
int s0 = 1;
int s1 = 1;
int p0 = 1;
int p1 = 1;
int d0 = 1;
int d1 = 1;
// int s0 = 1;
// int s1 = 1;
// int p0 = 1;
// int p1 = 1;
// int d0 = 1;
// int d1 = 1;
int s0 = 3;
int s1 = 5;
int p0 = 5;
int p1 = 5;
int d0 = 2;
int d1 = 4;
// recalculate for avoid fragmentation
// struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
@ -301,7 +317,8 @@ static std::vector<std::tuple<int, int, int, int, int, int>> configs = {
// std::make_tuple(960,320,104,152,3,3),
// std::make_tuple(1280,1280,26,38,3,3),
// std::make_tuple(1920,640,32,32,3,3)
std::make_tuple(1280,1280,16,16,3,3),
// std::make_tuple(1280,1280,16,16,3,3),
std::make_tuple(32,12,141,133,3,3),
// std::make_tuple(32,8,24,24,3,3),
// std::make_tuple(640,640,64,64,3,3),
// std::make_tuple(320,640,32,32,3,3),
@ -718,12 +735,12 @@ int main(void)
// for(int i = 0; i < 26*38; i++) {
for(int i = 0; i < conv2d_data.size(); i++) {
float diff = fabs(im2col_data[i] - conv2d_data[i]);
// if(diff > 0.5) {
if(diff > 0.5) {
printf("(%7.3f, %7.3f, %.2f, %d) \n",
im2col_data[i], conv2d_data[i],
diff, i);
// break;
// }
break;
}
}
ggml_free(model.ctx);