diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index fa7a905d39..6601d160d7 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -1076,7 +1076,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, // prefetch the first block tile of A,B into shared memory // half* A_block_gmem = input + (block_m * BM * A_stride); const half* A_block_gmem = input; - const half* B_block_gmem = kernel + (block_n * weightKOffset); +// const half* B_block_gmem = kernel + (block_n * weightKOffset); + const half* B_block_gmem = kernel + block_n * BN * weightKOffset; tileMemcpySwizzleA(A_block_gmem, A_block_smem, inChannelOffset, param); tileMemcpySwizzleB(B_block_gmem, B_block_smem, weightKOffset, param); @@ -1097,7 +1098,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, { // half* A_block_gmem = A + (block_m * BM * A_stride) + (block_k * BK); const half* A_block_gmem = input; - const half* B_block_gmem = kernel + (block_n * weightKOffset); + // const half* B_block_gmem = kernel + (block_n * weightKOffset); + const half* B_block_gmem = kernel + (block_n * BN * weightKOffset); tileMemcpyLoadA(A_block_gmem, A_gmem_cache_reg, block_k * BK, inChannelOffset, param); tileMemcpyLoadB(B_block_gmem, B_gmem_cache_reg, block_k * BK, weightKOffset, param); } @@ -1119,6 +1121,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, { asm volatile ( "mma.sync.aligned.m16n8k8.row.col.f16.f16.f16.f16 " + // "mma.sync.aligned.m16n8k8.row.row.f16.f16.f16.f16 " "{%0, %1}, " "{%2, %3}, " "{%4}, " @@ -1130,14 +1133,14 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, ); } } - // if(threadIdx.x == 0 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){ - // printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(acc_register_[3][0][0]), __half2float(acc_register_[3][0][1]), - // __half2float(acc_register_[3][0][2]), __half2float(acc_register_[3][0][3])); - // printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(A_register_[3][mma_k][0]), __half2float(A_register_[3][mma_k][1]), - // __half2float(A_register_[3][mma_k][2]), __half2float(A_register_[3][mma_k][3])); - // printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(B_register_[mma_k][0][0]), __half2float(B_register_[mma_k][0][1]), - // __half2float(B_register_[mma_k][0][2]), __half2float(B_register_[mma_k][0][3])); - // } + if(threadIdx.x == 28 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){ + printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(acc_register_[0][0][0]), __half2float(acc_register_[0][0][1]), + __half2float(acc_register_[0][0][2]), __half2float(acc_register_[0][0][3])); + printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(A_register_[0][mma_k][0]), __half2float(A_register_[0][mma_k][1]), + __half2float(A_register_[0][mma_k][2]), __half2float(A_register_[0][mma_k][3])); + printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(B_register_[mma_k][0][0]), __half2float(B_register_[mma_k][0][1]), + __half2float(B_register_[mma_k][0][2]), __half2float(B_register_[mma_k][0][3])); + } // if(threadIdx.x < 4 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){ // printf("A %d, %d, %d: %f, %f \n", block_k, mma_k, threadIdx.x, __half2float(A_register_[3][mma_k][0]), __half2float(A_register_[3][mma_k][1])); // printf("B %d, %d, %d: %f, %f \n", block_k, mma_k, threadIdx.x, __half2float(B_register_[mma_k][0][0]), __half2float(B_register_[mma_k][0][1])); diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp index 4b9222a19e..3a5f928ee6 100644 --- a/tests/test-conv2d-implicit.cpp +++ b/tests/test-conv2d-implicit.cpp @@ -42,13 +42,18 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu // create data int KW = 3, KH = 3, IC = ic, OC = oc; int IW = iw, IH = ih, N = 1; + srand(time(NULL)); // printf(" input: IC = %d, OC = %d, IW = %d, IH = %d \n ", IC, OC, IW, IH); // Initialize adata std::vector adata(KW * KH * IC * OC); for (int i = 0; i < KW * KH * IC * OC; i++) { - adata[i] = 0.2f; + // adata[i] = 2.f; + adata[i] = (float)(i%KW)-1.f; + // adata[i] = (rand() % 255) / 255.0; + // float r = -1.f + static_cast (rand()) /( static_cast (RAND_MAX/(1.f-(-1.f)))); + // adata[i] = r; } // Convert adata to fp16 format @@ -58,7 +63,11 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu // Initialize bdata std::vector bdata(IW * IH * IC * N); for (int i = 0; i < IW * IH * IC * N; i++) { - bdata[i] = 1.5f; + bdata[i] = (float)(i%IW)/10.f; + // bdata[i] = 1.5f; + // bdata[i] = (rand() % 255) / 255.0; + // float r = -1.f + static_cast (rand()) /( static_cast (RAND_MAX/(1.f-(-1.f)))); + // bdata[i] = r; } size_t buffer_size = 0; @@ -344,7 +353,7 @@ int main(void) // std::make_tuple(640,640,52,76), // std::make_tuple(640,640,104,152), // std::make_tuple(960,320,104,152), - std::make_tuple(128,128,26,38), + std::make_tuple(640,128,26,38), // std::make_tuple(1280,640,52,76), // std::make_tuple(1920,1280,26,38), // std::make_tuple(2560,1280,26,38), @@ -378,7 +387,7 @@ int main(void) int iterations = 0; double run_time0; - std::vector conv2d_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0); + std::vector im2col_data = compute_graph(model, allocr, build_graph_0, iterations, &run_time0); ggml_gallocr_free(allocr); @@ -399,7 +408,7 @@ int main(void) double run_time1; // std::vector wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1); - conv2d_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1); + std::vector conv2d_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1); ggml_gallocr_free(allocr); @@ -439,11 +448,13 @@ int main(void) // for(int i = 0; i < ggml_nelements(wino_res); i++) { for(int i = 0; i < 26*38; i++) { // for(int i = 0; i < conv2d_data.size(); i++) { - float diff = fabs(conv2d_data[i] - wino_data[i]); + // float diff = fabs(conv2d_data[i] - wino_data[i]); + float diff = fabs(im2col_data[i] - wino_data[i]); + float diff1 = fabs(im2col_data[i] - conv2d_data[i]); // if(diff > 1.e-4) { - printf("(%f, %f, %f, %d) \n", - conv2d_data[i], - wino_data[i], diff, i); + printf("(%f, %f, %f, %f, %f, %d) \n", + im2col_data[i], conv2d_data[i], + wino_data[i], diff, diff1, i); // break; // } }