diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index ec975194cc..b665ae4dfa 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -956,10 +956,10 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, // if(block_k == num_block_tiles_k) // break; - // if(thread_idx == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){ - // printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d, %d, %d \n", s, r, block_k, next_idx, - // block_krs, num_block_tiles_k, num_block_tiles_krs); - // } + if(thread_idx == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){ + printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d, %d, %d \n", s, r, block_k, next_idx, + block_krs, num_block_tiles_k, num_block_tiles_krs); + } // if (block_k != num_block_tiles_k){ if (block_krs != num_block_tiles_krs){ @@ -1024,8 +1024,47 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, #endif } } + + + // if(threadIdx.x >= 8 && threadIdx.x < 12 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + // printf("A %d, %d, %d: %f, %f \n", block_krs, mma_k, threadIdx.x, + // __half2float(A_register_[1][mma_k][0]), + // __half2float(A_register_[1][mma_k][1])); + // } + // if(threadIdx.x < 4 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + // printf("B %d, %d, %d: %f, %f\n", block_krs, mma_k, threadIdx.x, + // __half2float(B_register_[mma_k][1][0]), + // __half2float(B_register_[mma_k][1][1])); + // } + // if(threadIdx.x == 8 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + // printf("C %d, %d, %d: %f, %f, %f, %f\n", block_krs, mma_k, threadIdx.x, + // __half2float(acc_register_[1][1][0]), + // __half2float(acc_register_[1][1][1]), + // __half2float(acc_register_[1][1][2]), + // __half2float(acc_register_[1][1][3])); + // } + + // if(threadIdx.x < 4 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + // printf("A %d, %d, (%d, %d) %d: %f, %f \n", block_krs, mma_k, r, s, threadIdx.x, + // __half2float(A_register_[0][mma_k][0]), + // __half2float(A_register_[0][mma_k][1])); + // } + // if(threadIdx.x < 4 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + // printf("B %d, %d, (%d, %d) %d: %f, %f\n", block_krs, mma_k, r, s, threadIdx.x, + // __half2float(B_register_[mma_k][0][0]), + // __half2float(B_register_[mma_k][0][1])); + // } + // if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + // printf("C %d, %d, (%d, %d) %d: %f, %f, %f, %f\n", block_krs, mma_k, r, s, threadIdx.x, + // __half2float(acc_register_[0][0][0]), + // __half2float(acc_register_[0][0][1]), + // __half2float(acc_register_[0][0][2]), + // __half2float(acc_register_[0][0][3])); + // } + } + // if (block_k != num_block_tiles_k) if (block_krs != num_block_tiles_krs) { #if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE @@ -1086,13 +1125,41 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, #endif } } + // if(threadIdx.x < 4 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + // printf("A %d, %d, (%d, %d) %d: %f, %f \n", block_krs, mma_k, r, s, threadIdx.x, + // __half2float(A_register_[0][mma_k][0]), + // __half2float(A_register_[0][mma_k][1])); + // } + // if(threadIdx.x < 4 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + // printf("B %d, %d, (%d, %d) %d: %f, %f\n", block_krs, mma_k, r, s, threadIdx.x, + // __half2float(B_register_[mma_k][0][0]), + // __half2float(B_register_[mma_k][0][1])); + // } + // if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + // printf("C %d, %d, (%d, %d) %d: %f, %f, %f, %f\n", block_krs, mma_k, r, s, threadIdx.x, + // __half2float(acc_register_[0][0][0]), + // __half2float(acc_register_[0][0][1]), + // __half2float(acc_register_[0][0][2]), + // __half2float(acc_register_[0][0][3])); + // } } #endif - // if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ - // printf(" %u, %f\n", blockIdx.z, __half2float(acc_register_[0][0][0])); + // if(threadIdx.x == 8 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + // printf(" %u, %f, %f, %f, %f\n", blockIdx.z, + // __half2float(acc_register_[1][1][0]), + // __half2float(acc_register_[1][1][1]), + // __half2float(acc_register_[1][1][2]), + // __half2float(acc_register_[1][1][3])); // } + if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0){ + printf(" %u, %f, %f, %f, %f\n", blockIdx.z, + __half2float(acc_register_[0][1][0]), + __half2float(acc_register_[0][1][1]), + __half2float(acc_register_[0][1][2]), + __half2float(acc_register_[0][1][3])); + } // reuse smem half *smemoutput = shmem; @@ -1145,10 +1212,14 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, half (&res_)[2] = reinterpret_cast(dst_ptr); if (n < param.n && row < param.k && col < param.PQ) { const uint outOffset = ((ksplit > 0) ? z * param.NKPQ : 0) + n * param.KPQ + row * param.PQ + col; + // if(row == 8 && col == 18) + // printf("A %u, %u, %f \n", outOffset, z, ggml_cuda_cast(res_[0])); output[outOffset] = ggml_cuda_cast(res_[0]); } if (n < param.n && row+1 < param.k && col < param.PQ) { const uint outOffset = ((ksplit > 0) ? z * param.NKPQ : 0) + n * param.KPQ + (row+1) * param.PQ + col; + // if(row+1 == 8 && col == 17) + // printf("B %u, %u, %f \n", outOffset, z, ggml_cuda_cast(res_[0])); output[outOffset] = ggml_cuda_cast(res_[1]); } } @@ -1353,9 +1424,10 @@ static void conv2d_implicit_cuda_f16(ggml_backend_cuda_context & ctx, const floa } } } - + candidate = -1; if(candidate != -1){ j = candidate; + printf("choosing %d \n", j); if (j == 2) { launch_conv2d_implicit_split_kernel(ctx, X_H, K_H, Y_D, BlocksM, BlocksN, shmem_bytes, P, st); diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cuh b/ggml/src/ggml-cuda/conv2d-implicit.cuh index 4728e0e757..ee56c80b7f 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cuh +++ b/ggml/src/ggml-cuda/conv2d-implicit.cuh @@ -476,7 +476,7 @@ __device__ __forceinline__ unsigned int tileMemcpyAsyncLoadA( #pragma unroll for (unsigned int i = 0; i < NUM_ITERS; i++){ bool valid = (masks[i][0] & (1u << curR)) && (masks[i][1] & (1u << curS)); - // if(threadIdx.x == 3 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 1){ + // if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 1){ // printf(" %u, %u, %u, %u, %u, %lld, %d\n", i, curR, curS, oldC, curC, element_offset[i], valid?1:0); // } unsigned int dst_index = iter_idx; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7a1fe441ff..171c500668 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -5826,7 +5826,7 @@ static std::vector> make_test_cases_eval() { for (uint32_t s0 : { 1, 3 }) { for (uint32_t p1 : { 2, 5 }) { - for (uint32_t Cin : { 1, 25, 32 }) { + for (uint32_t Cin : { 1, 25 }) { for (uint32_t Cout : { 1, 12 }) { for (uint32_t KH : { 1, 2, 3, 11 }) { for (uint32_t KW : { 1, 2, 3, 11 }) { diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp index b87a04c858..8187d5f5fd 100644 --- a/tests/test-conv2d.cpp +++ b/tests/test-conv2d.cpp @@ -43,7 +43,7 @@ struct ggml_cgraph * build_graph_1(const test_model&); void load_model(test_model & model, int ic, int oc, int iw, int ih, int kw = 3, int kh = 3, bool use_gpu = false ) { // create data int KW = kw, KH = kh, IC = ic, OC = oc; - int IW = iw, IH = ih, N = 2; + int IW = iw, IH = ih, N = 1; // srand(time(NULL)); // printf(" input: IC = %d, OC = %d, IW = %d, IH = %d \n ", IC, OC, IW, IH); @@ -53,6 +53,8 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, int kw = 3, for (int i = 0; i < KW * KH * IC * OC; i++) { // adata[i] = 2.f; // adata[i] = (float)(i%KW)-1.f; + // adata[i] = (float)((i+1)%KW+1)/10.0; + // adata[i] = (float)(i%100); // adata[i] = (rand() % 255) / 255.0; float r = -1.f + static_cast (rand()) /( static_cast (RAND_MAX/(1.f-(-1.f)))); adata[i] = r; @@ -176,19 +178,19 @@ struct ggml_cgraph * build_graph_0(const test_model& model) { struct ggml_cgraph * gf = ggml_new_graph(ctx0); - // int s0 = 1; - // int s1 = 1; - // int p0 = 1; - // int p1 = 1; - // int d0 = 1; - // int d1 = 1; + int s0 = 1; + int s1 = 1; + int p0 = 1; + int p1 = 1; + int d0 = 1; + int d1 = 1; - int s0 = 3; - int s1 = 5; - int p0 = 5; - int p1 = 5; - int d0 = 2; - int d1 = 4; + // int s0 = 3; + // int s1 = 5; + // int p0 = 5; + // int p1 = 5; + // int d0 = 2; + // int d1 = 4; // recalculate for avoid fragmentation struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1); @@ -222,20 +224,20 @@ struct ggml_cgraph * build_graph_1(const test_model& model) { struct ggml_cgraph * gf = ggml_new_graph(ctx0); - // int s0 = 1; - // int s1 = 1; - // int p0 = 1; - // int p1 = 1; - // int d0 = 1; - // int d1 = 1; + int s0 = 1; + int s1 = 1; + int p0 = 1; + int p1 = 1; + int d0 = 1; + int d1 = 1; - int s0 = 3; - int s1 = 5; - int p0 = 5; - int p1 = 5; - int d0 = 2; - int d1 = 4; + // int s0 = 3; + // int s1 = 5; + // int p0 = 5; + // int p1 = 5; + // int d0 = 2; + // int d1 = 4; // recalculate for avoid fragmentation @@ -318,7 +320,21 @@ static std::vector> configs = { // std::make_tuple(1280,1280,26,38,3,3), // std::make_tuple(1920,640,32,32,3,3) // std::make_tuple(1280,1280,16,16,3,3), - std::make_tuple(32,12,141,133,3,3), + // std::make_tuple(32,12,141,133,3,3), + // std::make_tuple(32,6,141,133,3,3), + // std::make_tuple(32,12,141,121,3,3), + // std::make_tuple(32,9,141,121,3,3), + // std::make_tuple(320,8,16,16,3,3), //working + // std::make_tuple(320,9,16,16,3,3), //working + // std::make_tuple(320,12,16,16,3,3), //working + // std::make_tuple(256,12,16,16,3,3), //working + // std::make_tuple(32,12,16,16,3,3), //not working + // std::make_tuple(48,12,16,16,3,3), // not working + std::make_tuple(96,12,16,16,3,3), //not working + // std::make_tuple(64,12,16,16,3,3), //working + // std::make_tuple(64,12,141,133,3,3), //working + // std::make_tuple(32,12,141,133,3,3), //working + // std::make_tuple(1280,1280,16,16,3,3), // std::make_tuple(32,8,24,24,3,3), // std::make_tuple(640,640,64,64,3,3), // std::make_tuple(320,640,32,32,3,3), @@ -730,18 +746,19 @@ int main(void) run_time0, mem_size0/1024.0f/1024.0f, run_time1, mem_size1/1024.0f/1024.0f); - + // int i = 2048; // for(int i = 0; i < ggml_nelements(wino_res); i++) { // for(int i = 0; i < 26*38; i++) { - for(int i = 0; i < conv2d_data.size(); i++) { - float diff = fabs(im2col_data[i] - conv2d_data[i]); - if(diff > 0.5) { - printf("(%7.3f, %7.3f, %.2f, %d) \n", - im2col_data[i], conv2d_data[i], - diff, i); - break; - } - } + // for(int i = 0; i < conv2d_data.size(); i++) { + // float diff = fabs(im2col_data[i] - conv2d_data[i]); + // // if(diff > 0.5) { + // // if(diff > 2.0) { + // printf("(%7.3f, %7.3f, %.2f, %d) \n", + // im2col_data[i], conv2d_data[i], + // diff, i); + // // break; + // // } + // } ggml_free(model.ctx); ggml_backend_buffer_free(model.buffer);