diff --git a/ggml/src/ggml-cuda/conv2d-implicit.cu b/ggml/src/ggml-cuda/conv2d-implicit.cu index 000fd89e20..fa7a905d39 100644 --- a/ggml/src/ggml-cuda/conv2d-implicit.cu +++ b/ggml/src/ggml-cuda/conv2d-implicit.cu @@ -1130,14 +1130,26 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input, ); } } + // if(threadIdx.x == 0 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){ + // printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(acc_register_[3][0][0]), __half2float(acc_register_[3][0][1]), + // __half2float(acc_register_[3][0][2]), __half2float(acc_register_[3][0][3])); + // printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(A_register_[3][mma_k][0]), __half2float(A_register_[3][mma_k][1]), + // __half2float(A_register_[3][mma_k][2]), __half2float(A_register_[3][mma_k][3])); + // printf(" %d, %d: %f, %f, %f, %f \n", block_k, mma_k, __half2float(B_register_[mma_k][0][0]), __half2float(B_register_[mma_k][0][1]), + // __half2float(B_register_[mma_k][0][2]), __half2float(B_register_[mma_k][0][3])); + // } + // if(threadIdx.x < 4 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){ + // printf("A %d, %d, %d: %f, %f \n", block_k, mma_k, threadIdx.x, __half2float(A_register_[3][mma_k][0]), __half2float(A_register_[3][mma_k][1])); + // printf("B %d, %d, %d: %f, %f \n", block_k, mma_k, threadIdx.x, __half2float(B_register_[mma_k][0][0]), __half2float(B_register_[mma_k][0][1])); + // } } - // if(threadIdx.x == 4 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){ - // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(acc_register_[0][0][0]), __half2float(acc_register_[0][0][1]), - // __half2float(acc_register_[0][0][2]), __half2float(acc_register_[0][0][3])); - // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(A_register_[0][0][0]), __half2float(A_register_[0][0][1]), - // __half2float(A_register_[0][0][2]), __half2float(A_register_[0][0][3])); - // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(B_register_[0][0][0]), __half2float(B_register_[0][0][1]), - // __half2float(B_register_[0][0][2]), __half2float(B_register_[0][0][3])); + // if(threadIdx.x == 0 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){ + // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(acc_register_[3][0][0]), __half2float(acc_register_[3][0][1]), + // __half2float(acc_register_[3][0][2]), __half2float(acc_register_[3][0][3])); + // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(A_register_[3][0][0]), __half2float(A_register_[3][0][1]), + // __half2float(A_register_[3][0][2]), __half2float(A_register_[3][0][3])); + // printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(B_register_[3][0][0]), __half2float(B_register_[3][0][1]), + // __half2float(B_register_[3][0][2]), __half2float(B_register_[3][0][3])); // } diff --git a/tests/test-conv2d-implicit.cpp b/tests/test-conv2d-implicit.cpp index f6dfa8c1b4..4b9222a19e 100644 --- a/tests/test-conv2d-implicit.cpp +++ b/tests/test-conv2d-implicit.cpp @@ -48,7 +48,7 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu // Initialize adata std::vector adata(KW * KH * IC * OC); for (int i = 0; i < KW * KH * IC * OC; i++) { - adata[i] = 2.f; + adata[i] = 0.2f; } // Convert adata to fp16 format @@ -344,7 +344,7 @@ int main(void) // std::make_tuple(640,640,52,76), // std::make_tuple(640,640,104,152), // std::make_tuple(960,320,104,152), - std::make_tuple(640,128,26,38), + std::make_tuple(128,128,26,38), // std::make_tuple(1280,640,52,76), // std::make_tuple(1920,1280,26,38), // std::make_tuple(2560,1280,26,38),