WIP
This commit is contained in:
parent
e10b495dd2
commit
e489dd2773
|
|
@ -913,13 +913,16 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
|
||||||
// for (unsigned int block_k = 1; block_k <= num_block_tiles_k; block_k++){
|
// for (unsigned int block_k = 1; block_k <= num_block_tiles_k; block_k++){
|
||||||
int s = 0;
|
int s = 0;
|
||||||
int r = 0;
|
int r = 0;
|
||||||
while (block_k < num_block_tiles_k){
|
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
while (block_krs < num_block_tiles_krs) {
|
||||||
|
|
||||||
asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
|
asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
|
||||||
#endif
|
#else
|
||||||
|
while (block_k < num_block_tiles_k) {
|
||||||
|
#endif
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
// moves to the next tile
|
// moves to the next channel block tile
|
||||||
int next_idx = 0;
|
int next_idx = 0;
|
||||||
++s;
|
++s;
|
||||||
if (s == param.s) {
|
if (s == param.s) {
|
||||||
|
|
@ -954,7 +957,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
|
||||||
// break;
|
// break;
|
||||||
|
|
||||||
// if(thread_idx == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
|
// if(thread_idx == 0 && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0){
|
||||||
// printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d %d \n", s, r, block_k, next_idx, block_krs, num_block_tiles_k);
|
// printf(" s = %d, r = %d, block_k = %d, next_idx = %d , %d, %d, %d \n", s, r, block_k, next_idx,
|
||||||
|
// block_krs, num_block_tiles_k, num_block_tiles_krs);
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// if (block_k != num_block_tiles_k){
|
// if (block_k != num_block_tiles_k){
|
||||||
|
|
@ -1044,8 +1048,8 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
|
||||||
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
#if __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
|
||||||
asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
|
asm volatile("cp.async.wait_group %0;\n" ::"n"(0));
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
half* A_warp_tile = SA2 + A_warp_tile_offset;
|
half* A_warp_tile = SA1 + A_warp_tile_offset;
|
||||||
half* B_warp_tile = SB2 + B_warp_tile_offset;
|
half* B_warp_tile = SB1 + B_warp_tile_offset;
|
||||||
ldmatrix_a<mma_tiles_per_warp_m, mma_tiles_per_warp_k, BK>(A_warp_tile, A_register_);
|
ldmatrix_a<mma_tiles_per_warp_m, mma_tiles_per_warp_k, BK>(A_warp_tile, A_register_);
|
||||||
ldmatrix_b<mma_tiles_per_warp_k, mma_tiles_per_warp_n, BK>(B_warp_tile, B_register_);
|
ldmatrix_b<mma_tiles_per_warp_k, mma_tiles_per_warp_n, BK>(B_warp_tile, B_register_);
|
||||||
// outer product between mma tiles
|
// outer product between mma tiles
|
||||||
|
|
|
||||||
|
|
@ -5826,7 +5826,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||||
|
|
||||||
for (uint32_t s0 : { 1, 3 }) {
|
for (uint32_t s0 : { 1, 3 }) {
|
||||||
for (uint32_t p1 : { 2, 5 }) {
|
for (uint32_t p1 : { 2, 5 }) {
|
||||||
for (uint32_t Cin : { 1, 25 }) {
|
for (uint32_t Cin : { 1, 25, 32 }) {
|
||||||
for (uint32_t Cout : { 1, 12 }) {
|
for (uint32_t Cout : { 1, 12 }) {
|
||||||
for (uint32_t KH : { 1, 2, 3, 11 }) {
|
for (uint32_t KH : { 1, 2, 3, 11 }) {
|
||||||
for (uint32_t KW : { 1, 2, 3, 11 }) {
|
for (uint32_t KW : { 1, 2, 3, 11 }) {
|
||||||
|
|
@ -5854,6 +5854,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||||
GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
|
GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
|
||||||
test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 1 }, { 3, 3, 128, 8},
|
test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 1 }, { 3, 3, 128, 8},
|
||||||
GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
|
GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
|
||||||
|
test_cases.emplace_back(new test_conv_2d( { 24, 24, 128, 3 }, { 3, 3, 128, 8},
|
||||||
|
GGML_TYPE_F16, 1, 1, 1, 1, 1, 1, false));
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// sycl backend will limit task global_range < MAX_INT
|
// sycl backend will limit task global_range < MAX_INT
|
||||||
|
|
|
||||||
|
|
@ -43,7 +43,7 @@ struct ggml_cgraph * build_graph_1(const test_model&);
|
||||||
void load_model(test_model & model, int ic, int oc, int iw, int ih, int kw = 3, int kh = 3, bool use_gpu = false ) {
|
void load_model(test_model & model, int ic, int oc, int iw, int ih, int kw = 3, int kh = 3, bool use_gpu = false ) {
|
||||||
// create data
|
// create data
|
||||||
int KW = kw, KH = kh, IC = ic, OC = oc;
|
int KW = kw, KH = kh, IC = ic, OC = oc;
|
||||||
int IW = iw, IH = ih, N = 1;
|
int IW = iw, IH = ih, N = 2;
|
||||||
// srand(time(NULL));
|
// srand(time(NULL));
|
||||||
|
|
||||||
// printf(" input: IC = %d, OC = %d, IW = %d, IH = %d \n ", IC, OC, IW, IH);
|
// printf(" input: IC = %d, OC = %d, IW = %d, IH = %d \n ", IC, OC, IW, IH);
|
||||||
|
|
@ -176,12 +176,19 @@ struct ggml_cgraph * build_graph_0(const test_model& model) {
|
||||||
|
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
int s0 = 1;
|
// int s0 = 1;
|
||||||
int s1 = 1;
|
// int s1 = 1;
|
||||||
int p0 = 1;
|
// int p0 = 1;
|
||||||
int p1 = 1;
|
// int p1 = 1;
|
||||||
int d0 = 1;
|
// int d0 = 1;
|
||||||
int d1 = 1;
|
// int d1 = 1;
|
||||||
|
|
||||||
|
int s0 = 3;
|
||||||
|
int s1 = 5;
|
||||||
|
int p0 = 5;
|
||||||
|
int p1 = 5;
|
||||||
|
int d0 = 2;
|
||||||
|
int d1 = 4;
|
||||||
|
|
||||||
// recalculate for avoid fragmentation
|
// recalculate for avoid fragmentation
|
||||||
struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
|
struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
|
||||||
|
|
@ -215,12 +222,21 @@ struct ggml_cgraph * build_graph_1(const test_model& model) {
|
||||||
|
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
int s0 = 1;
|
// int s0 = 1;
|
||||||
int s1 = 1;
|
// int s1 = 1;
|
||||||
int p0 = 1;
|
// int p0 = 1;
|
||||||
int p1 = 1;
|
// int p1 = 1;
|
||||||
int d0 = 1;
|
// int d0 = 1;
|
||||||
int d1 = 1;
|
// int d1 = 1;
|
||||||
|
|
||||||
|
|
||||||
|
int s0 = 3;
|
||||||
|
int s1 = 5;
|
||||||
|
int p0 = 5;
|
||||||
|
int p1 = 5;
|
||||||
|
int d0 = 2;
|
||||||
|
int d1 = 4;
|
||||||
|
|
||||||
|
|
||||||
// recalculate for avoid fragmentation
|
// recalculate for avoid fragmentation
|
||||||
// struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
|
// struct ggml_tensor* conv2d_res = ggml_conv_2d(ctx0, model.a, model.b, s0, s1, p0, p1, d0, d1);
|
||||||
|
|
@ -301,7 +317,8 @@ static std::vector<std::tuple<int, int, int, int, int, int>> configs = {
|
||||||
// std::make_tuple(960,320,104,152,3,3),
|
// std::make_tuple(960,320,104,152,3,3),
|
||||||
// std::make_tuple(1280,1280,26,38,3,3),
|
// std::make_tuple(1280,1280,26,38,3,3),
|
||||||
// std::make_tuple(1920,640,32,32,3,3)
|
// std::make_tuple(1920,640,32,32,3,3)
|
||||||
std::make_tuple(1280,1280,16,16,3,3),
|
// std::make_tuple(1280,1280,16,16,3,3),
|
||||||
|
std::make_tuple(32,12,141,133,3,3),
|
||||||
// std::make_tuple(32,8,24,24,3,3),
|
// std::make_tuple(32,8,24,24,3,3),
|
||||||
// std::make_tuple(640,640,64,64,3,3),
|
// std::make_tuple(640,640,64,64,3,3),
|
||||||
// std::make_tuple(320,640,32,32,3,3),
|
// std::make_tuple(320,640,32,32,3,3),
|
||||||
|
|
@ -718,12 +735,12 @@ int main(void)
|
||||||
// for(int i = 0; i < 26*38; i++) {
|
// for(int i = 0; i < 26*38; i++) {
|
||||||
for(int i = 0; i < conv2d_data.size(); i++) {
|
for(int i = 0; i < conv2d_data.size(); i++) {
|
||||||
float diff = fabs(im2col_data[i] - conv2d_data[i]);
|
float diff = fabs(im2col_data[i] - conv2d_data[i]);
|
||||||
// if(diff > 0.5) {
|
if(diff > 0.5) {
|
||||||
printf("(%7.3f, %7.3f, %.2f, %d) \n",
|
printf("(%7.3f, %7.3f, %.2f, %d) \n",
|
||||||
im2col_data[i], conv2d_data[i],
|
im2col_data[i], conv2d_data[i],
|
||||||
diff, i);
|
diff, i);
|
||||||
// break;
|
break;
|
||||||
// }
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_free(model.ctx);
|
ggml_free(model.ctx);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue