WIP: bug fix
This commit is contained in:
parent
be25be8ed3
commit
6c90c20cb1
|
|
@ -1081,7 +1081,11 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
|
|||
tileMemcpySwizzleB<BN, NUM_THREADS>(B_block_gmem, B_block_smem, weightKOffset, param);
|
||||
|
||||
// construct const pointers to warp tiles for use inside the inner loop
|
||||
|
||||
// if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x ==0 && blockIdx.y ==0){
|
||||
// for(int i = 0; i < 32; ++i)
|
||||
// printf("%.2f,", __half2float(A_block_smem[i]));
|
||||
// printf("\n");
|
||||
// }
|
||||
|
||||
int offset_direction = 1;
|
||||
|
||||
|
|
@ -1127,6 +1131,14 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
|
|||
}
|
||||
}
|
||||
}
|
||||
+ if(threadIdx.x == 0 && threadIdx.y ==0 && blockIdx.x ==0 && blockIdx.y ==0){
|
||||
+ printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(acc_register_[0][0][0]), __half2float(acc_register_[0][0][1]),
|
||||
+ __half2float(acc_register_[0][0][2]), __half2float(acc_register_[0][0][3]));
|
||||
+ printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(A_register_[0][0][0]), __half2float(A_register_[0][0][1]),
|
||||
+ __half2float(A_register_[0][0][2]), __half2float(A_register_[0][0][3]));
|
||||
+ printf(" %d: %f, %f, %f, %f \n", block_k, __half2float(B_register_[0][0][0]), __half2float(B_register_[0][0][1]),
|
||||
+ __half2float(B_register_[0][0][2]), __half2float(B_register_[0][0][3]));
|
||||
+ }
|
||||
|
||||
|
||||
if (block_k != num_block_tiles_k)
|
||||
|
|
@ -1141,6 +1153,7 @@ static __global__ void conv2d_implicit_kernel(const half * __restrict__ input,
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// reuse smem
|
||||
half *smemoutput = shmem;
|
||||
const uint lane_id = threadIdx.x % WARPSIZE;
|
||||
|
|
|
|||
|
|
@ -159,7 +159,8 @@ __device__ __forceinline__ void tileMemcpySwizzleA(
|
|||
#pragma unroll
|
||||
for (unsigned int i = 0; i < NUM_ITERS; i++)
|
||||
{
|
||||
- unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row;
|
||||
+ // unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row;
|
||||
+ unsigned int gemm_i = blockIdx.y * TILE_ROWS + thread_row;
|
||||
unsigned int n = fastdiv(gemm_i, param.OHOW_fastdiv);
|
||||
unsigned int npq_res = fastmodulo(gemm_i, param.OHOW_fastdiv);
|
||||
int posh_ori = fastdiv(npq_res, param.OW_fastdiv) * param.u - param.p;
|
||||
|
|
@ -227,7 +228,8 @@ __device__ __forceinline__ void tileMemcpyLoadA(
|
|||
// const unsigned int src_index = thread_row * src_stride_vectorized + thread_col;
|
||||
// dst_reg[i] = src_float4[src_index];
|
||||
// thread_row += ROW_STEP;
|
||||
- unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row;
|
||||
+ // unsigned int gemm_i = blockDim.y * TILE_ROWS + thread_row;
|
||||
+ unsigned int gemm_i = blockIdx.y * TILE_ROWS + thread_row;
|
||||
unsigned int n = fastdiv(gemm_i, param.OHOW_fastdiv);
|
||||
unsigned int npq_res = fastmodulo(gemm_i, param.OHOW_fastdiv);
|
||||
int posh_ori = fastdiv(npq_res, param.OW_fastdiv) * param.u - param.p;
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ void load_model(test_model & model, int ic, int oc, int iw, int ih, bool use_gpu
|
|||
// Initialize adata
|
||||
std::vector<float> adata(KW * KH * IC * OC);
|
||||
for (int i = 0; i < KW * KH * IC * OC; i++) {
|
||||
- adata[i] = 2.5f;
|
||||
+ adata[i] = 2.f;
|
||||
}
|
||||
|
||||
// Convert adata to fp16 format
|
||||
|
|
@ -344,7 +344,7 @@ int main(void)
|
|||
// std::make_tuple(640,640,52,76),
|
||||
// std::make_tuple(640,640,104,152),
|
||||
// std::make_tuple(960,320,104,152),
|
||||
- std::make_tuple(160,1280,26,38),
|
||||
+ std::make_tuple(128,1280,26,38),
|
||||
// std::make_tuple(1280,640,52,76),
|
||||
// std::make_tuple(1920,1280,26,38),
|
||||
// std::make_tuple(2560,1280,26,38),
|
||||
|
|
@ -398,7 +398,8 @@ int main(void)
|
|||
struct ggml_cgraph * gf_res_1 = NULL;
|
||||
|
||||
double run_time1;
|
||||
- std::vector<float> wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
|
||||
+ // std::vector<float> wino_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
|
||||
+ conv2d_data = compute_graph(model, allocr, build_graph_1, iterations, &run_time1);
|
||||
|
||||
|
||||
ggml_gallocr_free(allocr);
|
||||
|
|
@ -419,7 +420,7 @@ int main(void)
|
|||
struct ggml_cgraph * gf_res_2 = NULL;
|
||||
|
||||
double run_time2;
|
||||
- wino_data = compute_graph(model, allocr, build_graph_2, iterations, &run_time2);
|
||||
+ std::vector<float> wino_data = compute_graph(model, allocr, build_graph_2, iterations, &run_time2);
|
||||
|
||||
|
||||
if(k==0) {
|
||||
|
|
@ -436,15 +437,15 @@ int main(void)
|
|||
|
||||
|
||||
// for(int i = 0; i < ggml_nelements(wino_res); i++) {
|
||||
- for(int i = 0; i < 26*38; i++) {
|
||||
- float diff = fabs(conv2d_data[i] - wino_data[i]);
|
||||
- // if(diff > 1.e-4) {
|
||||
- printf("(%f, %f, %f, %d) \n",
|
||||
- conv2d_data[i],
|
||||
- wino_data[i], diff, i);
|
||||
- // break;
|
||||
- // }
|
||||
- }
|
||||
+ // for(int i = 0; i < 26*38; i++) {
|
||||
+ // float diff = fabs(conv2d_data[i] - wino_data[i]);
|
||||
+ // // if(diff > 1.e-4) {
|
||||
+ // printf("(%f, %f, %f, %d) \n",
|
||||
+ // conv2d_data[i],
|
||||
+ // wino_data[i], diff, i);
|
||||
+ // // break;
|
||||
+ // // }
|
||||
+ // }
|
||||
|
||||
ggml_free(model.ctx);
|
||||
ggml_backend_buffer_free(model.buffer);
|
||||
|
|
|
|||
Loading…
Reference in New Issue