Keep the barrier at the end of the loop to synchronise threads, reducing cache-line contention (cache HITM)
This improves throughput in cases where threads have to wait due to a lack of work, which causes the process to spend many cycles in a spin loop. It also allows the dynamic chunk counter to be replaced with static stride partitioning, which further helps by eliminating the shared counter. * remove one barrier in sgemm() * static stride partitioning
This commit is contained in:
parent
0de8878c96
commit
e516cd0056
|
|
@ -446,10 +446,7 @@ class tinyBLAS {
|
||||||
ggml_threadpool_chunk_set(params->threadpool, params->nth);
|
ggml_threadpool_chunk_set(params->threadpool, params->nth);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_barrier(params->threadpool);
|
for (int64_t job = params->ith; job < nb_job; job += params->nth) {
|
||||||
|
|
||||||
int64_t job = params->ith;
|
|
||||||
while (job < nb_job) {
|
|
||||||
const int64_t ii = (job % ytiles) * RM * BM;
|
const int64_t ii = (job % ytiles) * RM * BM;
|
||||||
const int64_t jb = job / ytiles;
|
const int64_t jb = job / ytiles;
|
||||||
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
|
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
|
||||||
|
|
@ -472,7 +469,6 @@ class tinyBLAS {
|
||||||
GGML_ASSERT(jj == jj2);
|
GGML_ASSERT(jj == jj2);
|
||||||
}
|
}
|
||||||
|
|
||||||
job = ggml_threadpool_chunk_add(params->threadpool, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_barrier(params->threadpool);
|
ggml_barrier(params->threadpool);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue