Keep the barrier at the end of the loop to synchronise threads, reducing cache-line contention (cache HITM)
This improves throughput in cases where threads have to wait due to a lack of work, which causes the process to spend many cycles in a spin loop. It also makes it possible to replace the dynamic chunk counter with static stride partitioning, which further helps by eliminating the shared counter.
* remove one barrier in sgemm()
* static stride partitioning
This commit is contained in:
parent
0de8878c96
commit
e516cd0056
|
|
@ -446,10 +446,7 @@ class tinyBLAS {
|
|||
ggml_threadpool_chunk_set(params->threadpool, params->nth);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
||||
int64_t job = params->ith;
|
||||
while (job < nb_job) {
|
||||
for (int64_t job = params->ith; job < nb_job; job += params->nth) {
|
||||
const int64_t ii = (job % ytiles) * RM * BM;
|
||||
const int64_t jb = job / ytiles;
|
||||
const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN);
|
||||
|
|
@ -472,7 +469,6 @@ class tinyBLAS {
|
|||
GGML_ASSERT(jj == jj2);
|
||||
}
|
||||
|
||||
job = ggml_threadpool_chunk_add(params->threadpool, 1);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
|
|
|||
Loading…
Reference in New Issue