mirror of https://github.com/google/gemma.cpp.git
1.02x speedup: improve load balance and simplify parallelFor
Remove ParallelizeOne/TwoRange, use ParallelForAcross/WithinCluster instead.

PiperOrigin-RevId: 823388890
commit a48e614f64 (parent 085a34965a)
ops/matmul.h: 162 changed lines
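
Note: the recurring change below is that call sites no longer fetch `ctx.Worker(cluster_idx)` and add it to the pool-local worker index themselves; the new `ParallelForWithinCluster` adds that per-cluster base internally, so lambdas receive an already globally unique worker index. A minimal standalone sketch of the idea follows; the stride and names are illustrative stand-ins rather than gemma.cpp's API (the removed `HierarchicalParallelFor` derived the base as `cluster_idx * MaxWorkersPerCluster()`).

#include <cstddef>
#include <cstdio>

// Illustrative stand-in: each cluster gets a fixed stride of worker slots,
// so worker indices never collide across clusters (this matters when the
// index is used for thread-local storage, e.g. profiler slots).
size_t GlobalWorker(size_t cluster_idx, size_t max_workers_per_cluster,
                    size_t local_worker) {
  return cluster_idx * max_workers_per_cluster + local_worker;
}

int main() {
  const size_t kMaxWorkersPerCluster = 8;  // hypothetical topology
  for (size_t cluster = 0; cluster < 2; ++cluster) {
    for (size_t local = 0; local < 3; ++local) {
      std::printf("cluster %zu local %zu -> global worker %zu\n", cluster,
                  local, GlobalWorker(cluster, kMaxWorkersPerCluster, local));
    }
  }
  return 0;
}
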
@@ -103,17 +103,13 @@ struct MMParallelWithinCluster {
   template <class Func>
   void ForN(ThreadingContext& ctx, const IndexRange& range_n, size_t n_multiple,
             size_t inner_tasks, size_t cluster_idx, const Func& func) const {
-    HWY_DASSERT(1 <= inner_tasks && inner_tasks <= 4);
-    hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-    const size_t base = ctx.Worker(cluster_idx);
-    const IndexRangePartition ranges_n = StaticPartition(
-        range_n, cluster.NumWorkers() * inner_tasks, n_multiple);
-    ParallelizeOneRange(ranges_n, cluster,
-                        ctx.pool_callers.Get(Callers::kMMClusterForN),
-                        [&](const IndexRange& worker_range, size_t worker) {
-                          func(worker_range, base + worker);
-                        });
+    const hwy::pool::Caller caller =
+        ctx.pool_callers.Get(Callers::kMMClusterForN);
+    ParallelPartitionWithinCluster(
+        range_n, n_multiple, inner_tasks, ctx, cluster_idx, caller,
+        [&](const IndexRange& worker_range, size_t worker) {
+          func(worker_range, worker);
+        });
   }
 
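
The new `ForN` delegates to `ParallelPartitionWithinCluster`, which statically splits `range_n` into `NumWorkers() * inner_tasks` pieces rounded to `n_multiple`. Below is a standalone sketch of that rounding using plain integers instead of gemma.cpp's `IndexRange`/`StaticPartition`; the expected output matches the removed `TestParallelizeOneRange` further down, which splits [0, 10) into 2 pieces with multiple 4 and expects [0, 8) and [8, 10).

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

// Illustrative stand-in for StaticPartition: split [begin, end) into at most
// `num_pieces` pieces whose size is a multiple of `multiple` (except possibly
// the last piece).
std::vector<std::pair<size_t, size_t>> Partition(size_t begin, size_t end,
                                                 size_t num_pieces,
                                                 size_t multiple) {
  const size_t num = end - begin;
  // Round the per-piece size up to the requested multiple.
  size_t size = (num + num_pieces - 1) / num_pieces;
  size = (size + multiple - 1) / multiple * multiple;
  std::vector<std::pair<size_t, size_t>> pieces;
  for (size_t start = begin; start < end; start += size) {
    pieces.emplace_back(start, std::min(start + size, end));
  }
  return pieces;
}

int main() {
  // Mirrors the removed test: [0, 10) in 2 pieces with multiple 4.
  for (const auto& [b, e] : Partition(0, 10, 2, 4)) {
    std::printf("[%zu, %zu)\n", b, e);  // prints [0, 8) then [8, 10)
  }
  return 0;
}
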
@@ -122,80 +118,57 @@ struct MMParallelWithinCluster {
                       const IndexRangePartition& ranges_mc,
                       const IndexRangePartition& ranges_nc, size_t cluster_idx,
                       const Func& func) const {
-    hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-    const size_t base = ctx.Worker(cluster_idx);
+    const hwy::pool::Caller caller =
+        ctx.pool_callers.Get(Callers::kMMClusterForMCNC);
 
-    // Low-batch: avoid Divide/Remainder.
-    if (HWY_UNLIKELY(ranges_mc.NumTasks() == 1)) {
-      ParallelizeOneRange(ranges_nc, cluster,
-                          ctx.pool_callers.Get(Callers::kMMClusterForMCNC),
-                          [&](const IndexRange& range_nc, size_t worker) {
-                            func(ranges_mc.Range(0), range_nc, base + worker);
-                          });
-    } else {
-      ParallelizeTwoRanges(
-          ranges_mc, ranges_nc, cluster,
-          ctx.pool_callers.Get(Callers::kMMClusterForMCNC),
-          [&](const IndexRange& range_mc, const IndexRange& range_nc,
-              size_t worker) { func(range_mc, range_nc, base + worker); });
-    }
+    // We are running on one pool, hence collapse into a 1D range.
+    const hwy::Divisor div_m(static_cast<uint32_t>(ranges_mc.NumTasks()));
+    const auto get_mc = [&](uint64_t task) {
+      return ranges_mc.Range(div_m.Remainder(static_cast<uint32_t>(task)));
+    };
+    const auto get_nc = [&](uint64_t task) {
+      return ranges_nc.Range(div_m.Divide(static_cast<uint32_t>(task)));
+    };
+    const size_t num_tasks = ranges_mc.NumTasks() * ranges_nc.NumTasks();
+
+    ParallelForWithinCluster(num_tasks, ctx, cluster_idx, caller,
+                             [&](uint64_t task, size_t worker) {
+                               func(get_mc(task), get_nc(task), worker);
+                             });
   }
 
   template <class Func>
   void ForRangeMC(ThreadingContext& ctx, const IndexRange& range_mc,
                   size_t cluster_idx, const Func& func) const {
-    hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-    const size_t base = ctx.Worker(cluster_idx);
+    const hwy::pool::Caller caller =
+        ctx.pool_callers.Get(Callers::kMMClusterForMC);
 
-    cluster.Run(
-        range_mc.begin(), range_mc.end(),
-        ctx.pool_callers.Get(Callers::kMMClusterForMC),
-        [&](uint64_t row_a, size_t worker) { func(row_a, base + worker); });
+    ParallelForWithinCluster(
+        range_mc.Num(), ctx, cluster_idx, caller,
+        [&](uint64_t i, size_t worker) { func(range_mc.begin() + i, worker); });
   }
 };
 
 struct MMParallelHierarchical {
-  // Cluster/CCX-aware parallel-for over B rows in `range_n`. `n_multiple` is
-  // the granularity of per-cluster tasks. Calls `func(worker_range, worker)`.
+  // Similar to `HierarchicalParallelFor`, but over *sub-ranges* of B rows in
+  // `range_n` governed by `n_multiple` and `inner_tasks`.
   template <class Func>
   void ForN(ThreadingContext& ctx, const IndexRange& range_n, size_t n_multiple,
-            size_t inner_tasks, HWY_MAYBE_UNUSED size_t caller_cluster_idx,
+            size_t inner_tasks, size_t caller_cluster_idx,
             const Func& func) const {
-    HWY_DASSERT(1 <= inner_tasks && inner_tasks <= 4);
     HWY_DASSERT(caller_cluster_idx == 0);
+    (void)caller_cluster_idx;
     const hwy::pool::Caller caller = ctx.pool_callers.Get(Callers::kMMHierForN);
 
-    // Single cluster: parallel-for over static partition of `range_n`.
-    hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
-    const size_t num_clusters = all_clusters.NumWorkers();
-    if (num_clusters == 1) {
-      const size_t cluster_idx = 0;
-      hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-      const IndexRangePartition ranges_n = StaticPartition(
-          range_n, cluster.NumWorkers() * inner_tasks, n_multiple);
-      return ParallelizeOneRange(
-          ranges_n, cluster, caller,
+    // Assign clusters (if any) a sub-range of `range_n` (typically hundreds).
+    ParallelPartitionAcrossClusters(
+        range_n, n_multiple, /*inner_tasks=*/1, ctx, caller,
+        [&](const IndexRange& cluster_range, size_t cluster_idx) {
+          ParallelPartitionWithinCluster(
+              cluster_range, n_multiple, inner_tasks, ctx, cluster_idx, caller,
               [&](const IndexRange& worker_range, size_t worker) {
                 func(worker_range, worker);
               });
-    }
-
-    // Assign each cluster a sub-range of `range_n` (typically hundreds).
-    const IndexRangePartition ranges_n =
-        StaticPartition(range_n, num_clusters, n_multiple);
-    ParallelizeOneRange(
-        ranges_n, all_clusters, caller,
-        [&](const IndexRange& n_range, const size_t cluster_idx) {
-          hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-          const size_t cluster_base = ctx.Worker(cluster_idx);
-          // Parallel-for over sub-ranges of `cluster_range` within the cluster.
-          const IndexRangePartition worker_ranges = StaticPartition(
-              n_range, cluster.NumWorkers() * inner_tasks, n_multiple);
-          ParallelizeOneRange(
-              worker_ranges, cluster, caller,
-              [&](const IndexRange& worker_range, size_t worker) {
-                func(worker_range, cluster_base + worker);
-              });
         });
   }
 
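
`ForRangesMC_NC` now flattens the (mc, nc) grid into one task index and decodes it with `hwy::Divisor` (Remainder selects mc, Divide selects nc), so a single `Run` sees `ranges_mc.NumTasks() * ranges_nc.NumTasks()` tasks. A standalone sketch of the decode, with plain `%` and `/` standing in for `hwy::Divisor` and made-up task counts:

#include <cstddef>
#include <cstdio>

int main() {
  // Hypothetical task counts: 2 MC row blocks x 3 NC column blocks.
  const size_t num_mc = 2, num_nc = 3;
  const size_t num_tasks = num_mc * num_nc;
  // Decode as in the new code: mc varies fastest (Remainder), nc slowest
  // (Divide). hwy::Divisor only makes the division cheaper.
  for (size_t task = 0; task < num_tasks; ++task) {
    const size_t mc = task % num_mc;
    const size_t nc = task / num_mc;
    std::printf("task %zu -> mc %zu, nc %zu\n", task, mc, nc);
  }
  return 0;
}
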
@@ -205,57 +178,44 @@ struct MMParallelHierarchical {
   void ForRangesMC_NC(ThreadingContext& ctx,
                       const IndexRangePartition& ranges_mc,
                       const IndexRangePartition& ranges_nc,
-                      HWY_MAYBE_UNUSED size_t caller_cluster_idx,
-                      const Func& func) const {
+                      size_t caller_cluster_idx, const Func& func) const {
     HWY_DASSERT(caller_cluster_idx == 0);
+    (void)caller_cluster_idx;
     const hwy::pool::Caller caller =
         ctx.pool_callers.Get(Callers::kMMHierForMCNC);
 
-    hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
-    // `all_clusters` is a pool with one worker per cluster in a package.
-    const size_t num_clusters = all_clusters.NumWorkers();
-    // Single (big) cluster: collapse two range indices into one parallel-for
-    // to reduce the number of fork-joins.
-    if (num_clusters == 1) {
-      const size_t cluster_idx = 0;
-      hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-      // Low-batch: avoid Divide/Remainder.
-      if (HWY_UNLIKELY(ranges_mc.NumTasks() == 1)) {
-        return ParallelizeOneRange(
-            ranges_nc, cluster, caller,
-            [&](const IndexRange& range_nc, size_t worker) {
-              func(ranges_mc.Range(0), range_nc, worker);
-            });
-      } else {
-        return ParallelizeTwoRanges(
-            ranges_mc, ranges_nc, cluster, caller,
-            [&](const IndexRange& range_mc, const IndexRange& range_nc,
-                size_t worker) { func(range_mc, range_nc, worker); });
-      }
-    }
-
-    // Multiple clusters: N across clusters (both are usually the larger), and
-    // M within each cluster. We assume auto-tuning finds small MC/NC tasks.
-    ParallelizeOneRange(
-        ranges_nc, all_clusters, caller,
-        [&](const IndexRange range_nc, size_t cluster_idx) {
-          const size_t cluster_base = ctx.Worker(cluster_idx);
-          hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-          ParallelizeOneRange(ranges_mc, cluster, caller,
-                              [&](const IndexRange& range_mc, size_t worker) {
-                                func(range_mc, range_nc, cluster_base + worker);
-                              });
+    // Collapse two range indices into a 1D range for better load-balancing,
+    // because `ranges_mc` may just have one task.
+    const hwy::Divisor div_m(static_cast<uint32_t>(ranges_mc.NumTasks()));
+    const auto get_mc = [&](uint64_t task) {
+      return ranges_mc.Range(div_m.Remainder(static_cast<uint32_t>(task)));
+    };
+    const auto get_nc = [&](uint64_t task) {
+      return ranges_nc.Range(div_m.Divide(static_cast<uint32_t>(task)));
+    };
+    const IndexRange all_range(0, ranges_mc.NumTasks() * ranges_nc.NumTasks());
+
+    ParallelPartitionAcrossClusters(
+        all_range, /*task_multiple=*/1, /*inner_tasks=*/1, ctx, caller,
+        [&](const IndexRange& cluster_range, size_t cluster_idx) {
+          ParallelForWithinCluster(cluster_range.Num(), ctx, cluster_idx,
                                   caller, [&](uint64_t i, size_t worker) {
+                                     const size_t task =
+                                         cluster_range.begin() + i;
+                                     func(get_mc(task), get_nc(task), worker);
+                                   });
         });
   }
 
-  // Calls `func(row_a, worker)` in parallel.
+  // No multiple/inner_tasks, so this is just HierarchicalParallelFor.
   template <class Func>
   void ForRangeMC(ThreadingContext& ctx, const IndexRange& range_mc,
                   size_t caller_cluster_idx, const Func& func) const {
-    HierarchicalParallelFor(range_mc.Num(), ctx, Callers::kMMHierForMC,
-                            [&](size_t task, size_t worker) {
-                              func(range_mc.begin() + task, worker);
-                            });
+    HWY_DASSERT(caller_cluster_idx == 0);
+    (void)caller_cluster_idx;
+    HierarchicalParallelFor(
+        range_mc.Num(), ctx, Callers::kMMHierForMC,
+        [&](size_t i, size_t worker) { func(range_mc.begin() + i, worker); });
   }
 };
 
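
The hierarchical overload applies the same collapse, but first hands each cluster a contiguous sub-range of the flattened task range and then loops over it within the cluster, reconstructing the global task as `cluster_range.begin() + i`. A standalone sketch with serial loops standing in for the two pools (cluster count and task count are made up):

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t num_tasks = 10;    // e.g. collapsed mc*nc tasks
  const size_t num_clusters = 3;  // hypothetical topology
  // Outer level: each cluster gets a contiguous sub-range (StaticPartition
  // with multiple 1). Inner level: its workers iterate that sub-range.
  const size_t per_cluster = (num_tasks + num_clusters - 1) / num_clusters;
  for (size_t cluster = 0; cluster < num_clusters; ++cluster) {
    const size_t begin = std::min(cluster * per_cluster, num_tasks);
    const size_t end = std::min(begin + per_cluster, num_tasks);
    for (size_t i = 0; i < end - begin; ++i) {
      const size_t task = begin + i;  // as in cluster_range.begin() + i
      std::printf("cluster %zu handles task %zu\n", cluster, task);
    }
  }
  return 0;
}
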
@@ -195,9 +195,10 @@ HWY_INLINE void MatMulSlow(const MatPtrT<TA> A, const MatPtrT<TB> B,
   const size_t multiple = env.ctx.allocator.QuantumBytes() / sizeof(TB);
   const IndexRangePartition get_col_c =
       StaticPartition(all_cols_c, all_clusters.NumWorkers(), multiple);
-  ParallelizeOneRange(
-      get_col_c, all_clusters, env.ctx.pool_callers.Get(Callers::kTest),
-      [&](const IndexRange& cols_c, size_t cluster_idx) HWY_ATTR {
+  ParallelForAcrossClusters(
+      get_col_c.NumTasks(), env.ctx, env.ctx.pool_callers.Get(Callers::kTest),
+      [&](size_t range_idx, size_t cluster_idx) HWY_ATTR {
+        const IndexRange cols_c = get_col_c.Range(range_idx);
         for (size_t r : all_rows_c) {
           TC* HWY_RESTRICT C_row = C.Row(r);
           for (size_t c : cols_c) {
@@ -262,43 +262,6 @@ static inline IndexRangePartition StaticPartition(const IndexRange& range,
   return IndexRangePartition(range, size);
 }
 
-// Parallel-for over a single range. This takes care of translating the task
-// index to a range.
-template <class Func>
-void ParallelizeOneRange(const IndexRangePartition& get1, hwy::ThreadPool& pool,
-                         hwy::pool::Caller caller, const Func& func) {
-  const size_t num_tasks = get1.NumTasks();
-  pool.Run(0, num_tasks, caller, [&](uint64_t task, size_t thread) {
-    const IndexRange range1 = get1.Range(task);
-    func(range1, thread);
-  });
-}
-
-// Parallel-for over the Cartesian product of the two sets of ranges. This
-// combines their indices into a single 'task' so they can be executed by one
-// `pool.Run`, which increases the amount of work available to workers and
-// reduces fork-join overhead vs. nested parallel-for loops. Calls `func` with
-// the two ranges and the thread index within `pool`.
-template <class Func>
-void ParallelizeTwoRanges(const IndexRangePartition& get1,
-                          const IndexRangePartition& get2,
-                          hwy::ThreadPool& pool, hwy::pool::Caller caller,
-                          const Func& func) {
-  const hwy::Divisor div1(static_cast<uint32_t>(get1.NumTasks()));
-
-  const size_t num_tasks = get1.NumTasks() * get2.NumTasks();
-  pool.Run(0, num_tasks, caller, [&](uint64_t task, size_t thread) {
-    HWY_DASSERT(task < (uint64_t{1} << 32));
-    const size_t idx2 = div1.Divide(static_cast<uint32_t>(task));
-    const size_t idx1 = div1.Remainder(static_cast<uint32_t>(task));
-    HWY_DASSERT(idx1 < get1.NumTasks());
-    HWY_DASSERT(idx2 < get2.NumTasks());
-    const IndexRange range1 = get1.Range(idx1);
-    const IndexRange range2 = get2.Range(idx2);
-    func(range1, range2, thread);
-  });
-}
-
 }  // namespace gcpp
 
 #endif  // THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_H_
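
The comment on the removed `ParallelizeTwoRanges` states the rationale that the new code keeps: folding both indices into one task list yields a single fork-join whose task count is the full Cartesian product, instead of nested parallel-for loops. A small illustrative calculation (task counts are made up):

#include <cstdio>

int main() {
  // Illustrative numbers: 4 MC tasks x 8 NC tasks.
  const int mc = 4, nc = 8;
  // Nested parallel-for: one outer fork-join plus one per outer task.
  const int nested_fork_joins = 1 + mc;
  // Collapsed 1D parallel-for over mc*nc tasks: a single fork-join, and every
  // worker can pick from all 32 tasks, improving load balance.
  const int collapsed_fork_joins = 1;
  std::printf("nested: %d fork-joins; collapsed: %d fork-join over %d tasks\n",
              nested_fork_joins, collapsed_fork_joins, mc * nc);
  return 0;
}
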
@@ -154,42 +154,96 @@ enum class ParallelismStrategy : uint8_t {
   kHierarchical,
 };
 
-// Calls `func(task, worker)` for each task in `[0, num_tasks)`. Parallelizes
-// over clusters of ONE package, then within each cluster.
+// Helper functions used to implement `ParallelFor`, also reused in multiple
+// places. User code should call `ParallelFor` instead, which accepts the more
+// convenient `Callers` enum.
+//
+// These call `func(task, worker)` for each task in `[0, num_tasks)`.
+
+// NOTE: the worker argument is actually the `cluster_idx`, so that `Func` can
+// pass that to `ParallelForWithinCluster`.
+template <class Func>
+void ParallelForAcrossClusters(size_t num_tasks, ThreadingContext& ctx,
+                               hwy::pool::Caller caller, const Func& func) {
+  ctx.pools.AllClusters().Run(
+      0, num_tasks, caller,
+      [&](uint64_t task, size_t cluster_idx) { func(task, cluster_idx); });
+}
+
+template <class Func>
+void ParallelForWithinCluster(size_t num_tasks, ThreadingContext& ctx,
+                              size_t cluster_idx, hwy::pool::Caller caller,
+                              const Func& func) {
+  const size_t cluster_base = ctx.Worker(cluster_idx);
+  ctx.pools.Cluster(cluster_idx)
+      .Run(0, num_tasks, caller, [&](uint64_t task, size_t worker) {
+        func(task, cluster_base + worker);
+      });
+}
+
+// Calls `func(range, cluster_idx)`, for passing to `*WithinCluster`.
+template <class Func>
+void ParallelPartitionAcrossClusters(const IndexRange range,
+                                     size_t task_multiple, size_t inner_tasks,
+                                     ThreadingContext& ctx,
+                                     hwy::pool::Caller caller,
+                                     const Func& func) {
+  HWY_DASSERT(1 <= inner_tasks && inner_tasks <= 4);
+  const IndexRangePartition ranges = StaticPartition(
+      range, ctx.pools.NumClusters() * inner_tasks, task_multiple);
+  ParallelForAcrossClusters(ranges.NumTasks(), ctx, caller,
+                            [&](uint64_t task, size_t cluster_idx) {
+                              func(ranges.Range(task), cluster_idx);
+                            });
+}
+
+// Calls `func(range, worker)`.
+template <class Func>
+void ParallelPartitionWithinCluster(const IndexRange range,
+                                    size_t task_multiple, size_t inner_tasks,
+                                    ThreadingContext& ctx, size_t cluster_idx,
+                                    hwy::pool::Caller caller,
+                                    const Func& func) {
+  HWY_DASSERT(1 <= inner_tasks && inner_tasks <= 4);
+  const size_t num_workers = ctx.pools.Cluster(cluster_idx).NumWorkers();
+  const IndexRangePartition ranges =
+      StaticPartition(range, num_workers * inner_tasks, task_multiple);
+  ParallelForWithinCluster(
+      ranges.NumTasks(), ctx, cluster_idx, caller,
+      [&](uint64_t task, size_t worker) { func(ranges.Range(task), worker); });
+}
+
+// Parallelizes across clusters, then within each cluster.
 template <class Func>
 void HierarchicalParallelFor(size_t num_tasks, ThreadingContext& ctx,
                              Callers callers, const Func& func) {
   const hwy::pool::Caller caller = ctx.pool_callers.Get(callers);
-  // If few tasks, run on a single cluster. Also avoids a bit of overhead if
-  // there is only one cluster.
-  hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
-  const size_t num_clusters = all_clusters.NumWorkers();
-  hwy::ThreadPool& cluster = ctx.pools.Cluster(0);
-  if (num_clusters == 1 || num_tasks <= cluster.NumWorkers()) {
-    return cluster.Run(0, num_tasks, caller, [&](uint64_t task, size_t thread) {
-      func(task, thread);
-    });
+  // If at most one task per cluster worker, run on a single cluster to avoid
+  // the expensive cross-cluster barrier.
+  {
+    const size_t cluster_idx = 0;
+    const size_t cluster_workers = ctx.pools.Cluster(cluster_idx).NumWorkers();
+    if (HWY_UNLIKELY(num_tasks <= cluster_workers)) {
+      return ParallelForWithinCluster(num_tasks, ctx, cluster_idx, caller,
+                                      func);
+    }
   }
 
-  // Assign each cluster a sub-range.
-  const IndexRangePartition ranges =
-      StaticPartition(IndexRange(0, num_tasks), num_clusters, 1);
-  ParallelizeOneRange(ranges, all_clusters, caller,
-                      [&](const IndexRange& range, const size_t cluster_idx) {
-                        hwy::ThreadPool& cluster =
-                            ctx.pools.Cluster(cluster_idx);
-                        const size_t cluster_base =
-                            cluster_idx * ctx.pools.MaxWorkersPerCluster();
-                        cluster.Run(range.begin(), range.end(), caller,
-                                    [&](uint64_t task, size_t thread) {
-                                      func(task, cluster_base + thread);
-                                    });
-                      });
+  ParallelPartitionAcrossClusters(
+      IndexRange(0, num_tasks), /*task_multiple=*/1, /*inner_tasks=*/1, ctx,
+      caller, [&](const IndexRange& cluster_range, size_t cluster_idx) {
+        ParallelForWithinCluster(cluster_range.Num(), ctx, cluster_idx, caller,
+                                 [&](uint64_t i, size_t worker) {
+                                   func(cluster_range.begin() + i, worker);
+                                 });
+      });
 }
 
 // Calls `func(task, worker)` for each `task` in `[0, num_tasks)`, with the
-// number/type of workers determined by `parallelism`. `cluster_idx` is for
-// `parallelism == kWithinCluster`, and should be 0 if unknown.
+// number/type of workers determined by `parallelism`. NOTE: worker is actually
+// `cluster_idx` for `kAcrossClusters`. The `cluster_idx` argument is for
+// `parallelism == {kWithinCluster, kNone}`, and should be 0 if unknown.
 template <class Func>
 void ParallelFor(ParallelismStrategy parallelism, size_t num_tasks,
                  ThreadingContext& ctx, size_t cluster_idx, Callers callers,
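
The rewritten `HierarchicalParallelFor` stays on cluster 0 whenever there is at most one task per worker of that cluster, since the cross-cluster barrier would cost more than the extra parallelism gains. A standalone sketch of that decision (worker and task counts are hypothetical):

#include <cstddef>
#include <cstdio>

// Illustrative stand-in for the dispatch in the rewritten
// HierarchicalParallelFor: if cluster 0 already has a worker per task, the
// cross-cluster fork-join is not worth paying.
const char* ChoosePool(size_t num_tasks, size_t cluster_workers) {
  return num_tasks <= cluster_workers ? "single cluster"
                                      : "across clusters, then within each";
}

int main() {
  const size_t cluster_workers = 16;  // hypothetical cluster size
  const size_t task_counts[] = {8, 16, 64};
  for (size_t num_tasks : task_counts) {
    std::printf("%zu tasks -> %s\n", num_tasks,
                ChoosePool(num_tasks, cluster_workers));
  }
  return 0;
}
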
@@ -212,37 +266,25 @@ void ParallelFor(ParallelismStrategy parallelism, size_t num_tasks,
     }
 
     case ParallelismStrategy::kAcrossClusters:
-      return ctx.pools.AllClusters().Run(
-          0, num_tasks, caller,
+      return ParallelForAcrossClusters(
+          num_tasks, ctx, caller,
           [&](uint64_t task, size_t cluster_idx) { func(task, cluster_idx); });
 
-    case ParallelismStrategy::kWithinCluster: {
-      // Ensure the worker argument is unique across clusters, because it is
-      // used for TLS indexing for example in profiler.h.
-      const size_t base = ctx.Worker(cluster_idx);
-      return ctx.pools.Cluster(cluster_idx)
-          .Run(0, num_tasks, caller, [&](uint64_t task, size_t worker) {
-            func(task, base + worker);
-          });
-    }
+    case ParallelismStrategy::kWithinCluster:
+      return ParallelForWithinCluster(num_tasks, ctx, cluster_idx, caller,
+                                      func);
 
-    case ParallelismStrategy::kFlat: {
-      // Check for single cluster; if not, we must compute `cluster_base` for
-      // consistent and non-overlapping worker indices.
-      hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
-      const size_t num_clusters = all_clusters.NumWorkers();
-      if (num_clusters == 1) {
-        return ctx.pools.Cluster(cluster_idx)
-            .Run(0, num_tasks, caller,
-                 [&](uint64_t task, size_t worker) { func(task, worker); });
+    case ParallelismStrategy::kFlat:
+      // Choose a single pool: the only cluster, or across all clusters
+      // (slower synchronization, but more memory bandwidth).
+      if (HWY_UNLIKELY(ctx.pools.NumClusters() == 1)) {
+        return ParallelForWithinCluster(num_tasks, ctx, cluster_idx, caller,
+                                        func);
       }
-
-      return all_clusters.Run(0, num_tasks, caller,
-                              [&](uint64_t task, size_t cluster_idx) {
-                                const size_t worker = ctx.Worker(cluster_idx);
-                                func(task, worker);
-                              });
-    }
+      return ParallelForAcrossClusters(num_tasks, ctx, caller,
+                                       [&](uint64_t task, size_t cluster_idx) {
+                                         func(task, ctx.Worker(cluster_idx));
+                                       });
 
     case ParallelismStrategy::kHierarchical:
       return HierarchicalParallelFor(num_tasks, ctx, callers, func);
@@ -202,57 +202,6 @@ TEST(ThreadingTest, TestStaticPartition) {
   }
 }
 
-TEST(ThreadingTest, TestParallelizeOneRange) {
-  const IndexRange range(0, 10);
-  const IndexRangePartition partition = StaticPartition(range, 2, 4);
-  hwy::ThreadPool null_pool(0);
-  size_t calls = 0;
-  ParallelizeOneRange(partition, null_pool, kCaller,
-                      [&](const IndexRange& range, size_t) {
-                        if (++calls == 1) {
-                          HWY_ASSERT(range.begin() == 0 && range.end() == 8);
-                        } else {
-                          HWY_ASSERT(range.begin() == 8 && range.end() == 10);
-                        }
-                      });
-  HWY_ASSERT(calls == 2);
-}
-
-TEST(ThreadingTest, TestParallelizeTwoRanges) {
-  const IndexRangePartition partition1 =
-      StaticPartition(IndexRange(0, 10), 2, 4);
-  const IndexRangePartition partition2 =
-      MaxSizePartition(IndexRange(128, 256), 32, 32);
-  HWY_ASSERT(partition2.NumTasks() == 4);
-  hwy::ThreadPool null_pool(0);
-  {
-    size_t calls = 0;
-    ParallelizeTwoRanges(
-        partition1, partition2, null_pool, kCaller,
-        [&](const IndexRange& range1, const IndexRange& range2, size_t) {
-          ++calls;
-          HWY_ASSERT(range1.begin() == 0 || range1.begin() == 8);
-          HWY_ASSERT(range2.begin() % 32 == 0);
-          HWY_ASSERT(range2.Num() % 32 == 0);
-        });
-    HWY_ASSERT(calls == 2 * 4);
-  }
-
-  // Also swap order to test Remainder() logic.
-  {
-    size_t calls = 0;
-    ParallelizeTwoRanges(
-        partition2, partition1, null_pool, kCaller,
-        [&](const IndexRange& range2, const IndexRange& range1, size_t) {
-          ++calls;
-          HWY_ASSERT(range1.begin() == 0 || range1.begin() == 8);
-          HWY_ASSERT(range2.begin() % 32 == 0);
-          HWY_ASSERT(range2.Num() % 32 == 0);
-        });
-    HWY_ASSERT(calls == 2 * 4);
-  }
-}
-
 static constexpr size_t kU64PerThread = HWY_ALIGNMENT / sizeof(size_t);
 static uint64_t outputs[hwy::kMaxLogicalProcessors * kU64PerThread];