1.02x speedup: improve load balance and simplify parallelFor

Remove ParallelizeOne/TwoRange, use ParallelForAcross/WithinCluster instead.

commit a48e614f64 (parent 085a34965a)
Author: Jan Wassenberg, committed by Copybara-Service
Date: 2025-10-24 00:17:45 -07:00
PiperOrigin-RevId: 823388890

5 changed files with 166 additions and 251 deletions
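The new helpers compose two levels of static partitioning: a range is first split across clusters, then each cluster's sub-range is split across that cluster's workers, with worker indices offset per cluster so they stay globally unique. A minimal standalone sketch of that pattern (plain loops stand in for the hwy::ThreadPool fork-joins; `Partition` is a simplified, hypothetical stand-in, not gcpp's StaticPartition):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

// Simplified stand-in for StaticPartition: split [begin, end) into at most
// `max_tasks` contiguous pieces whose sizes are multiples of `multiple`
// (except possibly the last piece).
std::vector<std::pair<size_t, size_t>> Partition(size_t begin, size_t end,
                                                 size_t max_tasks,
                                                 size_t multiple) {
  std::vector<std::pair<size_t, size_t>> ranges;
  const size_t num = end - begin;
  size_t size = (num + max_tasks - 1) / max_tasks;     // ceil division
  size = (size + multiple - 1) / multiple * multiple;  // round up to multiple
  for (size_t b = begin; b < end; b += size) {
    ranges.emplace_back(b, std::min(end, b + size));
  }
  return ranges;
}

int main() {
  const size_t kNumClusters = 2, kWorkersPerCluster = 4, kMultiple = 4;
  // "Across clusters": one sub-range per cluster.
  const auto cluster_ranges = Partition(0, 100, kNumClusters, kMultiple);
  for (size_t cluster_idx = 0; cluster_idx < cluster_ranges.size(); ++cluster_idx) {
    const auto [cb, ce] = cluster_ranges[cluster_idx];
    // "Within cluster": split further among this cluster's workers.
    const auto worker_ranges = Partition(cb, ce, kWorkersPerCluster, kMultiple);
    for (size_t w = 0; w < worker_ranges.size(); ++w) {
      // Offset worker indices per cluster so they remain globally unique,
      // analogous to ctx.Worker(cluster_idx) in the real code.
      const size_t worker = cluster_idx * kWorkersPerCluster + w;
      printf("cluster %zu worker %zu: rows [%zu, %zu)\n", cluster_idx, worker,
             worker_ranges[w].first, worker_ranges[w].second);
    }
  }
  return 0;
}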

@@ -103,17 +103,13 @@ struct MMParallelWithinCluster {
   template <class Func>
   void ForN(ThreadingContext& ctx, const IndexRange& range_n, size_t n_multiple,
             size_t inner_tasks, size_t cluster_idx, const Func& func) const {
-    HWY_DASSERT(1 <= inner_tasks && inner_tasks <= 4);
-    hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-    const size_t base = ctx.Worker(cluster_idx);
-    const IndexRangePartition ranges_n = StaticPartition(
-        range_n, cluster.NumWorkers() * inner_tasks, n_multiple);
-    ParallelizeOneRange(ranges_n, cluster,
-                        ctx.pool_callers.Get(Callers::kMMClusterForN),
-                        [&](const IndexRange& worker_range, size_t worker) {
-                          func(worker_range, base + worker);
-                        });
+    const hwy::pool::Caller caller =
+        ctx.pool_callers.Get(Callers::kMMClusterForN);
+    ParallelPartitionWithinCluster(
+        range_n, n_multiple, inner_tasks, ctx, cluster_idx, caller,
+        [&](const IndexRange& worker_range, size_t worker) {
+          func(worker_range, worker);
+        });
   }
@@ -122,80 +118,57 @@ struct MMParallelWithinCluster {
                       const IndexRangePartition& ranges_mc,
                       const IndexRangePartition& ranges_nc, size_t cluster_idx,
                       const Func& func) const {
-    hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-    const size_t base = ctx.Worker(cluster_idx);
-    // Low-batch: avoid Divide/Remainder.
-    if (HWY_UNLIKELY(ranges_mc.NumTasks() == 1)) {
-      ParallelizeOneRange(ranges_nc, cluster,
-                          ctx.pool_callers.Get(Callers::kMMClusterForMCNC),
-                          [&](const IndexRange& range_nc, size_t worker) {
-                            func(ranges_mc.Range(0), range_nc, base + worker);
-                          });
-    } else {
-      ParallelizeTwoRanges(
-          ranges_mc, ranges_nc, cluster,
-          ctx.pool_callers.Get(Callers::kMMClusterForMCNC),
-          [&](const IndexRange& range_mc, const IndexRange& range_nc,
-              size_t worker) { func(range_mc, range_nc, base + worker); });
-    }
+    const hwy::pool::Caller caller =
+        ctx.pool_callers.Get(Callers::kMMClusterForMCNC);
+    // We are running on one pool, hence collapse into a 1D range.
+    const hwy::Divisor div_m(static_cast<uint32_t>(ranges_mc.NumTasks()));
+    const auto get_mc = [&](uint64_t task) {
+      return ranges_mc.Range(div_m.Remainder(static_cast<uint32_t>(task)));
+    };
+    const auto get_nc = [&](uint64_t task) {
+      return ranges_nc.Range(div_m.Divide(static_cast<uint32_t>(task)));
+    };
+    const size_t num_tasks = ranges_mc.NumTasks() * ranges_nc.NumTasks();
+    ParallelForWithinCluster(num_tasks, ctx, cluster_idx, caller,
+                             [&](uint64_t task, size_t worker) {
+                               func(get_mc(task), get_nc(task), worker);
+                             });
   }

   template <class Func>
   void ForRangeMC(ThreadingContext& ctx, const IndexRange& range_mc,
                   size_t cluster_idx, const Func& func) const {
-    hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-    const size_t base = ctx.Worker(cluster_idx);
-    cluster.Run(
-        range_mc.begin(), range_mc.end(),
-        ctx.pool_callers.Get(Callers::kMMClusterForMC),
-        [&](uint64_t row_a, size_t worker) { func(row_a, base + worker); });
+    const hwy::pool::Caller caller =
+        ctx.pool_callers.Get(Callers::kMMClusterForMC);
+    ParallelForWithinCluster(
+        range_mc.Num(), ctx, cluster_idx, caller,
+        [&](uint64_t i, size_t worker) { func(range_mc.begin() + i, worker); });
   }
 };

 struct MMParallelHierarchical {
-  // Cluster/CCX-aware parallel-for over B rows in `range_n`. `n_multiple` is
-  // the granularity of per-cluster tasks. Calls `func(worker_range, worker)`.
+  // Similar to `HierarchicalParallelFor`, but over *sub-ranges* of B rows in
+  // `range_n` governed by `n_multiple` and `inner_tasks`.
   template <class Func>
   void ForN(ThreadingContext& ctx, const IndexRange& range_n, size_t n_multiple,
-            size_t inner_tasks, HWY_MAYBE_UNUSED size_t caller_cluster_idx,
+            size_t inner_tasks, size_t caller_cluster_idx,
             const Func& func) const {
-    HWY_DASSERT(1 <= inner_tasks && inner_tasks <= 4);
     HWY_DASSERT(caller_cluster_idx == 0);
+    (void)caller_cluster_idx;
     const hwy::pool::Caller caller = ctx.pool_callers.Get(Callers::kMMHierForN);
-    // Single cluster: parallel-for over static partition of `range_n`.
-    hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
-    const size_t num_clusters = all_clusters.NumWorkers();
-    if (num_clusters == 1) {
-      const size_t cluster_idx = 0;
-      hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-      const IndexRangePartition ranges_n = StaticPartition(
-          range_n, cluster.NumWorkers() * inner_tasks, n_multiple);
-      return ParallelizeOneRange(
-          ranges_n, cluster, caller,
-          [&](const IndexRange& worker_range, size_t worker) {
-            func(worker_range, worker);
-          });
-    }
-    // Assign each cluster a sub-range of `range_n` (typically hundreds).
-    const IndexRangePartition ranges_n =
-        StaticPartition(range_n, num_clusters, n_multiple);
-    ParallelizeOneRange(
-        ranges_n, all_clusters, caller,
-        [&](const IndexRange& n_range, const size_t cluster_idx) {
-          hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-          const size_t cluster_base = ctx.Worker(cluster_idx);
-          // Parallel-for over sub-ranges of `cluster_range` within the cluster.
-          const IndexRangePartition worker_ranges = StaticPartition(
-              n_range, cluster.NumWorkers() * inner_tasks, n_multiple);
-          ParallelizeOneRange(
-              worker_ranges, cluster, caller,
-              [&](const IndexRange& worker_range, size_t worker) {
-                func(worker_range, cluster_base + worker);
-              });
-        });
+    // Assign clusters (if any) a sub-range of `range_n` (typically hundreds).
+    ParallelPartitionAcrossClusters(
+        range_n, n_multiple, /*inner_tasks=*/1, ctx, caller,
+        [&](const IndexRange& cluster_range, size_t cluster_idx) {
+          ParallelPartitionWithinCluster(
+              cluster_range, n_multiple, inner_tasks, ctx, cluster_idx, caller,
+              [&](const IndexRange& worker_range, size_t worker) {
+                func(worker_range, worker);
+              });
+        });
   }
@@ -205,57 +178,44 @@ struct MMParallelHierarchical {
   void ForRangesMC_NC(ThreadingContext& ctx,
                       const IndexRangePartition& ranges_mc,
                       const IndexRangePartition& ranges_nc,
-                      HWY_MAYBE_UNUSED size_t caller_cluster_idx,
-                      const Func& func) const {
+                      size_t caller_cluster_idx, const Func& func) const {
     HWY_DASSERT(caller_cluster_idx == 0);
+    (void)caller_cluster_idx;
     const hwy::pool::Caller caller =
         ctx.pool_callers.Get(Callers::kMMHierForMCNC);
-    hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
-    // `all_clusters` is a pool with one worker per cluster in a package.
-    const size_t num_clusters = all_clusters.NumWorkers();
-    // Single (big) cluster: collapse two range indices into one parallel-for
-    // to reduce the number of fork-joins.
-    if (num_clusters == 1) {
-      const size_t cluster_idx = 0;
-      hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-      // Low-batch: avoid Divide/Remainder.
-      if (HWY_UNLIKELY(ranges_mc.NumTasks() == 1)) {
-        return ParallelizeOneRange(
-            ranges_nc, cluster, caller,
-            [&](const IndexRange& range_nc, size_t worker) {
-              func(ranges_mc.Range(0), range_nc, worker);
-            });
-      } else {
-        return ParallelizeTwoRanges(
-            ranges_mc, ranges_nc, cluster, caller,
-            [&](const IndexRange& range_mc, const IndexRange& range_nc,
-                size_t worker) { func(range_mc, range_nc, worker); });
-      }
-    }
-    // Multiple clusters: N across clusters (both are usually the larger), and
-    // M within each cluster. We assume auto-tuning finds small MC/NC tasks.
-    ParallelizeOneRange(
-        ranges_nc, all_clusters, caller,
-        [&](const IndexRange range_nc, size_t cluster_idx) {
-          const size_t cluster_base = ctx.Worker(cluster_idx);
-          hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
-          ParallelizeOneRange(ranges_mc, cluster, caller,
-                              [&](const IndexRange& range_mc, size_t worker) {
-                                func(range_mc, range_nc, cluster_base + worker);
-                              });
-        });
+    // Collapse two range indices into a 1D range for better load-balancing,
+    // because `ranges_mc` may just have one task.
+    const hwy::Divisor div_m(static_cast<uint32_t>(ranges_mc.NumTasks()));
+    const auto get_mc = [&](uint64_t task) {
+      return ranges_mc.Range(div_m.Remainder(static_cast<uint32_t>(task)));
+    };
+    const auto get_nc = [&](uint64_t task) {
+      return ranges_nc.Range(div_m.Divide(static_cast<uint32_t>(task)));
+    };
+    const IndexRange all_range(0, ranges_mc.NumTasks() * ranges_nc.NumTasks());
+    ParallelPartitionAcrossClusters(
+        all_range, /*task_multiple=*/1, /*inner_tasks=*/1, ctx, caller,
+        [&](const IndexRange& cluster_range, size_t cluster_idx) {
+          ParallelForWithinCluster(cluster_range.Num(), ctx, cluster_idx,
                                    caller, [&](uint64_t i, size_t worker) {
+                                     const size_t task =
+                                         cluster_range.begin() + i;
+                                     func(get_mc(task), get_nc(task), worker);
+                                   });
+        });
   }

-  // Calls `func(row_a, worker)` in parallel.
+  // No multiple/inner_tasks, so this is just HierarchicalParallelFor.
   template <class Func>
   void ForRangeMC(ThreadingContext& ctx, const IndexRange& range_mc,
                   size_t caller_cluster_idx, const Func& func) const {
-    HierarchicalParallelFor(range_mc.Num(), ctx, Callers::kMMHierForMC,
-                            [&](size_t task, size_t worker) {
-                              func(range_mc.begin() + task, worker);
-                            });
+    HWY_DASSERT(caller_cluster_idx == 0);
+    (void)caller_cluster_idx;
+    HierarchicalParallelFor(
+        range_mc.Num(), ctx, Callers::kMMHierForMC,
+        [&](size_t i, size_t worker) { func(range_mc.begin() + i, worker); });
   }
 };
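The ForRangesMC_NC hunks above collapse the MC x NC task grid into a single 1D index so one fork-join covers every (mc, nc) combination, which balances load even when `ranges_mc` has only one task. A standalone sketch of that index mapping (plain `/` and `%` stand in for the `hwy::Divisor` fast division used in the real code; the constants are illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t tasks_mc = 3;  // e.g. ranges_mc.NumTasks()
  const uint32_t tasks_nc = 4;  // e.g. ranges_nc.NumTasks()
  // One 1D task per (mc, nc) pair; mc varies fastest, matching
  // div_m.Remainder() / div_m.Divide() in the code above.
  for (uint32_t task = 0; task < tasks_mc * tasks_nc; ++task) {
    const uint32_t mc = task % tasks_mc;  // Remainder(): inner index
    const uint32_t nc = task / tasks_mc;  // Divide(): outer index
    printf("task %2u -> (mc=%u, nc=%u)\n", task, mc, nc);
  }
  return 0;
}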

@@ -195,9 +195,10 @@ HWY_INLINE void MatMulSlow(const MatPtrT<TA> A, const MatPtrT<TB> B,
   const size_t multiple = env.ctx.allocator.QuantumBytes() / sizeof(TB);
   const IndexRangePartition get_col_c =
       StaticPartition(all_cols_c, all_clusters.NumWorkers(), multiple);
-  ParallelizeOneRange(
-      get_col_c, all_clusters, env.ctx.pool_callers.Get(Callers::kTest),
-      [&](const IndexRange& cols_c, size_t cluster_idx) HWY_ATTR {
+  ParallelForAcrossClusters(
+      get_col_c.NumTasks(), env.ctx, env.ctx.pool_callers.Get(Callers::kTest),
+      [&](size_t range_idx, size_t cluster_idx) HWY_ATTR {
+        const IndexRange cols_c = get_col_c.Range(range_idx);
         for (size_t r : all_rows_c) {
           TC* HWY_RESTRICT C_row = C.Row(r);
           for (size_t c : cols_c) {

@@ -262,43 +262,6 @@ static inline IndexRangePartition StaticPartition(const IndexRange& range,
   return IndexRangePartition(range, size);
 }

-// Parallel-for over a single range. This takes care of translating the task
-// index to a range.
-template <class Func>
-void ParallelizeOneRange(const IndexRangePartition& get1, hwy::ThreadPool& pool,
-                         hwy::pool::Caller caller, const Func& func) {
-  const size_t num_tasks = get1.NumTasks();
-  pool.Run(0, num_tasks, caller, [&](uint64_t task, size_t thread) {
-    const IndexRange range1 = get1.Range(task);
-    func(range1, thread);
-  });
-}
-
-// Parallel-for over the Cartesian product of the two sets of ranges. This
-// combines their indices into a single 'task' so they can be executed by one
-// `pool.Run`, which increases the amount of work available to workers and
-// reduces fork-join overhead vs. nested parallel-for loops. Calls `func` with
-// the two ranges and the thread index within `pool`.
-template <class Func>
-void ParallelizeTwoRanges(const IndexRangePartition& get1,
-                          const IndexRangePartition& get2,
-                          hwy::ThreadPool& pool, hwy::pool::Caller caller,
-                          const Func& func) {
-  const hwy::Divisor div1(static_cast<uint32_t>(get1.NumTasks()));
-  const size_t num_tasks = get1.NumTasks() * get2.NumTasks();
-  pool.Run(0, num_tasks, caller, [&](uint64_t task, size_t thread) {
-    HWY_DASSERT(task < (uint64_t{1} << 32));
-    const size_t idx2 = div1.Divide(static_cast<uint32_t>(task));
-    const size_t idx1 = div1.Remainder(static_cast<uint32_t>(task));
-    HWY_DASSERT(idx1 < get1.NumTasks());
-    HWY_DASSERT(idx2 < get2.NumTasks());
-    const IndexRange range1 = get1.Range(idx1);
-    const IndexRange range2 = get2.Range(idx2);
-    func(range1, range2, thread);
-  });
-}
-
 }  // namespace gcpp

 #endif  // THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_H_

@@ -154,42 +154,96 @@ enum class ParallelismStrategy : uint8_t {
   kHierarchical,
 };

-// Calls `func(task, worker)` for each task in `[0, num_tasks)`. Parallelizes
-// over clusters of ONE package, then within each cluster.
+// Helper functions used to implement `ParallelFor`, also reused in multiple
+// places. User code should call `ParallelFor` instead, which accepts the more
+// convenient `Callers` enum.
+//
+// These call `func(task, worker)` for each task in `[0, num_tasks)`.
+// NOTE: the worker argument is actually the `cluster_idx`, so that `Func` can
+// pass that to `ParallelForWithinCluster`.
+template <class Func>
+void ParallelForAcrossClusters(size_t num_tasks, ThreadingContext& ctx,
+                               hwy::pool::Caller caller, const Func& func) {
+  ctx.pools.AllClusters().Run(
+      0, num_tasks, caller,
+      [&](uint64_t task, size_t cluster_idx) { func(task, cluster_idx); });
+}
+
+template <class Func>
+void ParallelForWithinCluster(size_t num_tasks, ThreadingContext& ctx,
+                              size_t cluster_idx, hwy::pool::Caller caller,
+                              const Func& func) {
+  const size_t cluster_base = ctx.Worker(cluster_idx);
+  ctx.pools.Cluster(cluster_idx)
+      .Run(0, num_tasks, caller, [&](uint64_t task, size_t worker) {
+        func(task, cluster_base + worker);
+      });
+}
+
+// Calls `func(range, cluster_idx)`, for passing to `*WithinCluster`.
+template <class Func>
+void ParallelPartitionAcrossClusters(const IndexRange range,
+                                     size_t task_multiple, size_t inner_tasks,
+                                     ThreadingContext& ctx,
+                                     hwy::pool::Caller caller,
+                                     const Func& func) {
+  HWY_DASSERT(1 <= inner_tasks && inner_tasks <= 4);
+  const IndexRangePartition ranges = StaticPartition(
+      range, ctx.pools.NumClusters() * inner_tasks, task_multiple);
+  ParallelForAcrossClusters(ranges.NumTasks(), ctx, caller,
+                            [&](uint64_t task, size_t cluster_idx) {
+                              func(ranges.Range(task), cluster_idx);
+                            });
+}
+
+// Calls `func(range, worker)`.
+template <class Func>
+void ParallelPartitionWithinCluster(const IndexRange range,
+                                    size_t task_multiple, size_t inner_tasks,
+                                    ThreadingContext& ctx, size_t cluster_idx,
+                                    hwy::pool::Caller caller,
+                                    const Func& func) {
+  HWY_DASSERT(1 <= inner_tasks && inner_tasks <= 4);
+  const size_t num_workers = ctx.pools.Cluster(cluster_idx).NumWorkers();
+  const IndexRangePartition ranges =
+      StaticPartition(range, num_workers * inner_tasks, task_multiple);
+  ParallelForWithinCluster(
+      ranges.NumTasks(), ctx, cluster_idx, caller,
+      [&](uint64_t task, size_t worker) { func(ranges.Range(task), worker); });
+}
+
+// Parallelizes across clusters, then within each cluster.
 template <class Func>
 void HierarchicalParallelFor(size_t num_tasks, ThreadingContext& ctx,
                              Callers callers, const Func& func) {
   const hwy::pool::Caller caller = ctx.pool_callers.Get(callers);
-  // If few tasks, run on a single cluster. Also avoids a bit of overhead if
-  // there is only one cluster.
-  hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
-  const size_t num_clusters = all_clusters.NumWorkers();
-  hwy::ThreadPool& cluster = ctx.pools.Cluster(0);
-  if (num_clusters == 1 || num_tasks <= cluster.NumWorkers()) {
-    return cluster.Run(0, num_tasks, caller, [&](uint64_t task, size_t thread) {
-      func(task, thread);
-    });
+  // If at most one task per cluster worker, run on a single cluster to avoid
+  // the expensive cross-cluster barrier.
+  {
+    const size_t cluster_idx = 0;
+    const size_t cluster_workers = ctx.pools.Cluster(cluster_idx).NumWorkers();
+    if (HWY_UNLIKELY(num_tasks <= cluster_workers)) {
+      return ParallelForWithinCluster(num_tasks, ctx, cluster_idx, caller,
+                                      func);
+    }
   }
-  // Assign each cluster a sub-range.
-  const IndexRangePartition ranges =
-      StaticPartition(IndexRange(0, num_tasks), num_clusters, 1);
-  ParallelizeOneRange(ranges, all_clusters, caller,
-                      [&](const IndexRange& range, const size_t cluster_idx) {
-                        hwy::ThreadPool& cluster =
-                            ctx.pools.Cluster(cluster_idx);
-                        const size_t cluster_base =
-                            cluster_idx * ctx.pools.MaxWorkersPerCluster();
-                        cluster.Run(range.begin(), range.end(), caller,
-                                    [&](uint64_t task, size_t thread) {
-                                      func(task, cluster_base + thread);
-                                    });
-                      });
+  ParallelPartitionAcrossClusters(
+      IndexRange(0, num_tasks), /*task_multiple=*/1, /*inner_tasks=*/1, ctx,
+      caller, [&](const IndexRange& cluster_range, size_t cluster_idx) {
+        ParallelForWithinCluster(cluster_range.Num(), ctx, cluster_idx, caller,
+                                 [&](uint64_t i, size_t worker) {
+                                   func(cluster_range.begin() + i, worker);
+                                 });
+      });
 }

 // Calls `func(task, worker)` for each `task` in `[0, num_tasks)`, with the
-// number/type of workers determined by `parallelism`. `cluster_idx` is for
-// `parallelism == kWithinCluster`, and should be 0 if unknown.
+// number/type of workers determined by `parallelism`. NOTE: worker is actually
+// `cluster_idx` for `kAcrossClusters`. The `cluster_idx` argument is for
+// `parallelism == {kWithinCluster, kNone}`, and should be 0 if unknown.
 template <class Func>
 void ParallelFor(ParallelismStrategy parallelism, size_t num_tasks,
                  ThreadingContext& ctx, size_t cluster_idx, Callers callers,
@@ -212,37 +266,25 @@ void ParallelFor(ParallelismStrategy parallelism, size_t num_tasks,
     }
     case ParallelismStrategy::kAcrossClusters:
-      return ctx.pools.AllClusters().Run(
-          0, num_tasks, caller,
+      return ParallelForAcrossClusters(
+          num_tasks, ctx, caller,
           [&](uint64_t task, size_t cluster_idx) { func(task, cluster_idx); });
-    case ParallelismStrategy::kWithinCluster: {
-      // Ensure the worker argument is unique across clusters, because it is
-      // used for TLS indexing for example in profiler.h.
-      const size_t base = ctx.Worker(cluster_idx);
-      return ctx.pools.Cluster(cluster_idx)
-          .Run(0, num_tasks, caller, [&](uint64_t task, size_t worker) {
-            func(task, base + worker);
-          });
-    }
+    case ParallelismStrategy::kWithinCluster:
+      return ParallelForWithinCluster(num_tasks, ctx, cluster_idx, caller,
+                                      func);
-    case ParallelismStrategy::kFlat: {
-      // Check for single cluster; if not, we must compute `cluster_base` for
-      // consistent and non-overlapping worker indices.
-      hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
-      const size_t num_clusters = all_clusters.NumWorkers();
-      if (num_clusters == 1) {
-        return ctx.pools.Cluster(cluster_idx)
-            .Run(0, num_tasks, caller,
-                 [&](uint64_t task, size_t worker) { func(task, worker); });
+    case ParallelismStrategy::kFlat:
+      // Choose a single pool: the only cluster, or across all clusters
+      // (slower synchronization, but more memory bandwidth)
+      if (HWY_UNLIKELY(ctx.pools.NumClusters() == 1)) {
+        return ParallelForWithinCluster(num_tasks, ctx, cluster_idx, caller,
+                                        func);
       }
-      return all_clusters.Run(0, num_tasks, caller,
-                              [&](uint64_t task, size_t cluster_idx) {
-                                const size_t worker = ctx.Worker(cluster_idx);
-                                func(task, worker);
-                              });
-    }
+      return ParallelForAcrossClusters(num_tasks, ctx, caller,
+                                       [&](uint64_t task, size_t cluster_idx) {
+                                         func(task, ctx.Worker(cluster_idx));
+                                       });
     case ParallelismStrategy::kHierarchical:
       return HierarchicalParallelFor(num_tasks, ctx, callers, func);
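A hedged usage sketch follows (it assumes the declarations in this diff are in scope, that `ctx` is an already-initialized ThreadingContext supplied by the caller, and reuses the `Callers::kTest` tag seen in the test above; `RunExample` is a hypothetical helper, not part of the commit). It illustrates the `cluster_idx` argument and the globally unique worker indices described in the comments:

// Hypothetical helper, not part of the commit. `func` must be callable as
// func(uint64_t task, size_t worker).
template <class Func>
void RunExample(gcpp::ThreadingContext& ctx, size_t num_tasks,
                const Func& func) {
  // Hierarchical: tasks are split across clusters, then across each cluster's
  // workers. The `worker` passed to `func` is offset by ctx.Worker(cluster_idx)
  // and therefore globally unique, e.g. for per-thread profiler slots.
  gcpp::ParallelFor(gcpp::ParallelismStrategy::kHierarchical, num_tasks, ctx,
                    /*cluster_idx=*/0, gcpp::Callers::kTest, func);

  // Within a single cluster: `cluster_idx` selects the cluster; pass 0 if the
  // caller is not already running inside an across-clusters task.
  gcpp::ParallelFor(gcpp::ParallelismStrategy::kWithinCluster, num_tasks, ctx,
                    /*cluster_idx=*/0, gcpp::Callers::kTest, func);
}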

@@ -202,57 +202,6 @@ TEST(ThreadingTest, TestStaticPartition) {
   }
 }

-TEST(ThreadingTest, TestParallelizeOneRange) {
-  const IndexRange range(0, 10);
-  const IndexRangePartition partition = StaticPartition(range, 2, 4);
-  hwy::ThreadPool null_pool(0);
-  size_t calls = 0;
-  ParallelizeOneRange(partition, null_pool, kCaller,
-                      [&](const IndexRange& range, size_t) {
-                        if (++calls == 1) {
-                          HWY_ASSERT(range.begin() == 0 && range.end() == 8);
-                        } else {
-                          HWY_ASSERT(range.begin() == 8 && range.end() == 10);
-                        }
-                      });
-  HWY_ASSERT(calls == 2);
-}
-
-TEST(ThreadingTest, TestParallelizeTwoRanges) {
-  const IndexRangePartition partition1 =
-      StaticPartition(IndexRange(0, 10), 2, 4);
-  const IndexRangePartition partition2 =
-      MaxSizePartition(IndexRange(128, 256), 32, 32);
-  HWY_ASSERT(partition2.NumTasks() == 4);
-  hwy::ThreadPool null_pool(0);
-  {
-    size_t calls = 0;
-    ParallelizeTwoRanges(
-        partition1, partition2, null_pool, kCaller,
-        [&](const IndexRange& range1, const IndexRange& range2, size_t) {
-          ++calls;
-          HWY_ASSERT(range1.begin() == 0 || range1.begin() == 8);
-          HWY_ASSERT(range2.begin() % 32 == 0);
-          HWY_ASSERT(range2.Num() % 32 == 0);
-        });
-    HWY_ASSERT(calls == 2 * 4);
-  }
-  // Also swap order to test Remainder() logic.
-  {
-    size_t calls = 0;
-    ParallelizeTwoRanges(
-        partition2, partition1, null_pool, kCaller,
-        [&](const IndexRange& range2, const IndexRange& range1, size_t) {
-          ++calls;
-          HWY_ASSERT(range1.begin() == 0 || range1.begin() == 8);
-          HWY_ASSERT(range2.begin() % 32 == 0);
-          HWY_ASSERT(range2.Num() % 32 == 0);
-        });
-    HWY_ASSERT(calls == 2 * 4);
-  }
-}
-
 static constexpr size_t kU64PerThread = HWY_ALIGNMENT / sizeof(size_t);
 static uint64_t outputs[hwy::kMaxLogicalProcessors * kU64PerThread];