mirror of https://github.com/google/gemma.cpp.git
Remove multi-package support from topology
Also no longer assume equal-sized clusters.
PiperOrigin-RevId: 820164125
This commit is contained in: parent 9b6ed1a58f, commit f59eb2ed72
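For orientation, the hunks below replace the per-package accessors (`AllPackages()`, `Cluster(pkg_idx, cluster_idx)`) with cluster-only equivalents and route most call sites through `ParallelFor`. A minimal sketch of the post-change call shape, assuming the signatures shown in this diff (the function name `DoWork` is illustrative only):

#include "util/threading_context.h"  // ThreadingArgs, ThreadingContext, ParallelFor

void DoWork(gcpp::ThreadingContext& ctx, size_t num_tasks) {
  // Before: ctx.pools.Cluster(/*pkg_idx=*/0, cluster_idx).Run(0, num_tasks, ...);
  // After: no package index; pass the cluster index to ParallelFor instead.
  gcpp::ParallelFor(gcpp::ParallelismStrategy::kWithinCluster, num_tasks, ctx,
                    /*cluster_idx=*/0, [&](size_t task, size_t worker) {
                      (void)task;
                      (void)worker;  // already offset by ctx.Worker(cluster_idx)
                    });
}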
@@ -28,7 +28,6 @@
#include "util/threading_context.h"
#include "hwy/aligned_allocator.h"  // Span
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/timer.h"

namespace gcpp {

@@ -104,12 +103,14 @@ BlobVec ReserveMemory(const RangeVec& ranges, BytePtr& all_blobs, size_t& pos) {
// Reads one set of blobs in parallel (helpful if in disk cache).
// Aborts on error.
void ReadBlobs(BlobReader& reader, const RangeVec& ranges, BlobVec& blobs,
hwy::ThreadPool& pool) {
ThreadingContext& ctx, size_t cluster_idx) {
HWY_ASSERT(reader.Keys().size() == blobs.size());
HWY_ASSERT(ranges.size() == blobs.size());
pool.Run(0, blobs.size(), [&](size_t i, size_t /*thread*/) {
ParallelFor(ParallelismStrategy::kWithinCluster, blobs.size(), ctx,
cluster_idx, [&](size_t i, size_t /*thread*/) {
HWY_ASSERT(ranges[i].bytes == blobs[i].size());
reader.file().Read(ranges[i].offset, ranges[i].bytes, blobs[i].data());
reader.file().Read(ranges[i].offset, ranges[i].bytes,
blobs[i].data());
});
}

@@ -117,14 +118,16 @@ void ReadBlobs(BlobReader& reader, const RangeVec& ranges, BlobVec& blobs,
void ReadBothBlobs(BlobReader& reader1, BlobReader& reader2,
const RangeVec& ranges1, const RangeVec& ranges2,
size_t total_bytes, BlobVec& blobs1, BlobVec& blobs2,
NestedPools& pools) {
ThreadingContext& ctx) {
const double t0 = hwy::platform::Now();
HWY_WARN("Reading %zu GiB, %zux%zu cores: ", total_bytes >> 30,
pools.AllPackages().NumWorkers(), pools.Pool().NumWorkers());
pools.AllPackages().Run(0, 2, [&](size_t task, size_t pkg_idx) {
ReadBlobs(task ? reader2 : reader1, task ? ranges2 : ranges1,
task ? blobs2 : blobs1, pools.Pool(pkg_idx));
HWY_WARN("Reading %zu GiB, %zu clusters: ", total_bytes >> 30,
ctx.pools.NumClusters());
ParallelFor(ParallelismStrategy::kAcrossClusters, 2, ctx, 0,
[&](const size_t task, size_t cluster_idx) {
ReadBlobs(task ? reader1 : reader2, task ? ranges1 : ranges2,
task ? blobs1 : blobs2, ctx, cluster_idx);
});

const double t1 = hwy::platform::Now();
HWY_WARN("%.1f GB/s\n", total_bytes / (t1 - t0) * 1E-9);
}

@@ -181,18 +184,13 @@ size_t BlobDifferences(const ByteSpan data1, const ByteSpan data2,
}

void CompareBlobs(const KeyVec& keys, BlobVec& blobs1, BlobVec& blobs2,
size_t total_bytes, NestedPools& pools) {
size_t total_bytes, ThreadingContext& ctx) {
HWY_WARN("Comparing %zu blobs in parallel: ", keys.size());
const double t0 = hwy::platform::Now();
std::atomic<size_t> blobs_equal{};
std::atomic<size_t> blobs_diff{};
const IndexRangePartition ranges = StaticPartition(
IndexRange(0, keys.size()), pools.AllPackages().NumWorkers(), 1);
ParallelizeOneRange(
ranges, pools.AllPackages(),
[&](const IndexRange& range, size_t pkg_idx) {
pools.Pool(pkg_idx).Run(
range.begin(), range.end(), [&](size_t i, size_t /*thread*/) {
ParallelFor(ParallelismStrategy::kHierarchical, keys.size(), ctx, 0,
[&](size_t i, size_t /*thread*/) {
const size_t mismatches =
BlobDifferences(blobs1[i], blobs2[i], keys[i]);
if (mismatches != 0) {

@@ -203,7 +201,6 @@ void CompareBlobs(const KeyVec& keys, BlobVec& blobs1, BlobVec& blobs2,
blobs_equal.fetch_add(1);
}
});
});
const double t1 = hwy::platform::Now();
HWY_WARN("%.1f GB/s; total blob matches=%zu, mismatches=%zu\n",
total_bytes / (t1 - t0) * 1E-9, blobs_equal.load(),

@@ -230,9 +227,9 @@ void ReadAndCompareBlobs(const Path& path1, const Path& path2) {
ThreadingArgs args;
ThreadingContext ctx(args);
ReadBothBlobs(reader1, reader2, ranges1, ranges2, total_bytes, blobs1, blobs2,
ctx.pools);
ctx);

CompareBlobs(reader1.Keys(), blobs1, blobs2, total_bytes, ctx.pools);
CompareBlobs(reader1.Keys(), blobs1, blobs2, total_bytes, ctx);
}

} // namespace gcpp
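The reading path above nests two strategies: `kAcrossClusters` splits the two readers across clusters, and `kWithinCluster` inside `ReadBlobs` parallelizes the per-blob loop on the chosen cluster. A hedged sketch of that nesting in isolation (function and variable names are illustrative, not from the repo):

void NestedReadSketch(gcpp::ThreadingContext& ctx, size_t num_sets,
                      size_t blobs_per_set) {
  gcpp::ParallelFor(gcpp::ParallelismStrategy::kAcrossClusters, num_sets, ctx,
                    /*cluster_idx=*/0, [&](size_t set, size_t cluster_idx) {
    // The worker argument of kAcrossClusters is the cluster index; forward it
    // so the inner loop runs on that cluster's pool.
    gcpp::ParallelFor(gcpp::ParallelismStrategy::kWithinCluster, blobs_per_set,
                      ctx, cluster_idx, [&](size_t i, size_t /*worker*/) {
      (void)set;
      (void)i;  // per-blob work goes here
    });
  });
}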
@@ -1124,8 +1124,9 @@ void TestAllDot() {
MatPadding::kOdd);
std::array<DotStats, kMaxWorkers> all_stats;

ctx.pools.Cluster(0, 0).Run(
0, kReps, [&](const uint32_t rep, size_t thread) {
ParallelFor(
ParallelismStrategy::kWithinCluster, kReps, ctx, 0,
[&](size_t rep, size_t thread) {
float* HWY_RESTRICT pa = a.Row(thread);
float* HWY_RESTRICT pb = b.Row(thread);
double* HWY_RESTRICT buf = bufs.Row(thread);
@@ -351,7 +351,7 @@ std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t M, size_t K,

MatMulEnv::MatMulEnv(ThreadingContext& ctx)
: ctx(ctx), A_BF(ctx.allocator), C_tiles(ctx) {
const size_t num_clusters = ctx.pools.AllClusters(/*pkg_idx=*/0).NumWorkers();
const size_t num_clusters = ctx.pools.NumClusters();
per_cluster.resize(num_clusters);
for (size_t cluster_idx = 0; cluster_idx < num_clusters; ++cluster_idx) {
row_ptrs.push_back(hwy::AllocateAligned<uint8_t*>(kMaxBatchSize));  // C

@@ -368,7 +368,7 @@ void BindB(ThreadingContext& ctx, MatPtr& B, size_t sizeof_TC) {

PROFILER_ZONE("Startup.BindB");

const size_t node = ctx.topology.GetCluster(/*pkg_idx=*/0, 0).Node();
const size_t node = ctx.topology.GetCluster(0).Node();
uintptr_t begin = reinterpret_cast<uintptr_t>(B.RowBytes(0));
uintptr_t end = begin + B.Rows() * B.Stride() * B.ElementBytes();
// B row padding is less than the page size, so only bind the subset that

@@ -394,7 +394,7 @@ void BindC(ThreadingContext& ctx, MatPtr& C) {
const size_t end = hwy::RoundDownTo(cols_c.end() * C.ElementBytes(),
allocator.BasePageBytes());

const size_t node = ctx.topology.GetCluster(/*pkg_idx=*/0, 0).Node();
const size_t node = ctx.topology.GetCluster(0).Node();
bool ok = true;
for (size_t im = 0; im < C.Rows(); ++im) {
ok &= allocator.BindMemory(C.RowBytes(im) + begin, end - begin, node);
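`BindB`/`BindC` above now take the NUMA node from `GetCluster(0)` of the single remaining package. A rough sketch of that binding pattern, assuming the `Allocator` interface shown later in this diff (`ShouldBind`, `QuantumBytes`, `BindMemory`); the helper name and the rounding of `bytes` are illustrative, and `p` is assumed to already be suitably aligned:

void BindToFirstClusterNode(gcpp::ThreadingContext& ctx, void* p, size_t bytes) {
  if (!ctx.allocator.ShouldBind()) return;
  const size_t node = ctx.topology.GetCluster(0).Node();
  // BindMemory requires `p` and `bytes` to be multiples of QuantumBytes().
  const size_t quantum = ctx.allocator.QuantumBytes();
  bytes -= bytes % quantum;
  if (bytes != 0) ctx.allocator.BindMemory(p, bytes, node);
}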
ops/matmul.h
@@ -105,8 +105,7 @@ struct MMParallelWithinCluster {
size_t inner_tasks, size_t cluster_idx, const Func& func) const {
HWY_DASSERT(1 <= inner_tasks && inner_tasks <= 4);

const size_t pkg_idx = 0;
hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
const size_t base = ctx.Worker(cluster_idx);

const IndexRangePartition ranges_n = StaticPartition(

@@ -122,8 +121,7 @@ struct MMParallelWithinCluster {
const IndexRangePartition& ranges_mc,
const IndexRangePartition& ranges_nc, size_t cluster_idx,
const Func& func) const {
const size_t pkg_idx = 0;
hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
const size_t base = ctx.Worker(cluster_idx);

// Low-batch: avoid Divide/Remainder.

@@ -143,8 +141,7 @@ struct MMParallelWithinCluster {
template <class Func>
void ForRangeMC(ThreadingContext& ctx, const IndexRange& range_mc,
size_t cluster_idx, const Func& func) const {
const size_t pkg_idx = 0;
hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
const size_t base = ctx.Worker(cluster_idx);

cluster.Run(

@@ -164,12 +161,11 @@ struct MMParallelHierarchical {
HWY_DASSERT(caller_cluster_idx == 0);

// Single cluster: parallel-for over static partition of `range_n`.
const size_t pkg_idx = 0;
hwy::ThreadPool& all_clusters = ctx.pools.AllClusters(pkg_idx);
hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
const size_t num_clusters = all_clusters.NumWorkers();
if (num_clusters == 1) {
const size_t cluster_idx = 0;
hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
const IndexRangePartition ranges_n = StaticPartition(
range_n, cluster.NumWorkers() * inner_tasks, n_multiple);
return ParallelizeOneRange(

@@ -185,7 +181,7 @@ struct MMParallelHierarchical {
ParallelizeOneRange(
ranges_n, all_clusters,
[&](const IndexRange& n_range, const size_t cluster_idx) {
hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
const size_t cluster_base = ctx.Worker(cluster_idx);
// Parallel-for over sub-ranges of `cluster_range` within the cluster.
const IndexRangePartition worker_ranges = StaticPartition(

@@ -206,17 +202,16 @@ struct MMParallelHierarchical {
const IndexRangePartition& ranges_nc,
HWY_MAYBE_UNUSED size_t caller_cluster_idx,
const Func& func) const {
const size_t pkg_idx = 0;
HWY_DASSERT(caller_cluster_idx == 0);

hwy::ThreadPool& all_clusters = ctx.pools.AllClusters(pkg_idx);
hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
// `all_clusters` is a pool with one worker per cluster in a package.
const size_t num_clusters = all_clusters.NumWorkers();
// Single (big) cluster: collapse two range indices into one parallel-for
// to reduce the number of fork-joins.
if (num_clusters == 1) {
const size_t cluster_idx = 0;
hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
// Low-batch: avoid Divide/Remainder.
if (HWY_UNLIKELY(ranges_mc.NumTasks() == 1)) {
return ParallelizeOneRange(

@@ -237,7 +232,7 @@ struct MMParallelHierarchical {
ranges_nc, all_clusters,
[&](const IndexRange range_nc, size_t cluster_idx) {
const size_t cluster_base = ctx.Worker(cluster_idx);
hwy::ThreadPool& cluster = ctx.pools.Cluster(pkg_idx, cluster_idx);
hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);
ParallelizeOneRange(ranges_mc, cluster,
[&](const IndexRange& range_mc, size_t worker) {
func(range_mc, range_nc, cluster_base + worker);
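The recurring edit in ops/matmul.h is the same two lines: fetch the cluster pool without a package index and offset worker IDs by that cluster's base, so indices stay unique across clusters. A condensed sketch of the pattern (a stand-alone helper, not one of the `MMParallel*` policies):

template <class Func>
void RunOnCluster(gcpp::ThreadingContext& ctx, size_t cluster_idx,
                  size_t num_tasks, const Func& func) {
  hwy::ThreadPool& cluster = ctx.pools.Cluster(cluster_idx);  // was (pkg_idx, cluster_idx)
  const size_t base = ctx.Worker(cluster_idx);  // per-cluster worker offset
  cluster.Run(0, num_tasks, [&](uint64_t task, size_t worker) {
    func(task, base + worker);
  });
}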
@@ -191,20 +191,14 @@ HWY_INLINE void MatMulSlow(const MatPtrT<TA> A, const MatPtrT<TB> B,
const IndexRange all_cols_c(0, C.Cols());

NestedPools& pools = env.ctx.pools;
hwy::ThreadPool& all_packages = pools.AllPackages();
const IndexRangePartition get_row_c =
StaticPartition(all_rows_c, all_packages.NumWorkers(), 1);
ParallelizeOneRange(
get_row_c, all_packages,
[&](const IndexRange& rows_c, size_t package_idx) HWY_ATTR {
hwy::ThreadPool& all_clusters = pools.AllClusters(package_idx);
hwy::ThreadPool& all_clusters = pools.AllClusters();
const size_t multiple = env.ctx.allocator.QuantumBytes() / sizeof(TB);
const IndexRangePartition get_col_c =
StaticPartition(all_cols_c, all_clusters.NumWorkers(), multiple);
ParallelizeOneRange(
get_col_c, all_clusters,
[&](const IndexRange& cols_c, size_t cluster_idx) HWY_ATTR {
for (size_t r : rows_c) {
for (size_t r : all_rows_c) {
TC* HWY_RESTRICT C_row = C.Row(r);
for (size_t c : cols_c) {
const float add = add_row ? add_row[c] : 0.0f;

@@ -214,7 +208,6 @@ HWY_INLINE void MatMulSlow(const MatPtrT<TA> A, const MatPtrT<TB> B,
}
}
});
});
}

void PrintSpeed(const char* algo, const Extents2D& A_extents,
@@ -139,7 +139,7 @@ CacheInfo::CacheInfo(const BoundedTopology& topology) {

step_bytes_ = HWY_MAX(line_bytes_, vector_bytes_);

const BoundedTopology::Cluster& cluster = topology.GetCluster(0, 0);
const BoundedTopology::Cluster& cluster = topology.GetCluster(0);
if (const hwy::Cache* caches = hwy::DataCaches()) {
l1_bytes_ = caches[1].size_kib << 10;
l2_bytes_ = caches[2].size_kib << 10;
@@ -169,7 +169,7 @@ class Allocator {
bool ShouldBind() const { return should_bind_; }

// Attempts to move(!) `[p, p + bytes)` to the given NUMA node, which is
// typically `BoundedTopology::GetCluster(package_idx, cluster_idx).node`.
// typically `BoundedTopology::GetCluster(cluster_idx).node`.
// Writes zeros to SOME of the memory. Only call if `ShouldBind()`.
// `p` and `bytes` must be multiples of `QuantumBytes()`.
bool BindMemory(void* p, size_t bytes, size_t node) const;
@@ -18,7 +18,6 @@

#include <stdio.h>

#include <algorithm>  // std::sort
#include <memory>
#include <optional>
#include <vector>

@@ -31,14 +30,6 @@

namespace gcpp {

// Sort T := packages/clusters by descending 'size' so that users who only use
// one Group get the largest.
template <class T>
static void SortByDescendingSize(std::vector<T>& groups) {
std::sort(groups.begin(), groups.end(),
[](const T& a, const T& b) { return a.Size() > b.Size(); });
}

static bool InContainer() {
return false;  // placeholder for container detection, do not remove
}

@@ -55,8 +46,8 @@ PinningPolicy::PinningPolicy(Tristate pin) {

// If `pinning.Want()`, tries to pin each worker in `pool` to an LP in
// `cluster`, and calls `pinning.NotifyFailed()` if any fails.
void MaybePin(const BoundedTopology& topology, size_t pkg_idx,
size_t cluster_idx, const BoundedTopology::Cluster& cluster,
static void MaybePin(const BoundedTopology& topology, size_t cluster_idx,
const BoundedTopology::Cluster& cluster,
PinningPolicy& pinning, hwy::ThreadPool& pool) {
const std::vector<size_t> lps = cluster.LPVector();
HWY_ASSERT(pool.NumWorkers() <= lps.size());
@@ -64,10 +55,9 @@ void MaybePin(const BoundedTopology& topology, size_t pkg_idx,
HWY_ASSERT(task == thread);  // each worker has one task

char buf[16];  // Linux limitation
const int bytes_written = snprintf(buf, sizeof(buf), "P%zu X%02zu C%03d",
topology.SkippedPackages() + pkg_idx,
topology.SkippedClusters() + cluster_idx,
static_cast<int>(task));
const int bytes_written = snprintf(
buf, sizeof(buf), "P%zu X%02zu C%03d", topology.SkippedPackages(),
topology.SkippedClusters() + cluster_idx, static_cast<int>(task));
HWY_ASSERT(bytes_written < static_cast<int>(sizeof(buf)));
hwy::SetThreadName(buf, 0);  // does not support varargs
@@ -113,79 +103,56 @@ static size_t DivideMaxAcross(const size_t max, const size_t instances) {
return max;
}

NestedPools::NestedPools(const BoundedTopology& topology,
const Allocator& allocator, size_t max_threads,
Tristate pin)
: pinning_(pin) {
packages_.resize(topology.NumPackages());
all_packages_ =
MakePool(allocator, packages_.size(), hwy::PoolWorkerMapping());
const size_t max_workers_per_package =
DivideMaxAcross(max_threads, packages_.size());
// Each worker in all_packages_, including the main thread, will be the
// calling thread of an all_clusters->Run, and hence pinned to one of the
// `cluster.lps` if `pin`.
all_packages_->Run(0, packages_.size(), [&](uint64_t pkg_idx, size_t thread) {
HWY_ASSERT(pkg_idx == thread);  // each thread has one task
packages_[pkg_idx] = Package(topology, allocator, pinning_, pkg_idx,
max_workers_per_package);
});

all_pinned_ = pinning_.AllPinned(&pin_string_);

// For mapping package/cluster/thread to noncontiguous TLS indices, in case
// cluster/thread counts differ.
HWY_ASSERT(!packages_.empty() && packages_.size() <= 16);
for (const Package& p : packages_) {
max_clusters_per_package_ =
HWY_MAX(max_clusters_per_package_, p.NumClusters());
max_workers_per_cluster_ =
HWY_MAX(max_workers_per_cluster_, p.MaxWorkersPerCluster());
}
HWY_ASSERT(max_clusters_per_package_ >= 1);
HWY_ASSERT(max_clusters_per_package_ <= 64);
HWY_ASSERT(max_workers_per_cluster_ >= 1);
HWY_ASSERT(max_workers_per_cluster_ <= 256);
}

// `max_or_zero` == 0 means no limit.
static inline size_t CapIfNonZero(size_t num, size_t max_or_zero) {
return (max_or_zero == 0) ? num : HWY_MIN(num, max_or_zero);
}

NestedPools::Package::Package(const BoundedTopology& topology,
const Allocator& allocator,
PinningPolicy& pinning, size_t pkg_idx,
size_t max_workers_per_package) {
// Pre-allocate because elements are set concurrently.
clusters_.resize(topology.NumClusters(pkg_idx));
const size_t max_workers_per_cluster =
DivideMaxAcross(max_workers_per_package, clusters_.size());
NestedPools::NestedPools(const BoundedTopology& topology,
const Allocator& allocator, size_t max_threads,
Tristate pin)
: pinning_(pin) {
const size_t num_clusters = topology.NumClusters();
const size_t cluster_workers_cap = DivideMaxAcross(max_threads, num_clusters);

// Precompute cluster sizes to ensure we pass the same values to `MakePool`.
// The max is also used for `all_clusters_mapping`, see below.
size_t workers_per_cluster[hwy::kMaxClusters] = {};
size_t all_clusters_node = 0;
for (size_t cluster_idx = 0; cluster_idx < num_clusters; ++cluster_idx) {
const BoundedTopology::Cluster& tcluster = topology.GetCluster(cluster_idx);
workers_per_cluster[cluster_idx] =
CapIfNonZero(tcluster.NumWorkers(), cluster_workers_cap);
// Cluster sizes can vary because individual LPs may be disabled. Use the
// max so that `GlobalIdx` is consistent within and across clusters. It is
// OK to have holes or gaps in the worker index space.
max_workers_per_cluster_ =
HWY_MAX(max_workers_per_cluster_, workers_per_cluster[cluster_idx]);
all_clusters_node = tcluster.Node();  // arbitrarily use the last node seen
}

const BoundedTopology::Cluster& cluster0 = topology.GetCluster(pkg_idx, 0);
// Core 0 of each cluster. The second argument is the cluster size, not
// number of clusters. We ensure that it is the same for all clusters so that
// the `GlobalIdx` computation is consistent within and across clusters.
const hwy::PoolWorkerMapping all_clusters_mapping(hwy::kAllClusters,
cluster0.Size());
all_clusters_ = MakePool(allocator, clusters_.size(), all_clusters_mapping,
cluster0.Node());
max_workers_per_cluster_);
all_clusters_ = MakePool(allocator, num_clusters, all_clusters_mapping,
all_clusters_node);

// Pre-allocate because elements are set concurrently.
clusters_.resize(num_clusters);

// Parallel so we also pin the calling worker in `all_clusters` to
// `cluster.lps`.
all_clusters_->Run(
0, all_clusters_->NumWorkers(), [&](size_t cluster_idx, size_t thread) {
all_clusters_->Run(0, num_clusters, [&](size_t cluster_idx, size_t thread) {
HWY_ASSERT(cluster_idx == thread);  // each thread has one task
const BoundedTopology::Cluster& cluster =
topology.GetCluster(pkg_idx, cluster_idx);
HWY_ASSERT(cluster.Size() == cluster0.Size());
clusters_[cluster_idx] = MakePool(
allocator, CapIfNonZero(cluster.Size(), max_workers_per_cluster),
hwy::PoolWorkerMapping(cluster_idx, cluster.Size()),
cluster.Node());
// Pin workers AND the calling thread from `all_clusters`.
MaybePin(topology, pkg_idx, cluster_idx, cluster, pinning,
const BoundedTopology::Cluster& tcluster = topology.GetCluster(cluster_idx);
clusters_[cluster_idx] =
MakePool(allocator, workers_per_cluster[cluster_idx],
hwy::PoolWorkerMapping(cluster_idx, max_workers_per_cluster_),
tcluster.Node());
// Pin workers AND the calling thread from `all_clusters_`.
MaybePin(topology, cluster_idx, tcluster, pinning_,
*clusters_[cluster_idx]);
});
all_pinned_ = pinning_.AllPinned(&pin_string_);
}

} // namespace gcpp
util/threading.h
@@ -66,17 +66,14 @@ class PinningPolicy {
};  // PinningPolicy

// Creates a hierarchy of thread pools according to `BoundedTopology`: one with
// a thread per enabled package; for each of those, one with a thread per
// enabled cluster (CCX/shared L3), and for each of those, the remaining
// enabled cores in that cluster.
// a thread per enabled cluster (CCX/shared L3), and for each of those, the
// remaining enabled cores in that cluster.
//
// Note that we support spin waits, thus it is important for each thread to be
// responsive, hence we do not create more than one thread per enabled core.
// For example, when there are two packages with four clusters of 8 cores,
// `AllPackages` has the main thread plus one extra thread, each `AllClusters`
// has one of the `AllPackages` threads plus three extras, each `Cluster` runs
// on one `AllClusters` thread plus seven extra workers, for a total of
// 1 + 2*3 + 2*(4*7) = 63 extras plus the main thread.
// For example, when there are four clusters of 8 cores, `AllClusters` has the
// main thread plus three extras, each `Cluster` runs on one of `AllClusters`
// plus seven extras, for a total of 3 + (4*7) = 31 extras plus the main thread.
//
// Useful when there are tasks which should be parallelized by workers sharing a
// cache, or on the same NUMA node. In both cases, individual pools have lower

@@ -96,6 +93,10 @@ class NestedPools {
NestedPools(NestedPools&&) = delete;
NestedPools& operator=(NestedPools&&) = delete;

// Because cross-package latency is high, this interface assumes only one
// package is used. The `skip_packages` argument to `BoundedTopology` selects
// which package that is for this `NestedPools` instance.
//
// `max_threads` is the maximum number of threads to divide among all
// clusters. This is more intuitive than a per-cluster limit for users who
// may not be aware of the CPU topology. This should be zero (meaning no

@@ -104,8 +105,8 @@ class NestedPools {
//
// To ensure we do not create more threads than there are HW cores, which
// would cause huge slowdowns when spinning, the `BoundedSlice` arguments
// only impose upper bounds on the number of detected packages and clusters
// rather than defining the actual number of threads.
// only impose upper bounds on the number of detected clusters rather than
// defining the actual number of threads.
NestedPools(const BoundedTopology& topology, const Allocator& allocator,
size_t max_threads = 0, Tristate pin = Tristate::kDefault);

@@ -133,80 +134,33 @@ class NestedPools {
}
}

size_t NumPackages() const { return packages_.size(); }
hwy::ThreadPool& AllPackages() { return *all_packages_; }
hwy::ThreadPool& AllClusters(size_t pkg_idx) {
HWY_DASSERT(pkg_idx < NumPackages());
return packages_[pkg_idx].AllClusters();
}
hwy::ThreadPool& Cluster(size_t pkg_idx, size_t cluster_idx) {
HWY_DASSERT(pkg_idx < NumPackages());
return packages_[pkg_idx].Cluster(cluster_idx);
}

// Reasonably tight upper bounds for allocating thread-local storage (TLS).
size_t MaxWorkersPerCluster() const { return max_workers_per_cluster_; }
size_t MaxWorkersPerPackage() const {
return max_clusters_per_package_ * MaxWorkersPerCluster();
}
size_t MaxWorkers() const { return NumPackages() * MaxWorkersPerPackage(); }

// Actual number of workers.
size_t TotalWorkers() const {
size_t total_workers = 0;
for (size_t pkg_idx = 0; pkg_idx < NumPackages(); ++pkg_idx) {
total_workers += packages_[pkg_idx].TotalWorkers();
}
return total_workers;
}

// For ShowConfig
const char* PinString() const { return pin_string_; }

// Returns a single pool on the given package: either one thread per cluster
// if there is more than one, which maximizes available memory bandwidth, or
// the first cluster, which is typically the whole package. For use by callers
// that only have a single parallel-for.
hwy::ThreadPool& Pool(size_t pkg_idx = 0) {
// Only one cluster: use its pool, typically a whole socket.
if (AllClusters(pkg_idx).NumWorkers() == 1) {
return Cluster(pkg_idx, 0);
}
// One worker per cluster to maximize bandwidth availability.
return AllClusters(pkg_idx);
}

private:
class Package {
public:
Package() = default;  // for vector
Package(const BoundedTopology& topology, const Allocator& allocator,
PinningPolicy& pinning, size_t pkg_idx,
size_t max_workers_per_package);

size_t NumClusters() const { return clusters_.size(); }
size_t MaxWorkersPerCluster() const {
size_t max_workers_per_cluster = 0;
for (const PoolPtr& cluster : clusters_) {
max_workers_per_cluster =
HWY_MAX(max_workers_per_cluster, cluster->NumWorkers());
}
return max_workers_per_cluster;
}
size_t TotalWorkers() const {
size_t total_workers = 0;
for (const PoolPtr& cluster : clusters_) {
total_workers += cluster->NumWorkers();
}
return total_workers;
}

hwy::ThreadPool& AllClusters() { return *all_clusters_; }
hwy::ThreadPool& Cluster(size_t cluster_idx) {
HWY_DASSERT(cluster_idx < clusters_.size());
return *clusters_[cluster_idx];
}

// Reasonably tight upper bounds for allocating thread-local storage (TLS).
size_t MaxWorkersPerCluster() const { return max_workers_per_cluster_; }
size_t MaxWorkers() const { return NumClusters() * MaxWorkersPerCluster(); }

// For ShowConfig
const char* PinString() const { return pin_string_; }

// Returns a single pool on the given package: either one thread per cluster
// if there is more than one, which maximizes available memory bandwidth, or
// the first cluster, which is typically the whole package. For use by
// callers that only have a single parallel-for.
// DEPRECATED: use ParallelFor instead.
hwy::ThreadPool& Pool(size_t pkg_idx = 0) {
// Only one cluster: use its pool, typically a whole socket.
if (NumClusters() == 1) return Cluster(0);
// One worker per cluster to maximize bandwidth availability.
return AllClusters();
}

private:
void SetWaitMode(hwy::PoolWaitMode wait_mode) {
all_clusters_->SetWaitMode(wait_mode);
for (PoolPtr& cluster : clusters_) {

@@ -214,30 +168,17 @@ class NestedPools {
}
}

private:
// Must be freed after `clusters_` because it reserves threads which are
// the main threads of `clusters_`.
PoolPtr all_clusters_;
std::vector<PoolPtr> clusters_;
};  // Package

void SetWaitMode(hwy::PoolWaitMode wait_mode) {
all_packages_->SetWaitMode(wait_mode);
for (Package& package : packages_) {
package.SetWaitMode(wait_mode);
}
}

PinningPolicy pinning_;
bool all_pinned_;
const char* pin_string_;

std::vector<Package> packages_;
PoolPtr all_packages_;
// Must be freed after `clusters_` because it reserves threads which are
// the main threads of `clusters_`.
PoolPtr all_clusters_;
std::vector<PoolPtr> clusters_;

// For TLS indices. One might think this belongs in BoundedTopology, but it
// depends on max_threads, which is passed to the NestedPools constructor.
size_t max_clusters_per_package_ = 0;
// Used by `PoolWorkerMapping`. This depends on the `max_threads` argument,
// hence we can only compute this here, not in `BoundedTopology`.
size_t max_workers_per_cluster_ = 0;
};

@@ -362,14 +303,11 @@ void ParallelizeTwoRanges(const IndexRangePartition& get1,
template <class Func>
void HierarchicalParallelFor(size_t num_tasks, NestedPools& pools,
const Func& func) {
// Even if there are multiple packages, we only use the first.
const size_t pkg_idx = 0;

// If few tasks, run on a single cluster. Also avoids a bit of overhead if
// there is only one cluster.
hwy::ThreadPool& all_clusters = pools.AllClusters(pkg_idx);
hwy::ThreadPool& all_clusters = pools.AllClusters();
const size_t num_clusters = all_clusters.NumWorkers();
hwy::ThreadPool& cluster = pools.Cluster(pkg_idx, 0);
hwy::ThreadPool& cluster = pools.Cluster(0);
if (num_clusters == 1 || num_tasks <= cluster.NumWorkers()) {
return cluster.Run(0, num_tasks, [&](uint64_t task, size_t thread) {
func(task, thread);

@@ -382,7 +320,7 @@ void HierarchicalParallelFor(size_t num_tasks, NestedPools& pools,
ParallelizeOneRange(
ranges, all_clusters,
[&](const IndexRange& range, const size_t cluster_idx) {
hwy::ThreadPool& cluster = pools.Cluster(pkg_idx, cluster_idx);
hwy::ThreadPool& cluster = pools.Cluster(cluster_idx);
const size_t cluster_base = cluster_idx * pools.MaxWorkersPerCluster();
cluster.Run(range.begin(), range.end(),
[&](uint64_t task, size_t thread) {
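With packages gone, `NestedPools` exposes only `AllClusters()`, `Cluster(cluster_idx)`, `NumClusters()` and `MaxWorkersPerCluster()`. A small sketch of deriving globally unique worker indices, mirroring `HierarchicalParallelFor` above (the TLS comment is illustrative):

void ForEachClusterWorker(gcpp::NestedPools& pools) {
  hwy::ThreadPool& all_clusters = pools.AllClusters();
  all_clusters.Run(0, pools.NumClusters(), [&](uint64_t cluster_idx, size_t) {
    // Offset by the shared per-cluster maximum so indices never collide,
    // even when clusters have unequal worker counts.
    const size_t cluster_base = cluster_idx * pools.MaxWorkersPerCluster();
    hwy::ThreadPool& cluster = pools.Cluster(cluster_idx);
    cluster.Run(0, cluster.NumWorkers(), [&](uint64_t, size_t thread) {
      const size_t global_worker = cluster_base + thread;
      (void)global_worker;  // e.g. index into per-worker TLS
    });
  });
}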
@@ -79,18 +79,15 @@ static void TunePool(hwy::PoolWaitMode wait_mode, hwy::ThreadPool& pool) {
}

static void TunePools(hwy::PoolWaitMode wait_mode, NestedPools& pools) {
TunePool(wait_mode, pools.AllPackages());
for (size_t pkg_idx = 0; pkg_idx < pools.NumPackages(); ++pkg_idx) {
hwy::ThreadPool& clusters = pools.AllClusters(pkg_idx);
hwy::ThreadPool& clusters = pools.AllClusters();
TunePool(wait_mode, clusters);

// Run in parallel because Turin CPUs have 16, and in real usage, we often
// run all at the same time.
clusters.Run(0, clusters.NumWorkers(),
[&](uint64_t cluster_idx, size_t /*thread*/) {
TunePool(wait_mode, pools.Cluster(pkg_idx, cluster_idx));
TunePool(wait_mode, pools.Cluster(cluster_idx));
});
}
}

ThreadingContext::ThreadingContext(const ThreadingArgs& args)
@@ -153,10 +153,7 @@ enum class ParallelismStrategy : uint8_t {
template <class Func>
void ParallelFor(ParallelismStrategy parallelism, size_t num_tasks,
ThreadingContext& ctx, size_t cluster_idx, const Func& func) {
HWY_DASSERT(ctx.topology.NumPackages() == 1);
const size_t pkg_idx = 0;

HWY_DASSERT(cluster_idx < ctx.topology.NumClusters(pkg_idx));
HWY_DASSERT(cluster_idx < ctx.topology.NumClusters());
if (cluster_idx != 0) {
// If already running across clusters, only use within-cluster modes.
HWY_DASSERT(parallelism == ParallelismStrategy::kNone ||

@@ -173,7 +170,7 @@ void ParallelFor(ParallelismStrategy parallelism, size_t num_tasks,
}

case ParallelismStrategy::kAcrossClusters:
return ctx.pools.AllClusters(pkg_idx).Run(
return ctx.pools.AllClusters().Run(
0, num_tasks,
[&](uint64_t task, size_t cluster_idx) { func(task, cluster_idx); });

@@ -181,7 +178,7 @@ void ParallelFor(ParallelismStrategy parallelism, size_t num_tasks,
// Ensure the worker argument is unique across clusters, because it is
// used for TLS indexing for example in profiler.h.
const size_t base = ctx.Worker(cluster_idx);
return ctx.pools.Cluster(pkg_idx, cluster_idx)
return ctx.pools.Cluster(cluster_idx)
.Run(0, num_tasks, [&](uint64_t task, size_t worker) {
func(task, base + worker);
});

@@ -190,15 +187,15 @@ void ParallelFor(ParallelismStrategy parallelism, size_t num_tasks,
case ParallelismStrategy::kFlat: {
// Check for single cluster; if not, we must compute `cluster_base` for
// consistent and non-overlapping worker indices.
hwy::ThreadPool& all_clusters = ctx.pools.AllClusters(pkg_idx);
hwy::ThreadPool& all_clusters = ctx.pools.AllClusters();
const size_t num_clusters = all_clusters.NumWorkers();
if (num_clusters == 1) {
return ctx.pools.Cluster(pkg_idx, cluster_idx)
return ctx.pools.Cluster(cluster_idx)
.Run(0, num_tasks,
[&](uint64_t task, size_t worker) { func(task, worker); });
}

return ctx.pools.AllClusters(pkg_idx).Run(
return ctx.pools.AllClusters().Run(
0, num_tasks, [&](uint64_t task, size_t cluster_idx) {
const size_t worker = ctx.Worker(cluster_idx);
func(task, worker);
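The `ParallelFor` template above dispatches on `ParallelismStrategy`; callers pass `cluster_idx = 0` unless they are already running inside a cluster, in which case only within-cluster modes are allowed. A hedged usage sketch (strategy names and the signature are from this diff; the task bodies are placeholders):

void ParallelForSketch(gcpp::ThreadingContext& ctx) {
  // kHierarchical: nested fork-join, across clusters and then within each.
  gcpp::ParallelFor(gcpp::ParallelismStrategy::kHierarchical, /*num_tasks=*/1024,
                    ctx, /*cluster_idx=*/0,
                    [&](size_t task, size_t worker) { (void)task; (void)worker; });
  // kFlat: a single-level parallel-for, no nested fork-join.
  gcpp::ParallelFor(gcpp::ParallelismStrategy::kFlat, /*num_tasks=*/64, ctx,
                    /*cluster_idx=*/0,
                    [&](size_t task, size_t worker) { (void)task; (void)worker; });
}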
@@ -99,23 +99,16 @@ TEST(ThreadingTest, TestBoundedTopology) {
const BoundedSlice all;
const BoundedSlice one(0, 1);
// All
{
BoundedTopology topology(all, all, all);
fprintf(stderr, "%s\n", topology.TopologyString());
}

// Max one package
{
BoundedTopology topology(one, all, all);
fprintf(stderr, "%s\n", topology.TopologyString());
ASSERT_EQ(1, topology.NumPackages());
}

// Max one cluster
{
BoundedTopology topology(all, one, all);
BoundedTopology topology(one, one, all);
fprintf(stderr, "%s\n", topology.TopologyString());
ASSERT_EQ(1, topology.NumClusters(0));
ASSERT_EQ(1, topology.NumClusters());
}
}

@@ -380,24 +373,32 @@ TEST(ThreadingTest, BenchJoin) {
ThreadingArgs threading_args;
ThreadingContext ctx(threading_args);
NestedPools& pools = ctx.pools;
// Use last package because the main thread has been pinned to it.
const size_t pkg_idx = pools.NumPackages() - 1;

measure(pools.AllPackages(), false, "block packages");
if (pools.AllClusters(pkg_idx).NumWorkers() > 1) {
measure(pools.AllClusters(pkg_idx), false, "block clusters");
if (pools.NumClusters() > 1) {
measure(pools.AllClusters(), false, "block clusters");
}
measure(pools.Cluster(pkg_idx, 0), false, "block in_cluster");
measure(pools.Cluster(0), false, "block in_cluster");

if (pools.AllPinned()) {
const bool kSpin = true;
measure(pools.AllPackages(), kSpin, "spin packages");
if (pools.AllClusters(pkg_idx).NumWorkers() > 1) {
measure(pools.AllClusters(pkg_idx), kSpin, "spin clusters");
if (pools.NumClusters() > 1) {
measure(pools.AllClusters(), kSpin, "spin clusters");
}
measure(pools.Cluster(pkg_idx, 0), kSpin, "spin in_cluster");
measure(pools.Cluster(0), kSpin, "spin in_cluster");
}
}

TEST(ThreadingTest, TestUnequalClusters) {
ThreadingArgs threading_args;
threading_args.max_lps = 13;
ThreadingContext ctx(threading_args);
const size_t last_workers =
ctx.pools.Cluster(ctx.topology.NumClusters() - 1).NumWorkers();
const size_t max_workers = ctx.pools.MaxWorkersPerCluster();
fprintf(stderr, "%zu clusters, last with %zu (max %zu)\n",
ctx.topology.NumClusters(), last_workers, max_workers);
HWY_ASSERT(last_workers <= max_workers);
}

} // namespace
} // namespace gcpp
util/topology.cc
@@ -18,21 +18,12 @@
#include <stdio.h>

#include <algorithm>  // std::sort
#include <utility>  // std::move
#include <vector>

#include "hwy/base.h"

namespace gcpp {

// Sort T := packages/clusters by descending 'size' so that users who only use
// one Group get the largest.
template <class T>
static void SortByDescendingSize(std::vector<T>& groups) {
std::sort(groups.begin(), groups.end(),
[](const T& a, const T& b) { return a.Size() > b.Size(); });
}

// Returns set of LPs available for use.
static LPS EnabledLPs(const BoundedSlice& lp_slice) {
LPS enabled_lps;
@@ -88,21 +79,23 @@ BoundedTopology::BoundedTopology(BoundedSlice package_slice,
BoundedSlice cluster_slice,
BoundedSlice lp_slice)
: package_slice_(package_slice), cluster_slice_(cluster_slice) {
HWY_ASSERT(package_slice_.Max() == 1);
const LPS enabled_lps = EnabledLPs(lp_slice);

bool topology_ok = false;
#if !GEMMA_DISABLE_TOPOLOGY
if (HWY_LIKELY(!topology_.packages.empty())) {
InitFromTopology(enabled_lps);
topology_ok = InitFromTopology(enabled_lps);
}
#endif

// Topology unknown or no packages with enabled LPs: create a single
// package with one cluster, and one node.
if (HWY_UNLIKELY(NumPackages() == 0)) {
if (HWY_UNLIKELY(!topology_ok)) {
InitFromLPs(enabled_lps);
}

HWY_ASSERT(NumPackages() != 0 && NumClusters(0) != 0 && NumNodes() != 0);
HWY_ASSERT(NumClusters() != 0 && NumNodes() != 0);
}

// Topology is unknown, take the given set of LPs.
@@ -161,9 +154,113 @@ constexpr bool kSplitLargeClusters = false;
constexpr size_t kMaxClusters = 8;
constexpr size_t kMaxLPsPerCluster = 6;

// Topology is unknown, use only the given LPs which derive from OS affinity
// and `lp_slice`.
BoundedTopology::Package::Package(const LPS& enabled_lps) {
#if !GEMMA_DISABLE_TOPOLOGY

static size_t CoresFromLPs(const LPS& lps, const hwy::Topology& topology) {
LPS cores;
lps.Foreach([&](size_t lp) {
if (topology.lps[lp].smt == 0) cores.Set(lp);
});
return cores.Count();
}

// tcluster is a modifiable copy of the first cluster in the package.
void BoundedTopology::SplitLargeCluster(const LPS& enabled_lps,
hwy::Topology::Cluster tcluster) {
const LPS lps = clusters_[0].LPSet();  // copy so we can clear
clusters_.clear();

// Split `lps` into several clusters.
LPS clusters_lps[kMaxClusters];
const size_t num_clusters =
HWY_MIN(kMaxClusters, hwy::DivCeil(lps.Count(), kMaxLPsPerCluster));
size_t num_lps = 0;
lps.Foreach(
[&](size_t lp) { clusters_lps[num_lps++ % num_clusters].Set(lp); });
HWY_DASSERT(num_lps == lps.Count());

// Create new clusters, just inserting the new LPS.
for (size_t cluster_idx = 0; cluster_idx < num_clusters; ++cluster_idx) {
tcluster.lps = clusters_lps[cluster_idx];
// Keep same `private_kib` and `shared_kib`.
clusters_.push_back(Cluster(enabled_lps, topology_.lps, tcluster));
}
}

// Main part of ctor, called when topology is known.
bool BoundedTopology::InitFromTopology(const LPS& enabled_lps) {
const size_t tpkg_idx = package_slice_.Begin();
HWY_ASSERT(tpkg_idx < topology_.packages.size());
const hwy::Topology::Package& tpackage = topology_.packages[tpkg_idx];
const std::vector<hwy::Topology::Cluster>& tclusters = tpackage.clusters;
if (HWY_UNLIKELY(tclusters.empty())) {
HWY_WARN("Topology: no clusters found in package %zu.", tpkg_idx);
return false;
}

size_t max_tcluster_cores = 0;
size_t max_tcluster_lps = 0;
for (const hwy::Topology::Cluster& tcluster : tclusters) {
const size_t cores = CoresFromLPs(tcluster.lps, topology_);
const size_t lps = tcluster.lps.Count();
max_tcluster_cores = HWY_MAX(max_tcluster_cores, cores);
max_tcluster_lps = HWY_MAX(max_tcluster_lps, lps);
}
HWY_ASSERT(max_tcluster_cores != 0);
HWY_ASSERT(max_tcluster_lps >= max_tcluster_cores);

// Populate `clusters` with the subset of clusters in `cluster_slice` that
// have any enabled LPs.
clusters_.reserve(cluster_slice_.Num(tclusters.size()));
cluster_slice_.Foreach("cluster", tclusters.size(), [&](size_t cluster_idx) {
const hwy::Topology::Cluster& tcluster = tpackage.clusters[cluster_idx];
Cluster cluster(enabled_lps, topology_.lps, tcluster);

// Skip if empty, i.e. too few `enabled_lps`.
if (HWY_LIKELY(cluster.NumWorkers() != 0)) {
clusters_.push_back(cluster);
// Remember NUMA nodes that we are actually using (not just enabled).
nodes_.Set(cluster.Node());
}
});
if (HWY_UNLIKELY(clusters_.empty())) {
HWY_WARN("Too restrictive cluster_slice or enabled_lps, no clusters left.");
return false;
}

if (kSplitLargeClusters && clusters_.size() == 1 &&
enabled_lps.Count() >= 16) {
SplitLargeCluster(enabled_lps, tpackage.clusters[0]);
}

// Sort by descending 'size' so that users who only use one get the largest.
std::sort(clusters_.begin(), clusters_.end(),
[](const Cluster& a, const Cluster& b) {
return a.NumWorkers() > b.NumWorkers();
});

// Largest number of enabled workers in any cluster, for `topology_string_`.
// This may be less than `max_tcluster_cores` if `enabled_lps` excludes some.
size_t max_cluster_workers = 0;
for (const Cluster& c : clusters_) {
max_cluster_workers = HWY_MAX(max_cluster_workers, c.NumWorkers());
}
HWY_ASSERT(max_cluster_workers <= max_tcluster_cores);
// Do not warn about large clusters: GNR has 40.

snprintf(topology_string_, sizeof(topology_string_),
"%zuS %zuX %zuC %zuH, using %zuX %zuC (nodes=%zu)",
topology_.packages.size(), tclusters.size(), max_tcluster_cores,
max_tcluster_lps / max_tcluster_cores, NumClusters(),
max_cluster_workers, nodes_.Count());
return true;
}

#endif  // !GEMMA_DISABLE_TOPOLOGY

// Called when topology is unknown or `GEMMA_DISABLE_TOPOLOGY`. Uses only the
// given LPs which derive from OS affinity and `lp_slice`.
void BoundedTopology::InitFromLPs(const LPS& enabled_lps) {
LPS clusters_lps[kMaxClusters];
const size_t num_clusters =
kSplitLargeClusters
@@ -178,157 +275,11 @@ BoundedTopology::Package::Package(const LPS& enabled_lps) {
});

for (size_t cluster_idx = 0; cluster_idx < num_clusters; ++cluster_idx) {
clusters.push_back(Cluster(clusters_lps[cluster_idx]));
clusters_.push_back(Cluster(clusters_lps[cluster_idx]));
}
}

// NOTE: caller is responsible for checking whether `clusters` is empty.
BoundedTopology::Package::Package(const LPS& enabled_lps,
const hwy::Topology& topology, size_t pkg_idx,
BoundedSlice cluster_slice) {
const hwy::Topology::Package& tpackage = topology.packages[pkg_idx];
// Populate `clusters` with the subset of clusters in `cluster_slice` that
// have any enabled LPs. If `clusters` remains empty, the caller will
// skip this `Package`.
clusters.reserve(cluster_slice.Num(tpackage.clusters.size()));
cluster_slice.Foreach(
"cluster", tpackage.clusters.size(), [&](size_t cluster_idx) {
const hwy::Topology::Cluster& tcluster = tpackage.clusters[cluster_idx];
Cluster cluster(enabled_lps, topology.lps, tcluster);

// Skip if empty, i.e. too few `enabled_lps`.
if (HWY_LIKELY(cluster.Size() != 0)) {
clusters.push_back(cluster);
}
});
SortByDescendingSize(clusters);

// If there is only one large cluster, split it into smaller ones.
if (kSplitLargeClusters && clusters.size() == 1 &&
enabled_lps.Count() >= 16) {
const LPS lps = clusters[0].LPSet();  // copy so we can clear
clusters.clear();

// Split `lps` into several clusters.
LPS clusters_lps[kMaxClusters];
const size_t num_clusters =
HWY_MIN(kMaxClusters, hwy::DivCeil(lps.Count(), kMaxLPsPerCluster));
size_t num_lps = 0;
lps.Foreach(
[&](size_t lp) { clusters_lps[num_lps++ % num_clusters].Set(lp); });
HWY_DASSERT(num_lps == lps.Count());

// Create new clusters, just inserting the new LPS.
hwy::Topology::Cluster tcluster = tpackage.clusters[0];  // modifiable copy
for (size_t cluster_idx = 0; cluster_idx < num_clusters; ++cluster_idx) {
tcluster.lps = clusters_lps[cluster_idx];
// Keep same `private_kib` and `shared_kib`.
clusters.push_back(Cluster(enabled_lps, topology.lps, tcluster));
}
}
}

#if !GEMMA_DISABLE_TOPOLOGY

static size_t CoresFromLPs(const LPS& lps, const hwy::Topology& topology) {
LPS cores;
lps.Foreach([&](size_t lp) {
if (topology.lps[lp].smt == 0) cores.Set(lp);
});
return cores.Count();
}

// Scans hwy::Topology for clusters and their size, for use by topology_string_.
static void ScanTClusters(hwy::Topology& topology_, size_t& max_tclusters,
size_t& max_tcluster_cores,
size_t& max_tcluster_lps) {
max_tclusters = 0;
max_tcluster_cores = 0;
max_tcluster_lps = 0;
for (size_t pkg_idx = 0; pkg_idx < topology_.packages.size(); ++pkg_idx) {
const std::vector<hwy::Topology::Cluster>& tclusters =
topology_.packages[pkg_idx].clusters;
max_tclusters = HWY_MAX(max_tclusters, tclusters.size());
size_t tcluster_cores = 0;
size_t tcluster_lps = 0;
for (size_t cluster_idx = 0; cluster_idx < tclusters.size();
++cluster_idx) {
const size_t cores = CoresFromLPs(tclusters[cluster_idx].lps, topology_);
const size_t lps = tclusters[cluster_idx].lps.Count();
tcluster_cores = HWY_MAX(tcluster_cores, cores);
tcluster_lps = HWY_MAX(tcluster_lps, lps);
}

if (tclusters.size() > 1 && tcluster_cores > 8) {
HWY_WARN(
"Package %zu: multiple clusters with max size %zu, whereas CCX "
"only have 8, may indicate a bug in hwy::Topology.",
pkg_idx, tcluster_cores);
}
max_tcluster_cores = HWY_MAX(max_tcluster_cores, tcluster_cores);
max_tcluster_lps = HWY_MAX(max_tcluster_lps, tcluster_lps);
}
HWY_ASSERT(max_tclusters != 0);
HWY_ASSERT(max_tcluster_cores != 0);
HWY_ASSERT(max_tcluster_lps >= max_tcluster_cores);
}

// Main part of ctor, called when topology is known.
void BoundedTopology::InitFromTopology(const LPS& enabled_lps) {
size_t max_tclusters, max_tcluster_cores, max_tcluster_lps;
ScanTClusters(topology_, max_tclusters, max_tcluster_cores, max_tcluster_lps);

// (Possibly empty) subset of `Topology` packages that have `enabled_lps`.
package_slice_.Foreach(
"package", topology_.packages.size(), [&](size_t pkg_idx) {
Package package(enabled_lps, topology_, pkg_idx, cluster_slice_);
// Skip if empty, i.e. too few `enabled_lps`.
if (HWY_LIKELY(!package.clusters.empty())) {
packages_.push_back(std::move(package));
}
});
if (NumPackages() == 0) return;
SortByDescendingSize(packages_);

// Remember NUMA nodes that we are actually using (not just enabled).
for (const Package& p : packages_) {
for (const Cluster& c : p.clusters) {
nodes_.Set(c.Node());
}
}

// Scan for max BoundedTopology clusters and their size, for topology_string_.
size_t all_max_cluster_size = 0;
for (size_t pkg_idx = 0; pkg_idx < NumPackages(); ++pkg_idx) {
size_t max_cluster_size = 0;
for (size_t cluster_idx = 0; cluster_idx < NumClusters(pkg_idx);
++cluster_idx) {
max_cluster_size =
HWY_MAX(max_cluster_size, GetCluster(pkg_idx, cluster_idx).Size());
}
if (NumClusters(pkg_idx) > 1 && max_cluster_size > 8) {
HWY_WARN(
"Package %zu: multiple clusters with max size %zu, whereas CCX "
"only have 8, may indicate a bug in BoundedTopology.",
pkg_idx, max_cluster_size);
}
all_max_cluster_size = HWY_MAX(all_max_cluster_size, max_cluster_size);
}

snprintf(topology_string_, sizeof(topology_string_),
"%zuS %zuX %zuC %zuH, using %zuS %zuX %zuC (nodes=%zu)",
topology_.packages.size(), max_tclusters, max_tcluster_cores,
max_tcluster_lps / max_tcluster_cores, packages_.size(),
NumClusters(0), all_max_cluster_size, nodes_.Count());
}

#endif  // !GEMMA_DISABLE_TOPOLOGY

void BoundedTopology::InitFromLPs(const LPS& enabled_lps) {
packages_.push_back(Package(enabled_lps));

snprintf(topology_string_, sizeof(topology_string_), "LPs=%zu",
GetCluster(0, 0).Size());
GetCluster(0).NumWorkers());

// Assume a single NUMA node.
nodes_.Set(0);
@@ -40,6 +40,7 @@ class BoundedSlice {
BoundedSlice(size_t skip = 0, size_t max = 0) : skip_(skip), max_(max) {}

size_t Begin() const { return skip_; }
size_t Max() const { return max_; }

// STL-style one past the end.
size_t End(size_t detected) const {

@@ -82,12 +83,11 @@ using LPS = hwy::LogicalProcessorSet;
// back to a single package and cluster.
class BoundedTopology {
public:
// Defaults to "use all detected".
BoundedTopology(BoundedSlice package_slice = BoundedSlice(),
// `package_slice` must have `Max() == 1`. Others default to "use all".
BoundedTopology(BoundedSlice package_slice,
BoundedSlice cluster_slice = BoundedSlice(),
BoundedSlice lp_slice = BoundedSlice());

size_t NumPackages() const { return packages_.size(); }
size_t NumNodes() const { return nodes_.Count(); }
const char* TopologyString() const { return topology_string_; }

@@ -98,8 +98,7 @@ class BoundedTopology {
const std::vector<hwy::Topology::LP>& all_lps,
const hwy::Topology::Cluster& tcluster);

// For SortByDescendingSize.
size_t Size() const { return num_workers_; }
size_t NumWorkers() const { return num_workers_; }

// Returns vector with all enabled LPs, used for pinning.
std::vector<size_t> LPVector() const {

@@ -127,26 +126,11 @@ class BoundedTopology {
size_t shared_kib_ = 0;
};  // Cluster

size_t NumClusters(size_t pkg_idx) const {
HWY_ASSERT(pkg_idx < NumPackages());
return packages_[pkg_idx].clusters.size();
size_t NumClusters() const { return clusters_.size(); }
const Cluster& GetCluster(size_t cluster_idx) const {
HWY_ASSERT(cluster_idx < clusters_.size());
return clusters_[cluster_idx];
}
const Cluster& GetCluster(size_t pkg_idx, size_t cluster_idx) const {
HWY_ASSERT(pkg_idx < NumPackages());
const Package& package = packages_[pkg_idx];
HWY_ASSERT(cluster_idx < package.clusters.size());
return package.clusters[cluster_idx];
}
Cluster& GetCluster(size_t pkg_idx, size_t cluster_idx) {
HWY_ASSERT(pkg_idx < NumPackages());
Package& package = packages_[pkg_idx];
HWY_ASSERT(cluster_idx < package.clusters.size());
return package.clusters[cluster_idx];
}

#if !GEMMA_DISABLE_TOPOLOGY
const hwy::Topology& FullTopology() const { return topology_; }
#endif

// In case we are running with a subset of packages/clusters, these are added
// to the package/cluster indices for purposes of the thread name, so that

@@ -155,26 +139,17 @@ class BoundedTopology {
size_t SkippedClusters() const { return cluster_slice_.Begin(); }

private:
struct Package {
explicit Package(const LPS& enabled_lps);
Package(const LPS& enabled_lps, const hwy::Topology& topology,
size_t pkg_idx, BoundedSlice cluster_slice);

// For SortByDescendingSize.
size_t Size() const { return clusters.size(); }

std::vector<Cluster> clusters;
};  // Package

void InitFromTopology(const LPS& enabled_lps);
void SplitLargeCluster(const LPS& enabled_lps,
hwy::Topology::Cluster tcluster);
bool InitFromTopology(const LPS& enabled_lps);
void InitFromLPs(const LPS& enabled_lps);

#if !GEMMA_DISABLE_TOPOLOGY
hwy::Topology topology_;
#endif
BoundedSlice package_slice_;
BoundedSlice package_slice_;  // Within the entire detected topology.
BoundedSlice cluster_slice_;
std::vector<Package> packages_;
std::vector<Cluster> clusters_;
char topology_string_[96];
LPS nodes_;
};