diff --git a/evals/benchmark_helper.cc b/evals/benchmark_helper.cc
index 55e99cf..e9fdafb 100644
--- a/evals/benchmark_helper.cc
+++ b/evals/benchmark_helper.cc
@@ -241,8 +241,8 @@ void ShowConfig(const LoaderArgs& loader, const ThreadingArgs& threading,
       dt, cpu100, static_cast<int>(threading.bind),
       ctx.topology.TopologyString(), ctx.pools.PinString(),
       CacheString().c_str(), hwy::TargetName(hwy::DispatchedTarget()),
-      ctx.allocator.VectorBytes() * 8, CompiledConfig(), PROFILER_ENABLED,
-      ctx.allocator.TotalMiB());
+      ctx.cache_info.VectorBytes() * 8, CompiledConfig(),
+      PROFILER_ENABLED, ctx.allocator.TotalMiB());
 }
 
 }
diff --git a/ops/matmul-inl.h b/ops/matmul-inl.h
index 737feb6..65dc185 100644
--- a/ops/matmul-inl.h
+++ b/ops/matmul-inl.h
@@ -21,7 +21,7 @@
 
 #include "compression/types.h"
 #include "ops/matmul.h"  // IWYU pragma: export
-#include "util/allocator.h"
+#include "util/allocator.h"  // CacheInfo
 #include "util/basics.h"
 #include "util/mat.h"
 #include "util/threading_context.h"
@@ -566,7 +566,7 @@ class MMKernel {
 };
 
 // Miscellaneous stateless helper functions.
-struct MMImpl {
+class MMImpl {
   // Returns existing entry for the given key or -1.
   static HWY_INLINE intptr_t IndexOfKey(MMKeys::Key key, const MMKeys& keys) {
     const hwy::Span<const MMKeys::Key> all_keys = keys.Keys();
@@ -596,6 +596,63 @@ struct MMImpl {
     return -1;
   }
 
+ public:
+  static MMPerKey& FindOrAddPerKey(size_t M, size_t K, size_t N,
+                                   size_t vector_bytes,
+                                   MatMulEnv::PerCluster& per_cluster) {
+    const MMKeys::Key key = MMKeys::KeyFromDims(M, K, N);
+    intptr_t index = MMImpl::IndexOfKey(key, per_cluster.keys);
+    // First time we see this shape/key.
+    if (HWY_UNLIKELY(index < 0)) {
+      per_cluster.keys.Append(key, vector_bytes);
+
+      // Invalidates `MMAutoTune::Best()`.
+      std::vector<MMPerKey>& per_keys = per_cluster.per_key;
+      index = per_keys.size();
+      per_keys.push_back(MMPerKey());
+    }
+    return per_cluster.per_key[index];
+  }
+
+  static void NotifyAutotuneResult(size_t M, size_t K, size_t N, uint64_t t0,
+                                   const MMConfig& cfg, MatMulEnv& env,
+                                   MMAutoTune& tuner) {
+    const uint64_t t1 =
+        env.have_timer_stop ? hwy::timer::Stop() : hwy::timer::Start();
+    const double min_elapsed =
+        static_cast<double>(tuner.NotifyTicks(t1 - t0)) /
+        hwy::platform::InvariantTicksPerSecond();
+    const double flops = 2 * M * K * N / min_elapsed;  // * 2 for FMA
+    if (HWY_UNLIKELY(env.print_measurement && tuner.ShouldPrint())) {
+      fprintf(stderr, "%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu\n", flops * 1E-9,
+              min_elapsed * 1E3, cfg.MR(), cfg.MC(), cfg.KC(), cfg.NC(),
+              StringFromOrder(cfg.Order()), cfg.InnerTasks());
+    }
+    if (HWY_UNLIKELY(env.print_best && tuner.Best())) {
+      const auto ratio = [&tuner](uint64_t ticks) -> double {
+        return static_cast<double>(ticks) /
+               static_cast<double>(tuner.BestTicks());
+      };
+      const MMConfig& best = *tuner.Best();
+      fprintf(stderr,
+              "\n%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu,%.2f,%.2f\n",
+              M, K, N, flops * 1E-9, min_elapsed * 1E3, best.MR(), best.MC(),
+              best.KC(), best.NC(), StringFromOrder(best.Order()),
+              best.InnerTasks(), ratio(tuner.WorstMinTicks()),
+              ratio(tuner.FirstConfigTicks()));
+    }
+  }
+
+  static void EnsureAligned(const MatPtr& A, const size_t vector_bytes) {
+    // Ensure A rows are vector-aligned. Neither `Stride` nor `IsPacked` are
+    // reliable: the latter returns true for single rows, and the former may
+    // match `Cols` if the width matches the padding.
+    // Note that B is packed in matmul_test, but otherwise generally padded.
+ HWY_ASSERT(hwy::IsAligned(A.RowBytes(0), vector_bytes)); + if (A.Rows() > 1) { + HWY_ASSERT(hwy::IsAligned(A.RowBytes(1), vector_bytes)); + } + } + static size_t Worker(const MMArgs& args) { return args.options.cluster_idx * args.env->ctx.pools.MaxWorkersPerCluster(); @@ -753,14 +810,14 @@ struct MMImpl { // loops over the inner KC and MC. Member variables avoid long argument lists. class MMState { public: - MMState(const Extents2D A, const size_t B_rows, const MMArgs& args, + MMState(size_t M, size_t K, size_t N, const MMArgs& args, const MMConfig& config) : args_(args), - range_n_(0, B_rows), + range_n_(0, N), mr_(config.MR()), - ranges_mc_(config.RangesOfMC(A.rows)), - ranges_kc_(config.RangesOfKC(A.cols)), - ranges_nc_(config.RangesOfNC(B_rows)), + ranges_mc_(config.RangesOfMC(M)), + ranges_kc_(config.RangesOfKC(K)), + ranges_nc_(config.RangesOfNC(N)), order_(config.Order()), inner_tasks_(config.InnerTasks()) {} @@ -783,7 +840,7 @@ class MMState { // Compute size of per-worker storage for `kNR` row ranges of B. Stack // allocation avoids passing a worker index. static constexpr size_t B_stride_max_ = - kMaxKC + 2 * Allocator::MaxLineBytes() / sizeof(BF16); + kMaxKC + 2 * CacheInfo::MaxLineBytes() / sizeof(BF16); static constexpr size_t B_storage_max_ = kNR * B_stride_max_; // Granularity of `ForN`. B rows produce C columns, so we @@ -1056,88 +1113,48 @@ HWY_NOINLINE MMPerKey* MatMul(const MatPtrT& A, const MatPtrT& B, options.cluster_idx * env.ctx.pools.MaxWorkersPerCluster(), zone); - const Allocator& allocator = env.ctx.allocator; HWY_DASSERT(options.cluster_idx < env.row_ptrs.size()); - MatMulEnv::PerCluster& per_cluster = env.per_cluster[options.cluster_idx]; RowPtrs C_rows = GetOrSetTempRowPtrs(C, env.row_ptrs[options.cluster_idx]); const size_t M = A.Rows(); const size_t K = A.Cols(); const size_t N = B.Rows(); - const MMKeys::Key key = MMKeys::KeyFromDims(M, K, N); - intptr_t index = MMImpl::IndexOfKey(key, per_cluster.keys); - // First time we see this shape/key. - if (HWY_UNLIKELY(index < 0)) { - per_cluster.keys.Append(key, allocator); - // invalidates `MMAutoTune::Best()` - std::vector& per_keys = per_cluster.per_key; - index = per_keys.size(); - per_keys.push_back(MMPerKey()); - } - MMPerKey& per_key = per_cluster.per_key[index]; + const CacheInfo& cache = env.ctx.cache_info; + MMPerKey& per_key = MMImpl::FindOrAddPerKey( + M, K, N, cache.VectorBytes(), env.per_cluster[options.cluster_idx]); MMAutoTune& tuner = per_key.autotune; const MMArgs args(env, per_key, static_cast(A.Scale()) * B.Scale(), add, options); if (HWY_LIKELY(tuner.Best())) { - const MMState state(A.Extents(), B.Rows(), args, *tuner.Best()); + const MMState state(M, K, N, args, *tuner.Best()); const StridedViewBF A_view = MMImpl::MaybeDecompressA(A, args); state.DispatchParallelism(A_view, B, C_rows); return &per_key; } - // From here, CPU time is negligible except DoMatMul. - - // First call: enumerate all feasible configs. + // Autotuning, first call: enumerate all feasible configs. if (HWY_UNLIKELY(!tuner.HasCandidates())) { - // Ensure matrix dimensions match each other. + // Ensure matrix dimensions match each other (off the hot path). HWY_ASSERT(K == B.Cols()); HWY_ASSERT(M <= kMaxBatchSize); HWY_ASSERT(K <= MMStorage::kMaxK); HWY_ASSERT(N % kNR == 0); - // Ensure A rows are vector-aligned. Neither `Stride` nor `IsPacked` are - // reliable: the latter returns true for single rows, and the former may - // match `Cols` if the width matches the padding. 
- // Note that B is packed in matmul_test, but otherwise generally padded. - HWY_ASSERT(hwy::IsAligned(A.Row(0), env.ctx.allocator.LineBytes())); - if (A.Rows() > 1) { - HWY_ASSERT(hwy::IsAligned(A.Row(1), env.ctx.allocator.LineBytes())); - } - - tuner.SetCandidates(MMCandidates(allocator, M, K, N, sizeof(TC), kMaxMR, - kNR, env.print_config)); + MMImpl::EnsureAligned(A, cache.VectorBytes()); + tuner.SetCandidates( + MMCandidates(cache, M, K, N, sizeof(TC), env.print_config)); } + // (Also auto-tunes, hence outside the timed section to prevent interference.) + const StridedViewBF A_view = MMImpl::MaybeDecompressA(A, args); + const MMConfig& cfg = tuner.NextConfig(); const uint64_t t0 = hwy::timer::Start(); - MMState state(A.Extents(), B.Rows(), args, cfg); - const StridedViewBF A_view = MMImpl::MaybeDecompressA(A, args); + MMState state(M, K, N, args, cfg); state.DispatchParallelism(A_view, B, C_rows); - const uint64_t t1 = - env.have_timer_stop ? hwy::timer::Stop() : hwy::timer::Start(); - const double min_elapsed = static_cast(tuner.NotifyTicks(t1 - t0)) / - hwy::platform::InvariantTicksPerSecond(); - const double flops = 2 * M * K * N / min_elapsed; // * 2 for FMA - if (HWY_UNLIKELY(env.print_measurement && tuner.ShouldPrint())) { - fprintf(stderr, "%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu\n", flops * 1E-9, - min_elapsed * 1E3, cfg.MR(), cfg.MC(), cfg.KC(), cfg.NC(), - StringFromOrder(cfg.Order()), cfg.InnerTasks()); - } - if (HWY_UNLIKELY(env.print_best && tuner.Best())) { - const auto ratio = [per_key](uint64_t ticks) -> double { - return static_cast(ticks) / - static_cast(per_key.autotune.BestTicks()); - }; - const MMConfig& best = *tuner.Best(); - fprintf(stderr, - "\n%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu,%.2f,%.2f\n", M, - K, N, flops * 1E-9, min_elapsed * 1E3, best.MR(), best.MC(), - best.KC(), best.NC(), StringFromOrder(best.Order()), - best.InnerTasks(), ratio(tuner.WorstMinTicks()), - ratio(tuner.FirstConfigTicks())); - } + MMImpl::NotifyAutotuneResult(M, K, N, t0, cfg, env, tuner); return &per_key; } diff --git a/ops/matmul.cc b/ops/matmul.cc index 35887a5..00330e5 100644 --- a/ops/matmul.cc +++ b/ops/matmul.cc @@ -62,22 +62,19 @@ size_t PrevDivisor(const size_t begin, const size_t end, const size_t dim, // and holds most of their arguments in member variables. class GenerateCandidates { public: - GenerateCandidates(const Allocator& allocator, size_t M, size_t K, size_t N, - size_t sizeof_TC, size_t max_mr, size_t nr, - bool print_config) - : allocator_(allocator), + GenerateCandidates(const CacheInfo& cache, size_t M, size_t K, size_t N, + size_t sizeof_TC, bool print_config) + : cache_(cache), M_(M), K_(K), N_(N), sizeof_TC_(sizeof_TC), - max_mr_(max_mr), - nr_(nr), // These influence kc/nc, but are also stored in `MMConfig` for // `RangesOf*`. Must be a vector multiple. The previous/next cache line // is likely still in L1, but we expect K > 1000 and might as well round // up to the line size. Both A and B are BF16. - kc_multiple_(HWY_MIN(K, allocator.LineBytes() / sizeof(BF16))), - nc_multiple_(allocator.StepBytes() / sizeof_TC), + kc_multiple_(HWY_MIN(K, cache.LineBytes() / sizeof(BF16))), + nc_multiple_(cache.StepBytes() / sizeof_TC), print_config_(print_config) {} std::vector operator()() const { @@ -127,10 +124,10 @@ class GenerateCandidates { SizeVec all_mr; all_mr.reserve(3); // AVX2's 16 registers are not enough for four rows, but SSE4 may benefit. 
- if (M_ >= max_mr_ && !is_avx2) all_mr.push_back(max_mr_); + if (M_ >= kMaxMR && !is_avx2) all_mr.push_back(kMaxMR); // Allow for AVX-512 but not SSE4 (for which 4 are usually better). Also // enable if not enough rows for 4. - if (M_ >= 2 && (M_ < max_mr_ || (!is_sse && !is_wasm))) { + if (M_ >= 2 && (M_ < kMaxMR || (!is_sse && !is_wasm))) { all_mr.push_back(size_t{2}); } // Even SSE4 usually prefers 2 rows; only enable for single rows. @@ -172,8 +169,8 @@ class GenerateCandidates { // size. This results in an overestimate, and the loop below will propose // the next few smaller values for the autotuner to evaluate. const size_t bytes_ab = - allocator_.L1Bytes() * (sizeof(BF16) + sizeof(SfpStream)); - const size_t col_bytes = rows_a * sizeof(BF16) + nr_ * sizeof(BF16); + cache_.L1Bytes() * (sizeof(BF16) + sizeof(SfpStream)); + const size_t col_bytes = rows_a * sizeof(BF16) + kNR * sizeof(BF16); size_t kc_max = hwy::DivCeil(bytes_ab, col_bytes); kc_max = RoundDownWithFloor(HWY_MIN(kc_max, kMaxKC), kc_multiple_); kc_max = HWY_MIN(kc_max, K_); @@ -213,14 +210,14 @@ class GenerateCandidates { SizeVec MC(size_t mr, size_t kc, MMOrder order) const { // Typically 12-24K. The B rows are pinned in L1, but also occupy L2 because // it is typically inclusive. - const size_t bytes_b = nr_ * kc * (sizeof(SfpStream) + sizeof(BF16)); + const size_t bytes_b = kNR * kc * (sizeof(SfpStream) + sizeof(BF16)); // Choose the largest feasible `mc_max` (A/C rows) to maximize reuse of the // packed B. We want `mc * kc` elements of A to fit in L2, alongside // `bytes_b` plus `mc` cache lines because resident-A updates `mc` rows of // partial. - const size_t bytes_per_mc = kc * sizeof(BF16) + allocator_.LineBytes(); - size_t mc_max = hwy::DivCeil(allocator_.L2Bytes() - bytes_b, bytes_per_mc); + const size_t bytes_per_mc = kc * sizeof(BF16) + cache_.LineBytes(); + size_t mc_max = hwy::DivCeil(cache_.L2Bytes() - bytes_b, bytes_per_mc); mc_max = HWY_MIN(mc_max, kMaxBatchSize); HWY_DASSERT(mc_max != 0); mc_max = HWY_MIN(mc_max, M_); @@ -261,7 +258,7 @@ class GenerateCandidates { // Otherwise, leave it unbounded. if (M_ > mr) { const size_t bytes_per_nc = (kc * sizeof(BF16) + mc * sizeof_TC_); - nc_max = HWY_MIN(hwy::DivCeil(allocator_.L3Bytes(), bytes_per_nc), N_); + nc_max = HWY_MIN(hwy::DivCeil(cache_.L3Bytes(), bytes_per_nc), N_); } HWY_DASSERT(nc_max != 0); nc_max = RoundDownWithFloor(nc_max, nc_multiple_); @@ -328,15 +325,12 @@ class GenerateCandidates { return inner_tasks; } - const Allocator& allocator_; + const CacheInfo& cache_; const size_t M_; const size_t K_; const size_t N_; const size_t sizeof_TC_; - const size_t max_mr_; - const size_t nr_; - const size_t kc_multiple_; const size_t nc_multiple_; @@ -346,12 +340,10 @@ class GenerateCandidates { } // namespace // Facade to avoid exposing `GenerateCandidates` in the header. 
-std::vector<MMConfig> MMCandidates(const Allocator& allocator, size_t M,
-                                   size_t K, size_t N, size_t sizeof_TC,
-                                   size_t max_mr, size_t nr,
+std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t M, size_t K,
+                                   size_t N, size_t sizeof_TC,
                                    bool print_config) {
-  return GenerateCandidates(allocator, M, K, N, sizeof_TC, max_mr, nr,
-                            print_config)();
+  return GenerateCandidates(cache, M, K, N, sizeof_TC, print_config)();
 }
 
 MatMulEnv::MatMulEnv(ThreadingContext& ctx) : ctx(ctx) {
diff --git a/ops/matmul.h b/ops/matmul.h
index 8c7d724..641dad9 100644
--- a/ops/matmul.h
+++ b/ops/matmul.h
@@ -477,9 +477,9 @@ class MMConfig {
 static_assert(sizeof(MMConfig) == 32);  // for faster indexing
 #pragma pack(pop)
 
-std::vector<MMConfig> MMCandidates(const Allocator& allocator, size_t M,
-                                   size_t K, size_t N, size_t sizeof_TC,
-                                   size_t max_mr, size_t nr, bool print_config);
+std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t M, size_t K,
+                                   size_t N, size_t sizeof_TC,
+                                   bool print_config);
 
 // State machine for choosing the best `TConfig`, which is `MMConfig` for the
 // main MatMul autotuner.
@@ -619,11 +619,11 @@ class MMKeys {
   }
 
   // Must only be called if not already present in `Keys()`.
-  void Append(Key key, const Allocator& allocator) {
+  void Append(Key key, size_t vector_bytes) {
     // Dynamic allocation because the test checks many more dimensions than
     // would be reasonable to pre-allocate. DIY for alignment and padding.
     if (HWY_UNLIKELY(num_unique_ >= capacity_)) {
-      const size_t NU64 = allocator.VectorBytes() / sizeof(Key);
+      const size_t NU64 = vector_bytes / sizeof(Key);
       // Start at one vector so the size is always a multiple of N.
       if (HWY_UNLIKELY(capacity_ == 0)) {
         capacity_ = hwy::DivCeil(NU64, 2);  // will be doubled below
@@ -704,7 +704,7 @@ struct MMArgs {
         scale(scale),
         add(add),
         options(options),
-        line_bytes(env.ctx.allocator.LineBytes()) {}
+        line_bytes(env.ctx.cache_info.LineBytes()) {}
 
   MatMulEnv* env;
   MMPerKey* per_key;
diff --git a/util/allocator.cc b/util/allocator.cc
index f8bfdd5..f99586e 100644
--- a/util/allocator.cc
+++ b/util/allocator.cc
@@ -130,7 +130,7 @@ size_t DetectTotalMiB(size_t page_bytes) {
 
 }  // namespace
 
-Allocator::Allocator(const BoundedTopology& topology, bool enable_bind) {
+CacheInfo::CacheInfo(const BoundedTopology& topology) {
   line_bytes_ = DetectLineBytes();
   // Ensure MaxLineBytes() is an upper bound.
   HWY_ASSERT(MaxLineBytes() >= LineBytes());
@@ -138,8 +138,6 @@ Allocator::Allocator(const BoundedTopology& topology, bool enable_bind) {
   vector_bytes_ = hwy::VectorBytes();
   step_bytes_ = HWY_MAX(line_bytes_, vector_bytes_);
 
-  base_page_bytes_ = DetectPageSize();
-  quantum_bytes_ = step_bytes_;  // may overwrite below
 
   const BoundedTopology::Cluster& cluster = topology.GetCluster(0, 0);
   if (const hwy::Cache* caches = hwy::DataCaches()) {
@@ -153,8 +151,14 @@ Allocator::Allocator(const BoundedTopology& topology, bool enable_bind) {
   if (l3_bytes_ == 0) {
    l3_bytes_ = (cluster.SharedKiB() ?
cluster.SharedKiB() : 1024) << 10; } +} - total_mib_ = DetectTotalMiB(base_page_bytes_); +Allocator::Allocator(const BoundedTopology& topology, + const CacheInfo& cache_info, bool enable_bind) + : line_bytes_(cache_info.LineBytes()), + base_page_bytes_(DetectPageSize()), + total_mib_(DetectTotalMiB(base_page_bytes_)) { + quantum_bytes_ = cache_info.StepBytes(); // may overwrite below // Prerequisites for binding: // - supported by the OS (currently Linux only), diff --git a/util/allocator.h b/util/allocator.h index 42e261c..086b6e9 100644 --- a/util/allocator.h +++ b/util/allocator.h @@ -77,27 +77,49 @@ using AlignedPtr = std::unique_ptr; template using AlignedClassPtr = std::unique_ptr; -// Both allocation, binding, and row accessors depend on the sizes of memory -// pages and cache lines. To avoid having to pass `Allocator&` everywhere, we -// wrap this in a singleton. A monostate requires explicit initialization, -// which we prefer to avoid because there are many main() functions. -class Allocator { +// Holds cache line size/capacity and vector size. Stored in `ThreadingContext`. +class CacheInfo { public: - // Must be called at least once before any other function. Not thread-safe, - // hence only call this from the main thread. - Allocator(const BoundedTopology& topology, bool enable_bind); + CacheInfo(const BoundedTopology& topology); // Bytes per cache line, or a reasonable guess if unknown. Used to choose // ranges such that there will be no false sharing. size_t LineBytes() const { return line_bytes_; } // Upper bound on `LineBytes()`, for stack allocations. static constexpr size_t MaxLineBytes() { return 256; } + // Bytes per full vector. Used to compute loop steps. size_t VectorBytes() const { return vector_bytes_; } // Work granularity that avoids false sharing and partial vectors. // = HWY_MAX(LineBytes(), VectorBytes()) size_t StepBytes() const { return step_bytes_; } + // L1 and L2 are typically per core. + size_t L1Bytes() const { return l1_bytes_; } + size_t L2Bytes() const { return l2_bytes_; } + // Clusters often share an L3. We return the total size per package. + size_t L3Bytes() const { return l3_bytes_; } + + private: + size_t line_bytes_; + size_t vector_bytes_; + size_t step_bytes_; + + size_t l1_bytes_ = 0; + size_t l2_bytes_ = 0; + size_t l3_bytes_ = 0; +}; + +// NUMA-aware allocation and memory binding. Stored in `ThreadingContext`. +class Allocator { + public: + Allocator(const BoundedTopology& topology, const CacheInfo& cache_info, + bool enable_bind); + + // Used by `AllocateFor`, which only takes an `Allocator` argument, + // hence copy from `CacheInfo`. + size_t LineBytes() const { return line_bytes_; } + // File size multiple required for memory mapping. Also used when binding // memory to NUMA nodes (see `BindB/BindC`). size_t BasePageBytes() const { return base_page_bytes_; } @@ -105,12 +127,6 @@ class Allocator { // Desired allocator alignment: Either StepBytes, or BasePageBytes if NUMA. size_t QuantumBytes() const { return quantum_bytes_; } - // L1 and L2 are typically per core. - size_t L1Bytes() const { return l1_bytes_; } - size_t L2Bytes() const { return l2_bytes_; } - // Clusters often share an L3. We return the total size per package. 
- size_t L3Bytes() const { return l3_bytes_; } - size_t TotalMiB() const { return total_mib_; } size_t FreeMiB() const; @@ -159,18 +175,11 @@ class Allocator { bool BindMemory(void* p, size_t bytes, size_t node) const; private: - size_t line_bytes_; - size_t vector_bytes_; - size_t step_bytes_; - size_t base_page_bytes_; + const size_t line_bytes_; + const size_t base_page_bytes_; + const size_t total_mib_; + size_t quantum_bytes_; - - size_t l1_bytes_ = 0; - size_t l2_bytes_ = 0; - size_t l3_bytes_ = 0; - - size_t total_mib_; - bool should_bind_ = false; }; diff --git a/util/threading_context.cc b/util/threading_context.cc index 81155c5..90a64d1 100644 --- a/util/threading_context.cc +++ b/util/threading_context.cc @@ -76,7 +76,8 @@ ThreadingContext::ThreadingContext(const ThreadingArgs& args) topology(BoundedSlice(args.skip_packages, args.max_packages), BoundedSlice(args.skip_clusters, args.max_clusters), BoundedSlice(args.skip_lps, args.max_lps)), - allocator(topology, args.bind != Tristate::kFalse), + cache_info(topology), + allocator(topology, cache_info, args.bind != Tristate::kFalse), pools(topology, allocator, args.max_threads, args.pin) { PROFILER_ZONE("Startup.ThreadingContext autotune"); TunePool(pools.AllPackages()); diff --git a/util/threading_context.h b/util/threading_context.h index 6bd6936..41d0811 100644 --- a/util/threading_context.h +++ b/util/threading_context.h @@ -105,7 +105,10 @@ struct ThreadingContext { // will be 1 regardless of the actual system topology. BoundedTopology topology; - // Ctor depends on `topology` for deciding whether to enable NUMA. + // Ctor depends on `topology` for per-cluster cache sizes. + CacheInfo cache_info; + + // Ctor depends on `topology` (for NUMA) and `cache_info` (for step size). Allocator allocator; // Per-package/cluster/within cluster pools of threads, matching `topology`.
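
For reference, the cache-derived blocking bounds that `GenerateCandidates` computes in ops/matmul.cc above can be summarized in a standalone form. The following is a minimal sketch, not part of the patch: `kNR`, `kMaxKC` and the element sizes are assumed placeholder values, and the helper names are invented; the real code uses `sizeof(BF16)`, `sizeof(SfpStream)`, `RoundDownWithFloor`, `kMaxBatchSize` and the detected `CacheInfo` sizes.

#include <algorithm>
#include <cstddef>

namespace sketch {

// Assumed placeholders; the real values come from ops/matmul.h and CacheInfo.
constexpr size_t kNR = 4;        // B rows per tile (= C columns)
constexpr size_t kMaxKC = 8192;  // upper bound on kc
constexpr size_t kBF16 = 2;      // sizeof(BF16)
constexpr size_t kSFP = 1;       // sizeof(SfpStream)

inline size_t DivCeil(size_t a, size_t b) { return (a + b - 1) / b; }
inline size_t RoundDownWithFloor(size_t x, size_t multiple) {
  return std::max(multiple, x - (x % multiple));
}

// kc: how many columns of A (= rows of B) fit in L1 alongside kNR B rows.
// Deliberately an overestimate; the autotuner also tries smaller values.
inline size_t KcMax(size_t K, size_t rows_a, size_t l1_bytes,
                    size_t kc_multiple) {
  const size_t bytes_ab = l1_bytes * (kBF16 + kSFP);
  const size_t col_bytes = rows_a * kBF16 + kNR * kBF16;
  size_t kc = DivCeil(bytes_ab, col_bytes);
  kc = RoundDownWithFloor(std::min(kc, kMaxKC), kc_multiple);
  return std::min(kc, K);
}

// mc: how many A/C rows fit in L2 next to the packed B panel, plus one cache
// line per row for the partial C updates.
inline size_t McMax(size_t M, size_t kc, size_t l2_bytes, size_t line_bytes) {
  const size_t bytes_b = kNR * kc * (kSFP + kBF16);
  const size_t bytes_per_mc = kc * kBF16 + line_bytes;
  // Assumes bytes_b < l2_bytes; the real code asserts mc_max != 0.
  return std::min(DivCeil(l2_bytes - bytes_b, bytes_per_mc), M);
}

// nc: how many B rows (= C columns) fit in L3 alongside the A panel. The real
// code only applies this bound when there is more than one row tile (M > mr).
inline size_t NcMax(size_t N, size_t kc, size_t mc, size_t sizeof_TC,
                    size_t l3_bytes) {
  const size_t bytes_per_nc = kc * kBF16 + mc * sizeof_TC;
  return std::min(DivCeil(l3_bytes, bytes_per_nc), N);
}

}  // namespace sketch

These are upper bounds only; as the comments in GenerateCandidates note (at least for kc), the code then proposes the next few smaller multiples for the autotuner to time.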
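
The split into `CacheInfo` and `Allocator` also introduces an initialization-order dependency inside `ThreadingContext`: C++ constructs members in declaration order, and the new `Allocator` constructor reads `CacheInfo::LineBytes()`/`StepBytes()`. A minimal sketch of that constraint, using stand-in types rather than the real gcpp classes:

#include <cstddef>

namespace sketch {

struct BoundedTopology {};

class CacheInfo {
 public:
  explicit CacheInfo(const BoundedTopology&) {}
  size_t LineBytes() const { return 64; }
  size_t StepBytes() const { return 64; }
};

class Allocator {
 public:
  Allocator(const BoundedTopology&, const CacheInfo& cache, bool /*bind*/)
      // Safe only because `cache` was fully constructed first.
      : line_bytes_(cache.LineBytes()), quantum_bytes_(cache.StepBytes()) {}

 private:
  const size_t line_bytes_;
  size_t quantum_bytes_;
};

struct ThreadingContext {
  explicit ThreadingContext(bool enable_bind)
      : cache_info(topology), allocator(topology, cache_info, enable_bind) {}

  BoundedTopology topology;  // 1) initialized first
  CacheInfo cache_info;      // 2) depends on topology
  Allocator allocator;       // 3) depends on topology and cache_info
};

}  // namespace sketch

If `cache_info` were declared after `allocator`, the `Allocator` constructor would read from a not-yet-constructed `CacheInfo`; keeping the member order in util/threading_context.h as shown in the hunk above avoids that.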