diff --git a/evals/benchmark_helper.cc b/evals/benchmark_helper.cc
index 55e99cf..e9fdafb 100644
--- a/evals/benchmark_helper.cc
+++ b/evals/benchmark_helper.cc
@@ -241,8 +241,8 @@ void ShowConfig(const LoaderArgs& loader, const ThreadingArgs& threading,
       dt, cpu100, static_cast<int>(threading.bind),
       ctx.topology.TopologyString(), ctx.pools.PinString(),
       CacheString().c_str(), hwy::TargetName(hwy::DispatchedTarget()),
-      ctx.allocator.VectorBytes() * 8, CompiledConfig(), PROFILER_ENABLED,
-      ctx.allocator.TotalMiB());
+      ctx.cache_info.VectorBytes() * 8, CompiledConfig(),
+      PROFILER_ENABLED, ctx.allocator.TotalMiB());
 }
 
 }
diff --git a/ops/matmul-inl.h b/ops/matmul-inl.h
index 737feb6..65dc185 100644
--- a/ops/matmul-inl.h
+++ b/ops/matmul-inl.h
@@ -21,7 +21,7 @@
 
 #include "compression/types.h"
 #include "ops/matmul.h"  // IWYU pragma: export
-#include "util/allocator.h"
+#include "util/allocator.h"  // CacheInfo
 #include "util/basics.h"
 #include "util/mat.h"
 #include "util/threading_context.h"
@@ -566,7 +566,7 @@ class MMKernel {
 };
 
 // Miscellaneous stateless helper functions.
-struct MMImpl {
+class MMImpl {
   // Returns existing entry for the given key or -1.
   static HWY_INLINE intptr_t IndexOfKey(MMKeys::Key key, const MMKeys& keys) {
     const hwy::Span<const MMKeys::Key> all_keys = keys.Keys();
@@ -596,6 +596,63 @@ struct MMImpl {
     return -1;
   }
 
+ public:
+  static MMPerKey& FindOrAddPerKey(size_t M, size_t K, size_t N,
+                                   size_t vector_bytes,
+                                   MatMulEnv::PerCluster& per_cluster) {
+    const MMKeys::Key key = MMKeys::KeyFromDims(M, K, N);
+    intptr_t index = MMImpl::IndexOfKey(key, per_cluster.keys);
+    // First time we see this shape/key.
+    if (HWY_UNLIKELY(index < 0)) {
+      per_cluster.keys.Append(key, vector_bytes);
+
+      // Invalidates `MMAutoTune::Best()`.
+      std::vector<MMPerKey>& per_keys = per_cluster.per_key;
+      index = per_keys.size();
+      per_keys.push_back(MMPerKey());
+    }
+    return per_cluster.per_key[index];
+  }
+
+  static void NotifyAutotuneResult(size_t M, size_t K, size_t N, uint64_t t0,
+                                   const MMConfig& cfg, MatMulEnv& env,
+                                   MMAutoTune& tuner) {
+    const uint64_t t1 =
+        env.have_timer_stop ? hwy::timer::Stop() : hwy::timer::Start();
+    const double min_elapsed =
+        static_cast<double>(tuner.NotifyTicks(t1 - t0)) /
+        hwy::platform::InvariantTicksPerSecond();
+    const double flops = 2 * M * K * N / min_elapsed;  // * 2 for FMA
+    if (HWY_UNLIKELY(env.print_measurement && tuner.ShouldPrint())) {
+      fprintf(stderr, "%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu\n", flops * 1E-9,
+              min_elapsed * 1E3, cfg.MR(), cfg.MC(), cfg.KC(), cfg.NC(),
+              StringFromOrder(cfg.Order()), cfg.InnerTasks());
+    }
+    if (HWY_UNLIKELY(env.print_best && tuner.Best())) {
+      const auto ratio = [&tuner](uint64_t ticks) -> double {
+        return static_cast<double>(ticks) /
+               static_cast<double>(tuner.BestTicks());
+      };
+      const MMConfig& best = *tuner.Best();
+      fprintf(stderr,
+              "\n%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu,%.2f,%.2f\n",
+              M, K, N, flops * 1E-9, min_elapsed * 1E3, best.MR(), best.MC(),
+              best.KC(), best.NC(), StringFromOrder(best.Order()),
+              best.InnerTasks(), ratio(tuner.WorstMinTicks()),
+              ratio(tuner.FirstConfigTicks()));
+    }
+  }
+
+  static void EnsureAligned(const MatPtr& A, const size_t vector_bytes) {
+    // Ensure A rows are vector-aligned. Neither `Stride` nor `IsPacked` are
+    // reliable: the latter returns true for single rows, and the former may
+    // match `Cols` if the width matches the padding.
+    // Note that B is packed in matmul_test, but otherwise generally padded.
+ HWY_ASSERT(hwy::IsAligned(A.RowBytes(0), vector_bytes)); + if (A.Rows() > 1) { + HWY_ASSERT(hwy::IsAligned(A.RowBytes(1), vector_bytes)); + } + } + static size_t Worker(const MMArgs& args) { return args.options.cluster_idx * args.env->ctx.pools.MaxWorkersPerCluster(); @@ -753,14 +810,14 @@ struct MMImpl { // loops over the inner KC and MC. Member variables avoid long argument lists. class MMState { public: - MMState(const Extents2D A, const size_t B_rows, const MMArgs& args, + MMState(size_t M, size_t K, size_t N, const MMArgs& args, const MMConfig& config) : args_(args), - range_n_(0, B_rows), + range_n_(0, N), mr_(config.MR()), - ranges_mc_(config.RangesOfMC(A.rows)), - ranges_kc_(config.RangesOfKC(A.cols)), - ranges_nc_(config.RangesOfNC(B_rows)), + ranges_mc_(config.RangesOfMC(M)), + ranges_kc_(config.RangesOfKC(K)), + ranges_nc_(config.RangesOfNC(N)), order_(config.Order()), inner_tasks_(config.InnerTasks()) {} @@ -783,7 +840,7 @@ class MMState { // Compute size of per-worker storage for `kNR` row ranges of B. Stack // allocation avoids passing a worker index. static constexpr size_t B_stride_max_ = - kMaxKC + 2 * Allocator::MaxLineBytes() / sizeof(BF16); + kMaxKC + 2 * CacheInfo::MaxLineBytes() / sizeof(BF16); static constexpr size_t B_storage_max_ = kNR * B_stride_max_; // Granularity of `ForN`. B rows produce C columns, so we @@ -1056,88 +1113,48 @@ HWY_NOINLINE MMPerKey* MatMul(const MatPtrT& A, const MatPtrT& B, options.cluster_idx * env.ctx.pools.MaxWorkersPerCluster(), zone); - const Allocator& allocator = env.ctx.allocator; HWY_DASSERT(options.cluster_idx < env.row_ptrs.size()); - MatMulEnv::PerCluster& per_cluster = env.per_cluster[options.cluster_idx]; RowPtrs C_rows = GetOrSetTempRowPtrs(C, env.row_ptrs[options.cluster_idx]); const size_t M = A.Rows(); const size_t K = A.Cols(); const size_t N = B.Rows(); - const MMKeys::Key key = MMKeys::KeyFromDims(M, K, N); - intptr_t index = MMImpl::IndexOfKey(key, per_cluster.keys); - // First time we see this shape/key. - if (HWY_UNLIKELY(index < 0)) { - per_cluster.keys.Append(key, allocator); - // invalidates `MMAutoTune::Best()` - std::vector& per_keys = per_cluster.per_key; - index = per_keys.size(); - per_keys.push_back(MMPerKey()); - } - MMPerKey& per_key = per_cluster.per_key[index]; + const CacheInfo& cache = env.ctx.cache_info; + MMPerKey& per_key = MMImpl::FindOrAddPerKey( + M, K, N, cache.VectorBytes(), env.per_cluster[options.cluster_idx]); MMAutoTune& tuner = per_key.autotune; const MMArgs args(env, per_key, static_cast(A.Scale()) * B.Scale(), add, options); if (HWY_LIKELY(tuner.Best())) { - const MMState state(A.Extents(), B.Rows(), args, *tuner.Best()); + const MMState state(M, K, N, args, *tuner.Best()); const StridedViewBF A_view = MMImpl::MaybeDecompressA(A, args); state.DispatchParallelism(A_view, B, C_rows); return &per_key; } - // From here, CPU time is negligible except DoMatMul. - - // First call: enumerate all feasible configs. + // Autotuning, first call: enumerate all feasible configs. if (HWY_UNLIKELY(!tuner.HasCandidates())) { - // Ensure matrix dimensions match each other. + // Ensure matrix dimensions match each other (off the hot path). HWY_ASSERT(K == B.Cols()); HWY_ASSERT(M <= kMaxBatchSize); HWY_ASSERT(K <= MMStorage::kMaxK); HWY_ASSERT(N % kNR == 0); - // Ensure A rows are vector-aligned. Neither `Stride` nor `IsPacked` are - // reliable: the latter returns true for single rows, and the former may - // match `Cols` if the width matches the padding. 
- // Note that B is packed in matmul_test, but otherwise generally padded. - HWY_ASSERT(hwy::IsAligned(A.Row(0), env.ctx.allocator.LineBytes())); - if (A.Rows() > 1) { - HWY_ASSERT(hwy::IsAligned(A.Row(1), env.ctx.allocator.LineBytes())); - } - - tuner.SetCandidates(MMCandidates(allocator, M, K, N, sizeof(TC), kMaxMR, - kNR, env.print_config)); + MMImpl::EnsureAligned(A, cache.VectorBytes()); + tuner.SetCandidates( + MMCandidates(cache, M, K, N, sizeof(TC), env.print_config)); } + // (Also auto-tunes, hence outside the timed section to prevent interference.) + const StridedViewBF A_view = MMImpl::MaybeDecompressA(A, args); + const MMConfig& cfg = tuner.NextConfig(); const uint64_t t0 = hwy::timer::Start(); - MMState state(A.Extents(), B.Rows(), args, cfg); - const StridedViewBF A_view = MMImpl::MaybeDecompressA(A, args); + MMState state(M, K, N, args, cfg); state.DispatchParallelism(A_view, B, C_rows); - const uint64_t t1 = - env.have_timer_stop ? hwy::timer::Stop() : hwy::timer::Start(); - const double min_elapsed = static_cast(tuner.NotifyTicks(t1 - t0)) / - hwy::platform::InvariantTicksPerSecond(); - const double flops = 2 * M * K * N / min_elapsed; // * 2 for FMA - if (HWY_UNLIKELY(env.print_measurement && tuner.ShouldPrint())) { - fprintf(stderr, "%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu\n", flops * 1E-9, - min_elapsed * 1E3, cfg.MR(), cfg.MC(), cfg.KC(), cfg.NC(), - StringFromOrder(cfg.Order()), cfg.InnerTasks()); - } - if (HWY_UNLIKELY(env.print_best && tuner.Best())) { - const auto ratio = [per_key](uint64_t ticks) -> double { - return static_cast(ticks) / - static_cast(per_key.autotune.BestTicks()); - }; - const MMConfig& best = *tuner.Best(); - fprintf(stderr, - "\n%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu,%.2f,%.2f\n", M, - K, N, flops * 1E-9, min_elapsed * 1E3, best.MR(), best.MC(), - best.KC(), best.NC(), StringFromOrder(best.Order()), - best.InnerTasks(), ratio(tuner.WorstMinTicks()), - ratio(tuner.FirstConfigTicks())); - } + MMImpl::NotifyAutotuneResult(M, K, N, t0, cfg, env, tuner); return &per_key; } diff --git a/ops/matmul.cc b/ops/matmul.cc index 35887a5..00330e5 100644 --- a/ops/matmul.cc +++ b/ops/matmul.cc @@ -62,22 +62,19 @@ size_t PrevDivisor(const size_t begin, const size_t end, const size_t dim, // and holds most of their arguments in member variables. class GenerateCandidates { public: - GenerateCandidates(const Allocator& allocator, size_t M, size_t K, size_t N, - size_t sizeof_TC, size_t max_mr, size_t nr, - bool print_config) - : allocator_(allocator), + GenerateCandidates(const CacheInfo& cache, size_t M, size_t K, size_t N, + size_t sizeof_TC, bool print_config) + : cache_(cache), M_(M), K_(K), N_(N), sizeof_TC_(sizeof_TC), - max_mr_(max_mr), - nr_(nr), // These influence kc/nc, but are also stored in `MMConfig` for // `RangesOf*`. Must be a vector multiple. The previous/next cache line // is likely still in L1, but we expect K > 1000 and might as well round // up to the line size. Both A and B are BF16. - kc_multiple_(HWY_MIN(K, allocator.LineBytes() / sizeof(BF16))), - nc_multiple_(allocator.StepBytes() / sizeof_TC), + kc_multiple_(HWY_MIN(K, cache.LineBytes() / sizeof(BF16))), + nc_multiple_(cache.StepBytes() / sizeof_TC), print_config_(print_config) {} std::vector operator()() const { @@ -127,10 +124,10 @@ class GenerateCandidates { SizeVec all_mr; all_mr.reserve(3); // AVX2's 16 registers are not enough for four rows, but SSE4 may benefit. 
- if (M_ >= max_mr_ && !is_avx2) all_mr.push_back(max_mr_); + if (M_ >= kMaxMR && !is_avx2) all_mr.push_back(kMaxMR); // Allow for AVX-512 but not SSE4 (for which 4 are usually better). Also // enable if not enough rows for 4. - if (M_ >= 2 && (M_ < max_mr_ || (!is_sse && !is_wasm))) { + if (M_ >= 2 && (M_ < kMaxMR || (!is_sse && !is_wasm))) { all_mr.push_back(size_t{2}); } // Even SSE4 usually prefers 2 rows; only enable for single rows. @@ -172,8 +169,8 @@ class GenerateCandidates { // size. This results in an overestimate, and the loop below will propose // the next few smaller values for the autotuner to evaluate. const size_t bytes_ab = - allocator_.L1Bytes() * (sizeof(BF16) + sizeof(SfpStream)); - const size_t col_bytes = rows_a * sizeof(BF16) + nr_ * sizeof(BF16); + cache_.L1Bytes() * (sizeof(BF16) + sizeof(SfpStream)); + const size_t col_bytes = rows_a * sizeof(BF16) + kNR * sizeof(BF16); size_t kc_max = hwy::DivCeil(bytes_ab, col_bytes); kc_max = RoundDownWithFloor(HWY_MIN(kc_max, kMaxKC), kc_multiple_); kc_max = HWY_MIN(kc_max, K_); @@ -213,14 +210,14 @@ class GenerateCandidates { SizeVec MC(size_t mr, size_t kc, MMOrder order) const { // Typically 12-24K. The B rows are pinned in L1, but also occupy L2 because // it is typically inclusive. - const size_t bytes_b = nr_ * kc * (sizeof(SfpStream) + sizeof(BF16)); + const size_t bytes_b = kNR * kc * (sizeof(SfpStream) + sizeof(BF16)); // Choose the largest feasible `mc_max` (A/C rows) to maximize reuse of the // packed B. We want `mc * kc` elements of A to fit in L2, alongside // `bytes_b` plus `mc` cache lines because resident-A updates `mc` rows of // partial. - const size_t bytes_per_mc = kc * sizeof(BF16) + allocator_.LineBytes(); - size_t mc_max = hwy::DivCeil(allocator_.L2Bytes() - bytes_b, bytes_per_mc); + const size_t bytes_per_mc = kc * sizeof(BF16) + cache_.LineBytes(); + size_t mc_max = hwy::DivCeil(cache_.L2Bytes() - bytes_b, bytes_per_mc); mc_max = HWY_MIN(mc_max, kMaxBatchSize); HWY_DASSERT(mc_max != 0); mc_max = HWY_MIN(mc_max, M_); @@ -261,7 +258,7 @@ class GenerateCandidates { // Otherwise, leave it unbounded. if (M_ > mr) { const size_t bytes_per_nc = (kc * sizeof(BF16) + mc * sizeof_TC_); - nc_max = HWY_MIN(hwy::DivCeil(allocator_.L3Bytes(), bytes_per_nc), N_); + nc_max = HWY_MIN(hwy::DivCeil(cache_.L3Bytes(), bytes_per_nc), N_); } HWY_DASSERT(nc_max != 0); nc_max = RoundDownWithFloor(nc_max, nc_multiple_); @@ -328,15 +325,12 @@ class GenerateCandidates { return inner_tasks; } - const Allocator& allocator_; + const CacheInfo& cache_; const size_t M_; const size_t K_; const size_t N_; const size_t sizeof_TC_; - const size_t max_mr_; - const size_t nr_; - const size_t kc_multiple_; const size_t nc_multiple_; @@ -346,12 +340,10 @@ class GenerateCandidates { } // namespace // Facade to avoid exposing `GenerateCandidates` in the header. 
-std::vector<MMConfig> MMCandidates(const Allocator& allocator, size_t M,
-                                   size_t K, size_t N, size_t sizeof_TC,
-                                   size_t max_mr, size_t nr,
+std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t M, size_t K,
+                                   size_t N, size_t sizeof_TC,
                                    bool print_config) {
-  return GenerateCandidates(allocator, M, K, N, sizeof_TC, max_mr, nr,
-                            print_config)();
+  return GenerateCandidates(cache, M, K, N, sizeof_TC, print_config)();
 }
 
 MatMulEnv::MatMulEnv(ThreadingContext& ctx) : ctx(ctx) {
diff --git a/ops/matmul.h b/ops/matmul.h
index 8c7d724..641dad9 100644
--- a/ops/matmul.h
+++ b/ops/matmul.h
@@ -477,9 +477,9 @@ class MMConfig {
 static_assert(sizeof(MMConfig) == 32);  // for faster indexing
 #pragma pack(pop)
 
-std::vector<MMConfig> MMCandidates(const Allocator& allocator, size_t M,
-                                   size_t K, size_t N, size_t sizeof_TC,
-                                   size_t max_mr, size_t nr, bool print_config);
+std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t M, size_t K,
+                                   size_t N, size_t sizeof_TC,
+                                   bool print_config);
 
 // State machine for choosing the best `TConfig`, which is `MMConfig` for the
 // main MatMul autotuner.
@@ -619,11 +619,11 @@ class MMKeys {
   }
 
   // Must only be called if not already present in `Keys()`.
-  void Append(Key key, const Allocator& allocator) {
+  void Append(Key key, size_t vector_bytes) {
     // Dynamic allocation because the test checks many more dimensions than
     // would be reasonable to pre-allocate. DIY for alignment and padding.
     if (HWY_UNLIKELY(num_unique_ >= capacity_)) {
-      const size_t NU64 = allocator.VectorBytes() / sizeof(Key);
+      const size_t NU64 = vector_bytes / sizeof(Key);
       // Start at one vector so the size is always a multiple of N.
       if (HWY_UNLIKELY(capacity_ == 0)) {
         capacity_ = hwy::DivCeil(NU64, 2);  // will be doubled below
@@ -704,7 +704,7 @@ struct MMArgs {
         scale(scale),
         add(add),
         options(options),
-        line_bytes(env.ctx.allocator.LineBytes()) {}
+        line_bytes(env.ctx.cache_info.LineBytes()) {}
 
   MatMulEnv* env;
   MMPerKey* per_key;
diff --git a/util/allocator.cc b/util/allocator.cc
index f8bfdd5..f99586e 100644
--- a/util/allocator.cc
+++ b/util/allocator.cc
@@ -130,7 +130,7 @@ size_t DetectTotalMiB(size_t page_bytes) {
 
 }  // namespace
 
-Allocator::Allocator(const BoundedTopology& topology, bool enable_bind) {
+CacheInfo::CacheInfo(const BoundedTopology& topology) {
   line_bytes_ = DetectLineBytes();
   // Ensure MaxLineBytes() is an upper bound.
   HWY_ASSERT(MaxLineBytes() >= LineBytes());
@@ -138,8 +138,6 @@ Allocator::Allocator(const BoundedTopology& topology, bool enable_bind) {
   vector_bytes_ = hwy::VectorBytes();
   step_bytes_ = HWY_MAX(line_bytes_, vector_bytes_);
 
-  base_page_bytes_ = DetectPageSize();
-  quantum_bytes_ = step_bytes_;  // may overwrite below
 
   const BoundedTopology::Cluster& cluster = topology.GetCluster(0, 0);
   if (const hwy::Cache* caches = hwy::DataCaches()) {
@@ -153,8 +151,14 @@ Allocator::Allocator(const BoundedTopology& topology, bool enable_bind) {
   if (l3_bytes_ == 0) {
    l3_bytes_ = (cluster.SharedKiB() ?
cluster.SharedKiB() : 1024) << 10; } +} - total_mib_ = DetectTotalMiB(base_page_bytes_); +Allocator::Allocator(const BoundedTopology& topology, + const CacheInfo& cache_info, bool enable_bind) + : line_bytes_(cache_info.LineBytes()), + base_page_bytes_(DetectPageSize()), + total_mib_(DetectTotalMiB(base_page_bytes_)) { + quantum_bytes_ = cache_info.StepBytes(); // may overwrite below // Prerequisites for binding: // - supported by the OS (currently Linux only), diff --git a/util/allocator.h b/util/allocator.h index 42e261c..086b6e9 100644 --- a/util/allocator.h +++ b/util/allocator.h @@ -77,27 +77,49 @@ using AlignedPtr = std::unique_ptr; template using AlignedClassPtr = std::unique_ptr; -// Both allocation, binding, and row accessors depend on the sizes of memory -// pages and cache lines. To avoid having to pass `Allocator&` everywhere, we -// wrap this in a singleton. A monostate requires explicit initialization, -// which we prefer to avoid because there are many main() functions. -class Allocator { +// Holds cache line size/capacity and vector size. Stored in `ThreadingContext`. +class CacheInfo { public: - // Must be called at least once before any other function. Not thread-safe, - // hence only call this from the main thread. - Allocator(const BoundedTopology& topology, bool enable_bind); + CacheInfo(const BoundedTopology& topology); // Bytes per cache line, or a reasonable guess if unknown. Used to choose // ranges such that there will be no false sharing. size_t LineBytes() const { return line_bytes_; } // Upper bound on `LineBytes()`, for stack allocations. static constexpr size_t MaxLineBytes() { return 256; } + // Bytes per full vector. Used to compute loop steps. size_t VectorBytes() const { return vector_bytes_; } // Work granularity that avoids false sharing and partial vectors. // = HWY_MAX(LineBytes(), VectorBytes()) size_t StepBytes() const { return step_bytes_; } + // L1 and L2 are typically per core. + size_t L1Bytes() const { return l1_bytes_; } + size_t L2Bytes() const { return l2_bytes_; } + // Clusters often share an L3. We return the total size per package. + size_t L3Bytes() const { return l3_bytes_; } + + private: + size_t line_bytes_; + size_t vector_bytes_; + size_t step_bytes_; + + size_t l1_bytes_ = 0; + size_t l2_bytes_ = 0; + size_t l3_bytes_ = 0; +}; + +// NUMA-aware allocation and memory binding. Stored in `ThreadingContext`. +class Allocator { + public: + Allocator(const BoundedTopology& topology, const CacheInfo& cache_info, + bool enable_bind); + + // Used by `AllocateFor`, which only takes an `Allocator` argument, + // hence copy from `CacheInfo`. + size_t LineBytes() const { return line_bytes_; } + // File size multiple required for memory mapping. Also used when binding // memory to NUMA nodes (see `BindB/BindC`). size_t BasePageBytes() const { return base_page_bytes_; } @@ -105,12 +127,6 @@ class Allocator { // Desired allocator alignment: Either StepBytes, or BasePageBytes if NUMA. size_t QuantumBytes() const { return quantum_bytes_; } - // L1 and L2 are typically per core. - size_t L1Bytes() const { return l1_bytes_; } - size_t L2Bytes() const { return l2_bytes_; } - // Clusters often share an L3. We return the total size per package. 
- size_t L3Bytes() const { return l3_bytes_; } - size_t TotalMiB() const { return total_mib_; } size_t FreeMiB() const; @@ -159,18 +175,11 @@ class Allocator { bool BindMemory(void* p, size_t bytes, size_t node) const; private: - size_t line_bytes_; - size_t vector_bytes_; - size_t step_bytes_; - size_t base_page_bytes_; + const size_t line_bytes_; + const size_t base_page_bytes_; + const size_t total_mib_; + size_t quantum_bytes_; - - size_t l1_bytes_ = 0; - size_t l2_bytes_ = 0; - size_t l3_bytes_ = 0; - - size_t total_mib_; - bool should_bind_ = false; }; diff --git a/util/threading_context.cc b/util/threading_context.cc index 81155c5..90a64d1 100644 --- a/util/threading_context.cc +++ b/util/threading_context.cc @@ -76,7 +76,8 @@ ThreadingContext::ThreadingContext(const ThreadingArgs& args) topology(BoundedSlice(args.skip_packages, args.max_packages), BoundedSlice(args.skip_clusters, args.max_clusters), BoundedSlice(args.skip_lps, args.max_lps)), - allocator(topology, args.bind != Tristate::kFalse), + cache_info(topology), + allocator(topology, cache_info, args.bind != Tristate::kFalse), pools(topology, allocator, args.max_threads, args.pin) { PROFILER_ZONE("Startup.ThreadingContext autotune"); TunePool(pools.AllPackages()); diff --git a/util/threading_context.h b/util/threading_context.h index 6bd6936..41d0811 100644 --- a/util/threading_context.h +++ b/util/threading_context.h @@ -105,7 +105,10 @@ struct ThreadingContext { // will be 1 regardless of the actual system topology. BoundedTopology topology; - // Ctor depends on `topology` for deciding whether to enable NUMA. + // Ctor depends on `topology` for per-cluster cache sizes. + CacheInfo cache_info; + + // Ctor depends on `topology` (for NUMA) and `cache_info` (for step size). Allocator allocator; // Per-package/cluster/within cluster pools of threads, matching `topology`.
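
For reference, the cache-derived blocking bounds that `GenerateCandidates` computes in ops/matmul.cc above can be summarized in a standalone form. The following is a minimal sketch, not part of the patch: `kNR`, `kMaxKC` and the element sizes are assumed placeholder values, and the helper names are invented; the real code uses `sizeof(BF16)`, `sizeof(SfpStream)`, `RoundDownWithFloor`, `kMaxBatchSize` and the detected `CacheInfo` sizes.

#include <algorithm>
#include <cstddef>

namespace sketch {

// Assumed placeholders; the real values come from ops/matmul.h and CacheInfo.
constexpr size_t kNR = 4;        // B rows per tile (= C columns)
constexpr size_t kMaxKC = 8192;  // upper bound on kc
constexpr size_t kBF16 = 2;      // sizeof(BF16)
constexpr size_t kSFP = 1;       // sizeof(SfpStream)

inline size_t DivCeil(size_t a, size_t b) { return (a + b - 1) / b; }
inline size_t RoundDownWithFloor(size_t x, size_t multiple) {
  return std::max(multiple, x - (x % multiple));
}

// kc: how many columns of A (= rows of B) fit in L1 alongside kNR B rows.
// Deliberately an overestimate; the autotuner also tries smaller values.
inline size_t KcMax(size_t K, size_t rows_a, size_t l1_bytes,
                    size_t kc_multiple) {
  const size_t bytes_ab = l1_bytes * (kBF16 + kSFP);
  const size_t col_bytes = rows_a * kBF16 + kNR * kBF16;
  size_t kc = DivCeil(bytes_ab, col_bytes);
  kc = RoundDownWithFloor(std::min(kc, kMaxKC), kc_multiple);
  return std::min(kc, K);
}

// mc: how many A/C rows fit in L2 next to the packed B panel, plus one cache
// line per row for the partial C updates.
inline size_t McMax(size_t M, size_t kc, size_t l2_bytes, size_t line_bytes) {
  const size_t bytes_b = kNR * kc * (kSFP + kBF16);
  const size_t bytes_per_mc = kc * kBF16 + line_bytes;
  // Assumes bytes_b < l2_bytes; the real code asserts mc_max != 0.
  return std::min(DivCeil(l2_bytes - bytes_b, bytes_per_mc), M);
}

// nc: how many B rows (= C columns) fit in L3 alongside the A panel. The real
// code only applies this bound when there is more than one row tile (M > mr).
inline size_t NcMax(size_t N, size_t kc, size_t mc, size_t sizeof_TC,
                    size_t l3_bytes) {
  const size_t bytes_per_nc = kc * kBF16 + mc * sizeof_TC;
  return std::min(DivCeil(l3_bytes, bytes_per_nc), N);
}

}  // namespace sketch

These are upper bounds only; as the comments in GenerateCandidates note (at least for kc), the code then proposes the next few smaller multiples for the autotuner to time.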
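
The split into `CacheInfo` and `Allocator` also introduces an initialization-order dependency inside `ThreadingContext`: C++ constructs members in declaration order, and the new `Allocator` constructor reads `CacheInfo::LineBytes()`/`StepBytes()`. A minimal sketch of that constraint, using stand-in types rather than the real gcpp classes:

#include <cstddef>

namespace sketch {

struct BoundedTopology {};

class CacheInfo {
 public:
  explicit CacheInfo(const BoundedTopology&) {}
  size_t LineBytes() const { return 64; }
  size_t StepBytes() const { return 64; }
};

class Allocator {
 public:
  Allocator(const BoundedTopology&, const CacheInfo& cache, bool /*bind*/)
      // Safe only because `cache` was fully constructed first.
      : line_bytes_(cache.LineBytes()), quantum_bytes_(cache.StepBytes()) {}

 private:
  const size_t line_bytes_;
  size_t quantum_bytes_;
};

struct ThreadingContext {
  explicit ThreadingContext(bool enable_bind)
      : cache_info(topology), allocator(topology, cache_info, enable_bind) {}

  BoundedTopology topology;  // 1) initialized first
  CacheInfo cache_info;      // 2) depends on topology
  Allocator allocator;       // 3) depends on topology and cache_info
};

}  // namespace sketch

If `cache_info` were declared after `allocator`, the `Allocator` constructor would read from a not-yet-constructed `CacheInfo`; keeping the member order in util/threading_context.h as shown in the hunk above avoids that.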