mirror of https://github.com/google/gemma.cpp.git

Add new singleton Allocator2 instead of monostate

Not yet used. Also fix a format-string warning in topology.cc.

PiperOrigin-RevId: 745166210

parent 4e6aa36e9b
commit 5d4f7e0f7e
util/allocator.cc

@@ -15,12 +15,12 @@
#include "util/allocator.h"

#include <stdint.h>
#include <stdio.h>

#include "util/basics.h"  // MaybeCheckInitialized
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/futex.h"
#include "hwy/contrib/thread_pool/topology.h"
#include "hwy/per_target.h"  // VectorBytes

@@ -46,13 +46,32 @@
#endif  // GEMMA_BIND

#if GEMMA_BIND && HWY_OS_LINUX
#include <atomic>

#include "hwy/contrib/thread_pool/futex.h"
#endif

#if HWY_OS_LINUX
#include <unistd.h>  // sysconf
#if GEMMA_BIND
// `move_pages` requires anonymous/private mappings, hence mmap.
#include <sys/mman.h>
#include <sys/syscall.h>

#include <cerrno>
#include <vector>
#endif  // GEMMA_BIND && HWY_OS_LINUX
#endif  // GEMMA_BIND
#elif HWY_OS_WIN
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef VC_EXTRALEAN
#define VC_EXTRALEAN
#endif
#include <Windows.h>
#elif HWY_OS_APPLE
#include <sys/sysctl.h>
#endif  // HWY_OS_LINUX

namespace gcpp {
namespace {
@@ -68,14 +87,47 @@ size_t DetectLineBytes() {

size_t DetectPageSize() {
#if HWY_OS_LINUX
-  size_t page_bytes = static_cast<size_t>(sysconf(_SC_PAGESIZE));
+  const long ret = sysconf(_SC_PAGESIZE);  // NOLINT(runtime/int)
+  HWY_ASSERT(ret != -1);
+  const size_t page_bytes = static_cast<size_t>(ret);
  HWY_ASSERT(page_bytes <= (4 << 20));
  return page_bytes;
#elif HWY_OS_WIN
  SYSTEM_INFO info;
  GetSystemInfo(&info);
  return info.dwPageSize;
#elif HWY_OS_APPLE
  uint64_t data = 0;
  size_t len = sizeof(data);
  HWY_ASSERT(sysctlbyname("vm.pagesize", &data, &len, nullptr, 0) == 0);
  return data;
#else
  return 0;
#endif
}

size_t DetectTotalMiB(size_t page_bytes) {
  (void)page_bytes;
#if HWY_OS_LINUX
  const long ret = sysconf(_SC_PHYS_PAGES);  // NOLINT(runtime/int)
  HWY_ASSERT(ret != -1);
  return static_cast<size_t>(ret) * page_bytes >> 20;
#elif HWY_OS_WIN
  MEMORYSTATUSEX ms = {sizeof(MEMORYSTATUSEX)};
  HWY_ASSERT(GlobalMemoryStatusEx(&ms) != 0);
  return ms.ullTotalPhys >> 20;
#elif HWY_OS_APPLE
  int mib[2] = {CTL_HW, HW_MEMSIZE};
  uint64_t data = 0;
  size_t len = sizeof(data);
  HWY_ASSERT(sysctl(mib, sizeof(mib) / sizeof(*mib), &data, &len, nullptr, 0) ==
             0);
  return data >> 20;
#else
#error "Port"
#endif
}

}  // namespace

static size_t line_bytes_;
@@ -305,4 +357,123 @@ bool Allocator::BindMemory(void* ptr, size_t bytes, size_t node) {
bool Allocator::BindMemory(void*, size_t, size_t) { return false; }
#endif  // GEMMA_BIND && HWY_OS_LINUX

Allocator2::Allocator2(const BoundedTopology& topology, bool enable_bind) {
  line_bytes_ = DetectLineBytes();
  vector_bytes_ = hwy::VectorBytes();
  step_bytes_ = HWY_MAX(line_bytes_, vector_bytes_);
  base_page_bytes_ = DetectPageSize();
  quantum_bytes_ = step_bytes_;  // may overwrite below

  const BoundedTopology::Cluster& cluster = topology.GetCluster(0, 0);
  if (const hwy::Cache* caches = hwy::DataCaches()) {
    l1_bytes_ = caches[1].size_kib << 10;
    l2_bytes_ = caches[2].size_kib << 10;
    l3_bytes_ = (caches[3].size_kib << 10) * caches[3].cores_sharing;
  } else {  // Unknown, make reasonable assumptions.
    l1_bytes_ = 32 << 10;
    l2_bytes_ = (cluster.PrivateKiB() ? cluster.PrivateKiB() : 256) << 10;
  }
  if (l3_bytes_ == 0) {
    l3_bytes_ = (cluster.SharedKiB() ? cluster.SharedKiB() : 1024) << 10;
  }

  total_mib_ = DetectTotalMiB(base_page_bytes_);

  // Prerequisites for binding:
  // - supported by the OS (currently Linux only),
  // - the page size is known and 'reasonably small', preferably less than
  //   a fraction of MatMul row/col sizes, which for 27B are up to 144 KiB.
  // - we successfully detected topology and there are multiple nodes;
  // - there are multiple packages, because we shard by package_idx.
  if constexpr (GEMMA_BIND) {
    if ((base_page_bytes_ != 0 && base_page_bytes_ <= 16 * 1024) &&
        topology.NumNodes() > 1 && topology.NumPackages() > 1) {
      if (enable_bind) {
        // Ensure pages meet the alignment requirements of `AllocBytes`.
        HWY_ASSERT(base_page_bytes_ >= quantum_bytes_);
        quantum_bytes_ = base_page_bytes_;
        // Ensure MaxQuantum() is an upper bound.
        HWY_ASSERT(MaxQuantum<uint8_t>() >= Quantum<uint8_t>());
        should_bind_ = true;
      } else {
        HWY_WARN(
            "Multiple sockets but binding disabled. This reduces speed; "
            "set or remove enable_bind to avoid this warning.");
      }
    }
  }

  HWY_DASSERT(quantum_bytes_ % step_bytes_ == 0);
  quantum_step_mask_ = quantum_bytes_ / step_bytes_ - 1;
}

size_t Allocator2::FreeMiB() const {
#if HWY_OS_LINUX
  const long ret = sysconf(_SC_AVPHYS_PAGES);  // NOLINT(runtime/int)
  HWY_ASSERT(ret != -1);
  return static_cast<size_t>(ret) * base_page_bytes_ >> 20;
#elif HWY_OS_WIN
  MEMORYSTATUSEX ms = {sizeof(MEMORYSTATUSEX)};
  HWY_ASSERT(GlobalMemoryStatusEx(&ms) != 0);
  return ms.ullAvailVirtual >> 20;
#elif HWY_OS_APPLE
  uint64_t free = 0, inactive = 0, speculative = 0;
  size_t len = sizeof(free);
  sysctlbyname("vm.page_free_count", &free, &len, nullptr, 0);
  sysctlbyname("vm.page_inactive_count", &inactive, &len, nullptr, 0);
  sysctlbyname("vm.page_speculative_count", &speculative, &len, nullptr, 0);
  return (free + inactive + speculative) * base_page_bytes_ >> 20;
#else
#error "Port"
#endif
}

Allocator2::PtrAndDeleter Allocator2::AllocBytes(size_t bytes) const {
  // If we are not binding, the Highway allocator is cheaper than `mmap`, and
  // defends against 2K aliasing.
  if (!should_bind_) {
    // Perf warning if Highway's alignment is less than we want.
    if (HWY_ALIGNMENT < QuantumBytes()) {
      HWY_WARN(
          "HWY_ALIGNMENT %d < QuantumBytes %zu: either vector or cache lines "
          "are huge, enable GEMMA_BIND to avoid this warning.",
          HWY_ALIGNMENT, QuantumBytes());
    }
    auto p = hwy::AllocateAligned<uint8_t>(bytes);
    // The `hwy::AlignedFreeUniquePtr` deleter is unfortunately specific to the
    // alignment scheme in aligned_allocator.cc and does not work for
    // already-aligned pointers as returned by `mmap`, hence we wrap the
    // Highway pointer in our own deleter.
    return PtrAndDeleter{p.release(), DeleterFunc2([](void* ptr) {
                           hwy::FreeAlignedBytes(ptr, nullptr, nullptr);
                         })};
  }

  // Binding, or large vector/cache line size: use platform-specific allocator.

#if HWY_OS_LINUX && !defined(__ANDROID_API__)
  // `move_pages` is documented to require an anonymous/private mapping or
  // `MAP_SHARED`. A normal allocation might not suffice, so we use `mmap`.
  // `Init` verified that the page size is a multiple of `QuantumBytes()`.
  const int prot = PROT_READ | PROT_WRITE;
  const int flags = MAP_ANONYMOUS | MAP_PRIVATE;
  const int fd = -1;
  void* p = mmap(0, bytes, prot, flags, fd, off_t{0});
  if (p == MAP_FAILED) p = nullptr;
  return PtrAndDeleter{p, DeleterFunc2([bytes](void* ptr) {
                         HWY_ASSERT(munmap(ptr, bytes) == 0);
                       })};
#elif HWY_OS_WIN
  const size_t alignment = HWY_MAX(vector_bytes_, line_bytes_);
  return PtrAndDeleter{_aligned_malloc(bytes, alignment),
                       DeleterFunc2([](void* ptr) { _aligned_free(ptr); })};
#else
  return PtrAndDeleter{nullptr, DeleterFunc2()};
#endif
}

bool Allocator2::BindMemory(void* ptr, size_t bytes, size_t node) const {
  return Allocator::BindMemory(ptr, bytes, node);
}

}  // namespace gcpp
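
To make the constructor's arithmetic concrete, an illustrative (not measured) example: with 64-byte cache lines and 32-byte vectors, step_bytes_ = HWY_MAX(64, 32) = 64. Without binding, quantum_bytes_ remains 64, so quantum_step_mask_ = 64 / 64 - 1 = 0. With binding enabled on 4 KiB pages, quantum_bytes_ = 4096 and quantum_step_mask_ = 4096 / 64 - 1 = 63.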
util/allocator.h (151 lines changed)
@@ -21,6 +21,7 @@
#include <stddef.h>
#include <stdint.h>

#include <functional>
// IWYU pragma: begin_exports
#include <memory>  // std::unique_ptr
@@ -330,6 +331,156 @@ RowPtr<T> RowPtrFromBatch(RowVectorBatch<T>& row_vectors) {
  return RowPtr<T>(row_vectors.All(), row_vectors.Cols(), row_vectors.Stride());
}

// Custom deleter for types without a dtor, but where the deallocation requires
// state, e.g. a lambda with *by-value* capture.
class DeleterFunc2 {
 public:
  // `MatOwnerT` requires this to be default-constructible.
  DeleterFunc2() = default;

  template <class Closure>
  DeleterFunc2(const Closure& free_closure) : free_func_(free_closure) {}

  template <typename T>
  void operator()(T* p) const {
    free_func_(const_cast<hwy::RemoveConst<T>*>(p));
  }

 private:
  std::function<void(void*)> free_func_;
};

// Wrapper that also calls the destructor for each element being deallocated.
class DeleterDtor2 {
 public:
  DeleterDtor2() {}
  DeleterDtor2(size_t num, DeleterFunc2 free) : num_(num), free_(free) {}

  template <typename T>
  void operator()(T* p) const {
    for (size_t i = 0; i < num_; ++i) {
      p[i].~T();
    }
    free_(p);
  }

 private:
  size_t num_;
  DeleterFunc2 free_;
};

// Unique (move-only) pointer to aligned POD T, which can be an array or class.
template <typename T>
using AlignedPtr2 = std::unique_ptr<T, DeleterFunc2>;
// Unique (move-only) pointer to an aligned array of non-POD T.
template <typename T>
using AlignedClassPtr2 = std::unique_ptr<T, DeleterDtor2>;
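
A minimal sketch of how these pieces compose (not part of this commit; `MakeBuffer` and the malloc/free calls are illustrative stand-ins for the real AllocBytes/munmap path). The closure passed to DeleterFunc2 is captured by value, so the same AlignedPtr2 type can own memory from any allocator:

#include <cstdlib>

#include "util/allocator.h"

gcpp::AlignedPtr2<float[]> MakeBuffer(size_t num) {
  const size_t bytes = num * sizeof(float);
  float* raw = static_cast<float*>(std::malloc(bytes));
  return gcpp::AlignedPtr2<float[]>(
      raw, gcpp::DeleterFunc2([bytes](void* p) {
        // `bytes` is captured by value, mirroring the munmap deleter in
        // allocator.cc, which must remember the mapping size.
        (void)bytes;
        std::free(p);
      }));
}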
// Both allocation, binding, and row accessors depend on the sizes of memory
// pages and cache lines. To avoid having to pass `Allocator2&` everywhere, we
// wrap this in a singleton. A monostate requires explicit initialization,
// which we prefer to avoid because there are many main() functions.
class Allocator2 {
 public:
  // Must be called at least once before any other function. Not thread-safe,
  // hence only call this from the main thread.
  // TODO: remove enable_bind once Gemma tensors support binding.
  Allocator2(const BoundedTopology& topology, bool enable_bind);

  // Bytes per cache line, or a reasonable guess if unknown. Used to choose
  // ranges such that there will be no false sharing.
  size_t LineBytes() const { return line_bytes_; }
  // Bytes per full vector. Used to compute loop steps.
  size_t VectorBytes() const { return vector_bytes_; }
  // Work granularity that avoids false sharing and partial vectors.
  // = HWY_MAX(LineBytes(), VectorBytes())
  size_t StepBytes() const { return step_bytes_; }
  // File size multiple required for memory mapping.
  size_t BasePageBytes() const { return base_page_bytes_; }
  // Either StepBytes or BasePageBytes if NUMA.
  size_t QuantumBytes() const { return quantum_bytes_; }
  template <typename T>
  size_t Quantum() const {
    return QuantumBytes() / sizeof(T);
  }
  // Upper bound on `Quantum()`, for stack allocations.
  template <typename T>
  static constexpr size_t MaxQuantum() {
    return 4096 / sizeof(T);
  }
  // = QuantumBytes() / StepBytes() - 1
  size_t QuantumStepMask() const { return quantum_step_mask_; }

  // L1 and L2 are typically per core.
  size_t L1Bytes() const { return l1_bytes_; }
  size_t L2Bytes() const { return l2_bytes_; }
  // Clusters often share an L3. We return the total size per package.
  size_t L3Bytes() const { return l3_bytes_; }

  size_t TotalMiB() const { return total_mib_; }
  size_t FreeMiB() const;

  // Returns pointer aligned to `QuantumBytes()`.
  template <typename T>
  AlignedPtr2<T[]> Alloc(size_t num) const {
    const size_t bytes = num * sizeof(T);
    // Fail if the `bytes = num * sizeof(T)` computation overflowed.
    HWY_ASSERT(bytes / sizeof(T) == num);

    PtrAndDeleter pd = AllocBytes(bytes);
    return AlignedPtr2<T[]>(static_cast<T*>(pd.p), pd.deleter);
  }

  // Same as Alloc, but calls constructor(s) with `args` and the deleter will
  // call destructor(s).
  template <typename T, class... Args>
  AlignedClassPtr2<T> AllocClasses(size_t num, Args&&... args) const {
    const size_t bytes = num * sizeof(T);
    // Fail if the `bytes = num * sizeof(T)` computation overflowed.
    HWY_ASSERT(bytes / sizeof(T) == num);

    PtrAndDeleter pd = AllocBytes(bytes);
    T* p = static_cast<T*>(pd.p);
    for (size_t i = 0; i < num; ++i) {
      new (p + i) T(std::forward<Args>(args)...);
    }
    return AlignedClassPtr2<T>(p, DeleterDtor2(num, pd.deleter));
  }

  // Returns whether `BindMemory` can/should be called, i.e. we have page-level
  // control over memory placement and multiple packages and NUMA nodes.
  bool ShouldBind() const { return should_bind_; }

  // Attempts to move(!) `[p, p + bytes)` to the given NUMA node, which is
  // typically `BoundedTopology::GetCluster(package_idx, cluster_idx).node`.
  // Writes zeros to SOME of the memory. Only call if `ShouldBind()`.
  // `p` and `bytes` must be multiples of `QuantumBytes()`.
  bool BindMemory(void* p, size_t bytes, size_t node) const;

 private:
  // Type-erased so this can be implemented in allocator.cc.
  struct PtrAndDeleter {
    void* p;
    DeleterFunc2 deleter;
  };
  PtrAndDeleter AllocBytes(size_t bytes) const;

  size_t line_bytes_;
  size_t vector_bytes_;
  size_t step_bytes_;
  size_t base_page_bytes_;
  size_t quantum_bytes_;
  size_t quantum_step_mask_;

  size_t l1_bytes_ = 0;
  size_t l2_bytes_ = 0;
  size_t l3_bytes_ = 0;

  size_t total_mib_;

  bool should_bind_ = false;
};

}  // namespace gcpp

#endif  // THIRD_PARTY_GEMMA_CPP_UTIL_ALLOCATOR_H_
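
A hedged usage sketch of the class above (`ExampleUse`, `buf`, the element count, and binding to node 0 are illustrative choices, not part of the commit):

#include "util/allocator.h"
#include "util/topology.h"

void ExampleUse(const gcpp::BoundedTopology& topology) {
  // Construct once from the main thread; binding is left off here until
  // Gemma tensors support it (see the TODO above).
  gcpp::Allocator2 alloc(topology, /*enable_bind=*/false);

  // Returns a pointer aligned to QuantumBytes(); the deleter frees it via the
  // same mechanism (Highway allocator or mmap) that produced it.
  const size_t num = 1024;
  gcpp::AlignedPtr2<float[]> buf = alloc.Alloc<float>(num);

  if (alloc.ShouldBind()) {
    // Pointer and size must be multiples of QuantumBytes(); the node usually
    // comes from BoundedTopology::GetCluster(package_idx, cluster_idx).node.
    alloc.BindMemory(buf.get(), num * sizeof(float), /*node=*/0);
  }
}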
util/threading_context.cc (new file)

@@ -0,0 +1,63 @@
// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "util/threading_context.h"

#include <memory>
#include <mutex>  // NOLINT

namespace gcpp {

static ThreadingArgs s_args;
// Cannot use magic static because that does not support `Invalidate`, hence
// allocate manually.
static std::unique_ptr<ThreadingContext2> s_ctx;
static std::mutex s_ctx_mutex;

/*static*/ void ThreadingContext2::SetArgs(const ThreadingArgs& args) {
  s_ctx_mutex.lock();
  HWY_ASSERT(!s_ctx);  // Ensure not already initialized, else this is too late.
  s_args = args;
  s_ctx_mutex.unlock();
}

/*static*/ ThreadingContext2& ThreadingContext2::Get() {
  // We do not bother with double-checked locking because it requires an
  // atomic pointer, but we prefer to use unique_ptr for simplicity. Also,
  // callers can cache the result and call less often.
  s_ctx_mutex.lock();
  if (HWY_UNLIKELY(!s_ctx)) {
    s_ctx = std::make_unique<ThreadingContext2>(PrivateToken());
  }
  s_ctx_mutex.unlock();
  return *s_ctx;
}

/*static*/ void ThreadingContext2::ThreadHostileInvalidate() {
  // Deliberately avoid taking the lock so that tsan can warn if this is
  // called concurrently with other calls to `Get`.
  s_ctx.reset();
}

// WARNING: called with `s_ctx_mutex` held. Calling `SetArgs` or `Get` would
// deadlock.
ThreadingContext2::ThreadingContext2(ThreadingContext2::PrivateToken)
    : topology(BoundedSlice(s_args.skip_packages, s_args.max_packages),
               BoundedSlice(s_args.skip_clusters, s_args.max_clusters),
               BoundedSlice(s_args.skip_lps, s_args.max_lps)),
      allocator(topology, s_args.bind != Tristate::kFalse),
      pools(topology, allocator, s_args.max_threads, s_args.pin) {}

}  // namespace gcpp
util/threading_context.h (new file)

@@ -0,0 +1,128 @@
// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_CONTEXT_H_
#define THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_CONTEXT_H_

// Separate component to ensure `threading.cc` does not have access to
// `ThreadingContext`, because that could deadlock.

#include <stddef.h>
#include <stdint.h>

// IWYU pragma: begin_exports
#include "util/allocator.h"
#include "util/args.h"
#include "util/basics.h"  // Tristate
#include "util/threading.h"
#include "util/topology.h"
// IWYU pragma: end_exports

namespace gcpp {

// Optional arguments for `ThreadingContext` from the command line.
class ThreadingArgs : public ArgsBase<ThreadingArgs> {
 public:
  ThreadingArgs(int argc, char* argv[]) { InitAndParse(argc, argv); }
  ThreadingArgs() { Init(); }

  // For BoundedTopology:
  size_t skip_packages;
  size_t max_packages;
  size_t skip_clusters;
  size_t max_clusters;
  size_t skip_lps;
  size_t max_lps;

  Tristate bind;

  // For NestedPools:
  size_t max_threads;  // divided among the detected clusters
  Tristate pin;        // pin threads?
  Tristate spin;       // use spin waits?

  template <class Visitor>
  void ForEach(const Visitor& visitor) {
    // These can be used to partition CPU sockets/packages and their
    // clusters/CCXs across several program instances. The default is to use
    // all available resources.
    visitor(skip_packages, "skip_packages", size_t{0},
            "Index of the first socket to use; default 0 = unlimited.", 2);
    visitor(max_packages, "max_packages", size_t{0},
            "Maximum number of sockets to use; default 0 = unlimited.", 2);
    visitor(skip_clusters, "skip_clusters", size_t{0},
            "Index of the first CCX to use; default 0 = unlimited.", 2);
    visitor(max_clusters, "max_clusters", size_t{0},
            "Maximum number of CCXs to use; default 0 = unlimited.", 2);
    // These are only used when CPU topology is unknown.
    visitor(skip_lps, "skip_lps", size_t{0},
            "Index of the first LP to use; default 0 = unlimited.", 2);
    visitor(max_lps, "max_lps", size_t{0},
            "Maximum number of LPs to use; default 0 = unlimited.", 2);

    // The exact meaning is more subtle: see the comment at NestedPools ctor.
    visitor(max_threads, "num_threads", size_t{0},
            "Maximum number of threads to use; default 0 = unlimited.", 2);
    visitor(pin, "pin", Tristate::kDefault,
            "Pin threads? -1 = auto, 0 = no, 1 = yes.", 2);
    visitor(spin, "spin", Tristate::kDefault,
            "Use spin waits? -1 = auto, 0 = no, 1 = yes.", 2);

    visitor(bind, "bind", Tristate::kDefault,
            "Bind memory to sockets? -1 = auto, 0 = no, 1 = yes.", 2);
  }
};
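
A sketch of how these flags could feed the singleton declared below (a hypothetical main(), not part of this commit):

#include "util/threading_context.h"

int main(int argc, char* argv[]) {
  gcpp::ThreadingArgs args(argc, argv);  // parses the flags registered above
  // Must run before the first Get(); afterwards the arguments would have no
  // effect and SetArgs asserts.
  gcpp::ThreadingContext2::SetArgs(args);
  gcpp::ThreadingContext2& ctx = gcpp::ThreadingContext2::Get();
  (void)ctx.pools;  // topology, allocator and pools are now initialized.
  return 0;
}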
// Lazily-initialized singleton with support for passing in arguments from
// `ThreadingArgs` and re-initializing with different arguments.
class ThreadingContext2 {
  struct PrivateToken {};  // avoids constructing directly

 public:
  // If not called, default arguments are used when `Get` initializes the
  // singleton. Must not be called after `Get`, unless after a call to
  // `ThreadHostileInvalidate`, because otherwise initialization already
  // happened and the arguments would have no effect. Thread-safe, though this
  // is expected to be called early in the program, before threading starts.
  static void SetArgs(const ThreadingArgs& args);

  // Returns a reference to the singleton after initializing it if necessary.
  // When initializing, uses the args passed to `SetArgs`, or defaults.
  //
  // It is safe to call this concurrently with other `Get`, but not with
  // `SetArgs`, because that will warn if called after this, nor with
  // `ThreadHostileInvalidate`, because that will invalidate the reference
  // which callers of this may still be using. Such usage only occurs in
  // tests, hence we prefer not to pull `std::shared_ptr` into the interface.
  //
  // To reduce overhead, callers should cache the result and call less often.
  static ThreadingContext2& Get();

  // Invalidates the singleton before or after a call to `Get`. This allows
  // changing the arguments between tests. Callers must again call `Get`
  // afterwards to obtain an instance. WARNING: must not be called
  // concurrently with other calls to `Get` and usages of its return value.
  static void ThreadHostileInvalidate();

  explicit ThreadingContext2(PrivateToken);  // only called via `Get`.

  BoundedTopology topology;
  Allocator2 allocator;
  NestedPools pools;
};

}  // namespace gcpp

#endif  // THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_CONTEXT_H_
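
For tests that need to vary these arguments, a minimal sketch of the invalidate-then-reinitialize pattern the comments describe (the helper name `ReinitForTest` is illustrative):

#include "util/threading_context.h"

void ReinitForTest(size_t max_threads) {
  // WARNING: thread-hostile; only call when no other thread still uses the
  // previous context or references obtained from Get().
  gcpp::ThreadingContext2::ThreadHostileInvalidate();

  gcpp::ThreadingArgs args;  // defaults from Init()
  args.max_threads = max_threads;
  gcpp::ThreadingContext2::SetArgs(args);
  gcpp::ThreadingContext2& ctx = gcpp::ThreadingContext2::Get();
  (void)ctx;  // run the test body with the freshly configured context.
}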
util/topology.cc

@@ -138,13 +138,13 @@ BoundedTopology::Cluster::Cluster(const LPS& enabled_lps,
      }
      if (HWY_UNLIKELY(private_kib_ != tcluster.private_kib)) {
        warned = true;
-       HWY_WARN("lp %zu private_kib %zu != cluster %zu.", lp, private_kib_,
-                tcluster.private_kib);
+       HWY_WARN("lp %zu private_kib %zu != cluster %u.", lp, private_kib_,
+                static_cast<unsigned>(tcluster.private_kib));
      }
      if (HWY_UNLIKELY(shared_kib_ != tcluster.shared_kib)) {
        warned = true;
-       HWY_WARN("lp %zu shared_kib %zu != cluster %zu.", lp, shared_kib_,
-                tcluster.shared_kib);
+       HWY_WARN("lp %zu shared_kib %zu != cluster %u.", lp, shared_kib_,
+                static_cast<unsigned>(tcluster.shared_kib));
      }
    }  // !warned
  }
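
The warning fixed here arises because %zu requires a size_t argument; when the field has a narrower type, -Wformat fires. A minimal reproduction with hypothetical types (`TCluster` and `Report` are not the actual hwy definitions):

#include <cstdio>

struct TCluster {
  unsigned private_kib;  // hypothetical: not size_t, so %zu would not match
};

void Report(const TCluster& tcluster) {
  // std::printf("private_kib %zu\n", tcluster.private_kib);  // warns: %zu vs unsigned
  std::printf("private_kib %u\n", static_cast<unsigned>(tcluster.private_kib));
}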