Add new singleton Allocator2 instead of monostate

Not yet used.

Also fix format-string warning in topology.cc.

PiperOrigin-RevId: 745166210
Jan Wassenberg 2025-04-08 09:00:18 -07:00 committed by Copybara-Service
parent 4e6aa36e9b
commit 5d4f7e0f7e
5 changed files with 520 additions and 7 deletions
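For context on "instead of monostate": a minimal sketch, not part of this commit, contrasting the two patterns (names are illustrative). The actual `ThreadingContext2` below deliberately avoids a C++ magic static so that it can also support `ThreadHostileInvalidate`.

  #include <cstddef>

  // Monostate: all state is static, so every main() must remember to call
  // Init() before the first use.
  struct MonostateAllocator {
    static void Init() { line_bytes = 64; }  // explicit initialization required
    static inline size_t line_bytes = 0;     // C++17 inline static member
  };

  // Lazily-initialized singleton: the first Get() constructs the instance,
  // so no caller has to remember an explicit Init().
  class SingletonAllocator {
   public:
    static SingletonAllocator& Get() {
      static SingletonAllocator instance;  // constructed on first use
      return instance;
    }
    size_t LineBytes() const { return line_bytes_; }
   private:
    SingletonAllocator() : line_bytes_(64) {}
    size_t line_bytes_;
  };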

util/allocator.cc

@@ -15,12 +15,12 @@
#include "util/allocator.h"
#include <stdint.h>
#include <stdio.h>
#include "util/basics.h" // MaybeCheckInitialized
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
-#include "hwy/contrib/thread_pool/futex.h"
#include "hwy/contrib/thread_pool/topology.h"
#include "hwy/per_target.h" // VectorBytes
@@ -46,13 +46,32 @@
#endif // GEMMA_BIND
#if GEMMA_BIND && HWY_OS_LINUX
#include <atomic>
#include "hwy/contrib/thread_pool/futex.h"
#endif
#if HWY_OS_LINUX
#include <unistd.h> // sysconf
#if GEMMA_BIND
// `move_pages` requires anonymous/private mappings, hence mmap.
#include <sys/mman.h>
#include <sys/syscall.h>
#include <cerrno>
#include <vector>
-#endif // GEMMA_BIND && HWY_OS_LINUX
+#endif // GEMMA_BIND
#elif HWY_OS_WIN
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef VC_EXTRALEAN
#define VC_EXTRALEAN
#endif
#include <Windows.h>
#elif HWY_OS_APPLE
#include <sys/sysctl.h>
#endif // HWY_OS_LINUX
namespace gcpp {
namespace {
@@ -68,14 +87,47 @@ size_t DetectLineBytes() {
size_t DetectPageSize() {
#if HWY_OS_LINUX
-size_t page_bytes = static_cast<size_t>(sysconf(_SC_PAGESIZE));
+const long ret = sysconf(_SC_PAGESIZE);  // NOLINT(runtime/int)
+HWY_ASSERT(ret != -1);
+const size_t page_bytes = static_cast<size_t>(ret);
HWY_ASSERT(page_bytes <= (4 << 20));
return page_bytes;
#elif HWY_OS_WIN
SYSTEM_INFO info;
GetSystemInfo(&info);
return info.dwPageSize;
#elif HWY_OS_APPLE
uint64_t data = 0;
size_t len = sizeof(data);
HWY_ASSERT(sysctlbyname("vm.pagesize", &data, &len, nullptr, 0) == 0);
return data;
#else
return 0;
#endif
}
size_t DetectTotalMiB(size_t page_bytes) {
(void)page_bytes;
#if HWY_OS_LINUX
const long ret = sysconf(_SC_PHYS_PAGES); // NOLINT(runtime/int)
HWY_ASSERT(ret != -1);
return static_cast<size_t>(ret) * page_bytes >> 20;
#elif HWY_OS_WIN
MEMORYSTATUSEX ms = {sizeof(MEMORYSTATUSEX)};
HWY_ASSERT(GlobalMemoryStatusEx(&ms) != 0);
return ms.ullTotalPhys >> 20;
#elif HWY_OS_APPLE
int mib[2] = {CTL_HW, HW_MEMSIZE};
uint64_t data = 0;
size_t len = sizeof(data);
HWY_ASSERT(sysctl(mib, sizeof(mib) / sizeof(*mib), &data, &len, nullptr, 0) ==
0);
return data >> 20;
#else
#error "Port"
#endif
}
} // namespace
static size_t line_bytes_;
@@ -305,4 +357,123 @@ bool Allocator::BindMemory(void* ptr, size_t bytes, size_t node) {
bool Allocator::BindMemory(void*, size_t, size_t) { return false; }
#endif // GEMMA_BIND && HWY_OS_LINUX
Allocator2::Allocator2(const BoundedTopology& topology, bool enable_bind) {
line_bytes_ = DetectLineBytes();
vector_bytes_ = hwy::VectorBytes();
step_bytes_ = HWY_MAX(line_bytes_, vector_bytes_);
base_page_bytes_ = DetectPageSize();
quantum_bytes_ = step_bytes_; // may be overwritten below
const BoundedTopology::Cluster& cluster = topology.GetCluster(0, 0);
if (const hwy::Cache* caches = hwy::DataCaches()) {
l1_bytes_ = caches[1].size_kib << 10;
l2_bytes_ = caches[2].size_kib << 10;
l3_bytes_ = (caches[3].size_kib << 10) * caches[3].cores_sharing;
} else { // Unknown, make reasonable assumptions.
l1_bytes_ = 32 << 10;
l2_bytes_ = (cluster.PrivateKiB() ? cluster.PrivateKiB() : 256) << 10;
}
if (l3_bytes_ == 0) {
l3_bytes_ = (cluster.SharedKiB() ? cluster.SharedKiB() : 1024) << 10;
}
total_mib_ = DetectTotalMiB(base_page_bytes_);
// Prerequisites for binding:
// - supported by the OS (currently Linux only),
// - the page size is known and 'reasonably small', preferably a small
// fraction of MatMul row/col sizes, which for 27B are up to 144 KiB;
// - we successfully detected topology and there are multiple nodes;
// - there are multiple packages, because we shard by package_idx.
if constexpr (GEMMA_BIND) {
if ((base_page_bytes_ != 0 && base_page_bytes_ <= 16 * 1024) &&
topology.NumNodes() > 1 && topology.NumPackages() > 1) {
if (enable_bind) {
// Ensure pages meet the alignment requirements of `AllocBytes`.
HWY_ASSERT(base_page_bytes_ >= quantum_bytes_);
quantum_bytes_ = base_page_bytes_;
// Ensure MaxQuantum() is an upper bound.
HWY_ASSERT(MaxQuantum<uint8_t>() >= Quantum<uint8_t>());
should_bind_ = true;
} else {
HWY_WARN(
"Multiple sockets but binding disabled. This reduces speed; "
"set or remove enable_bind to avoid this warning.");
}
}
}
HWY_DASSERT(quantum_bytes_ % step_bytes_ == 0);
quantum_step_mask_ = quantum_bytes_ / step_bytes_ - 1;
}
size_t Allocator2::FreeMiB() const {
#if HWY_OS_LINUX
const long ret = sysconf(_SC_AVPHYS_PAGES); // NOLINT(runtime/int)
HWY_ASSERT(ret != -1);
return static_cast<size_t>(ret) * base_page_bytes_ >> 20;
#elif HWY_OS_WIN
MEMORYSTATUSEX ms = {sizeof(MEMORYSTATUSEX)};
HWY_ASSERT(GlobalMemoryStatusEx(&ms) != 0);
return ms.ullAvailVirtual >> 20;
#elif HWY_OS_APPLE
uint64_t free = 0, inactive = 0, speculative = 0;
size_t len = sizeof(free);
sysctlbyname("vm.page_free_count", &free, &len, nullptr, 0);
sysctlbyname("vm.page_inactive_count", &inactive, &len, nullptr, 0);
sysctlbyname("vm.page_speculative_count", &speculative, &len, nullptr, 0);
return (free + inactive + speculative) * base_page_bytes_ >> 20;
#else
#error "Port"
#endif
}
Allocator2::PtrAndDeleter Allocator2::AllocBytes(size_t bytes) const {
// If we are not binding, the Highway allocator is cheaper than `mmap`, and
// defends against 2K aliasing.
if (!should_bind_) {
// Perf warning if Highway's alignment is less than we want.
if (HWY_ALIGNMENT < QuantumBytes()) {
HWY_WARN(
"HWY_ALIGNMENT %d < QuantumBytes %zu: either vector or cache lines "
"are huge, enable GEMMA_BIND to avoid this warning.",
HWY_ALIGNMENT, QuantumBytes());
}
auto p = hwy::AllocateAligned<uint8_t>(bytes);
// The `hwy::AlignedFreeUniquePtr` deleter is unfortunately specific to the
// alignment scheme in aligned_allocator.cc and does not work for
// already-aligned pointers as returned by `mmap`, hence we wrap the Highway
// pointer in our own deleter.
return PtrAndDeleter{p.release(), DeleterFunc2([](void* ptr) {
hwy::FreeAlignedBytes(ptr, nullptr, nullptr);
})};
}
// Binding, or large vector/cache line size: use platform-specific allocator.
#if HWY_OS_LINUX && !defined(__ANDROID_API__)
// `move_pages` is documented to require an anonymous/private mapping or
// `MAP_SHARED`. A normal allocation might not suffice, so we use `mmap`.
// `Init` verified that the page size is a multiple of `QuantumBytes()`.
const int prot = PROT_READ | PROT_WRITE;
const int flags = MAP_ANONYMOUS | MAP_PRIVATE;
const int fd = -1;
void* p = mmap(0, bytes, prot, flags, fd, off_t{0});
if (p == MAP_FAILED) p = nullptr;
return PtrAndDeleter{p, DeleterFunc2([bytes](void* ptr) {
HWY_ASSERT(munmap(ptr, bytes) == 0);
})};
#elif HWY_OS_WIN
const size_t alignment = HWY_MAX(vector_bytes_, line_bytes_);
return PtrAndDeleter{_aligned_malloc(bytes, alignment),
DeleterFunc2([](void* ptr) { _aligned_free(ptr); })};
#else
return PtrAndDeleter{nullptr, DeleterFunc2()};
#endif
}
bool Allocator2::BindMemory(void* ptr, size_t bytes, size_t node) const {
return Allocator::BindMemory(ptr, bytes, node);
}
} // namespace gcpp
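To make the constructor's size relationships concrete, a worked example with assumed detection results (64-byte cache lines and vectors, 4 KiB base pages, binding enabled); these numbers are illustrative, not from any particular machine:

  #include <cstddef>

  constexpr size_t kLineBytes = 64;    // assumed DetectLineBytes()
  constexpr size_t kVectorBytes = 64;  // assumed hwy::VectorBytes()
  constexpr size_t kPageBytes = 4096;  // assumed DetectPageSize()

  // step_bytes_ = HWY_MAX(line_bytes_, vector_bytes_)
  constexpr size_t kStepBytes =
      kLineBytes > kVectorBytes ? kLineBytes : kVectorBytes;
  // With binding enabled, quantum_bytes_ is raised from step to page size.
  constexpr size_t kQuantumBytes = kPageBytes;
  // quantum_step_mask_ = quantum_bytes_ / step_bytes_ - 1
  constexpr size_t kQuantumStepMask = kQuantumBytes / kStepBytes - 1;

  static_assert(kStepBytes == 64, "work granularity");
  static_assert(kQuantumBytes % kStepBytes == 0, "invariant checked by the ctor");
  static_assert(kQuantumStepMask == 63, "4096 / 64 - 1");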

util/allocator.h

@@ -21,6 +21,7 @@
#include <stddef.h>
#include <stdint.h>
#include <functional>
// IWYU pragma: begin_exports
#include <memory> // std::unique_ptr
@@ -330,6 +331,156 @@ RowPtr<T> RowPtrFromBatch(RowVectorBatch<T>& row_vectors) {
return RowPtr<T>(row_vectors.All(), row_vectors.Cols(), row_vectors.Stride());
}
// Custom deleter for types without a dtor, but where the deallocation requires
// state, e.g. a lambda with *by-value* capture.
class DeleterFunc2 {
public:
// `MatOwnerT` requires this to be default-constructible.
DeleterFunc2() = default;
template <class Closure>
DeleterFunc2(const Closure& free_closure) : free_func_(free_closure) {}
template <typename T>
void operator()(T* p) const {
free_func_(const_cast<hwy::RemoveConst<T>*>(p));
}
private:
std::function<void(void*)> free_func_;
};
// Wrapper that also calls the destructor for each element being deallocated.
class DeleterDtor2 {
public:
DeleterDtor2() {}
DeleterDtor2(size_t num, DeleterFunc2 free) : num_(num), free_(free) {}
template <typename T>
void operator()(T* p) const {
for (size_t i = 0; i < num_; ++i) {
p[i].~T();
}
free_(p);
}
private:
size_t num_;
DeleterFunc2 free_;
};
// Unique (move-only) pointer to aligned POD T, which can be an array or class.
template <typename T>
using AlignedPtr2 = std::unique_ptr<T, DeleterFunc2>;
// Unique (move-only) pointer to an aligned array of non-POD T.
template <typename T>
using AlignedClassPtr2 = std::unique_ptr<T, DeleterDtor2>;
// Allocation, binding, and row accessors all depend on the sizes of memory
// pages and cache lines. To avoid having to pass `Allocator2&` everywhere, we
// wrap this in a singleton. A monostate requires explicit initialization,
// which we prefer to avoid because there are many main() functions.
class Allocator2 {
public:
// Must be called at least once before any other function. Not thread-safe,
// hence only call this from the main thread.
// TODO: remove enable_bind once Gemma tensors support binding.
Allocator2(const BoundedTopology& topology, bool enable_bind);
// Bytes per cache line, or a reasonable guess if unknown. Used to choose
// ranges such that there will be no false sharing.
size_t LineBytes() const { return line_bytes_; }
// Bytes per full vector. Used to compute loop steps.
size_t VectorBytes() const { return vector_bytes_; }
// Work granularity that avoids false sharing and partial vectors.
// = HWY_MAX(LineBytes(), VectorBytes())
size_t StepBytes() const { return step_bytes_; }
// File size multiple required for memory mapping.
size_t BasePageBytes() const { return base_page_bytes_; }
// Either StepBytes or BasePageBytes if NUMA.
size_t QuantumBytes() const { return quantum_bytes_; }
template <typename T>
size_t Quantum() const {
return QuantumBytes() / sizeof(T);
}
// Upper bound on `Quantum()`, for stack allocations.
template <typename T>
static constexpr size_t MaxQuantum() {
return 4096 / sizeof(T);
}
// = QuantumBytes() / StepBytes() - 1
size_t QuantumStepMask() const { return quantum_step_mask_; }
// L1 and L2 are typically per core.
size_t L1Bytes() const { return l1_bytes_; }
size_t L2Bytes() const { return l2_bytes_; }
// Clusters often share an L3. We return the total size per package.
size_t L3Bytes() const { return l3_bytes_; }
size_t TotalMiB() const { return total_mib_; }
size_t FreeMiB() const;
// Returns pointer aligned to `QuantumBytes()`.
template <typename T>
AlignedPtr2<T[]> Alloc(size_t num) const {
const size_t bytes = num * sizeof(T);
// Fail if the `bytes = num * sizeof(T)` computation overflowed.
HWY_ASSERT(bytes / sizeof(T) == num);
PtrAndDeleter pd = AllocBytes(bytes);
return AlignedPtr2<T[]>(static_cast<T*>(pd.p), pd.deleter);
}
// Same as Alloc, but calls constructor(s) with `args` and the deleter will
// call destructor(s).
template <typename T, class... Args>
AlignedClassPtr2<T> AllocClasses(size_t num, Args&&... args) const {
const size_t bytes = num * sizeof(T);
// Fail if the `bytes = num * sizeof(T)` computation overflowed.
HWY_ASSERT(bytes / sizeof(T) == num);
PtrAndDeleter pd = AllocBytes(bytes);
T* p = static_cast<T*>(pd.p);
for (size_t i = 0; i < num; ++i) {
new (p + i) T(std::forward<Args>(args)...);
}
return AlignedClassPtr2<T>(p, DeleterDtor2(num, pd.deleter));
}
// Returns whether `BindMemory` can/should be called, i.e. we have page-level
// control over memory placement and multiple packages and NUMA nodes.
bool ShouldBind() const { return should_bind_; }
// Attempts to move(!) `[p, p + bytes)` to the given NUMA node, which is
// typically `BoundedTopology::GetCluster(package_idx, cluster_idx).node`.
// Writes zeros to SOME of the memory. Only call if `ShouldBind()`.
// `p` and `bytes` must be multiples of `QuantumBytes()`.
bool BindMemory(void* p, size_t bytes, size_t node) const;
private:
// Type-erased so this can be implemented in allocator.cc.
struct PtrAndDeleter {
void* p;
DeleterFunc2 deleter;
};
PtrAndDeleter AllocBytes(size_t bytes) const;
size_t line_bytes_;
size_t vector_bytes_;
size_t step_bytes_;
size_t base_page_bytes_;
size_t quantum_bytes_;
size_t quantum_step_mask_;
size_t l1_bytes_ = 0;
size_t l2_bytes_ = 0;
size_t l3_bytes_ = 0;
size_t total_mib_;
bool should_bind_ = false;
};
} // namespace gcpp
#endif // THIRD_PARTY_GEMMA_CPP_UTIL_ALLOCATOR_H_
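The by-value-capture idea behind `DeleterFunc2` can be shown standalone with `std::unique_ptr` and `std::function`; this sketch is not part of the commit and uses plain `malloc` for brevity:

  #include <cstddef>
  #include <cstdio>
  #include <cstdlib>
  #include <functional>
  #include <memory>

  // A deleter that owns its state by value, like DeleterFunc2 above.
  using FreeFn = std::function<void(void*)>;

  int main() {
    const std::size_t bytes = 1024;
    void* raw = std::malloc(bytes);
    // The lambda captures `bytes` by value, so the deleter carries its state
    // with it and needs no external bookkeeping when it runs.
    std::unique_ptr<void, FreeFn> p(raw, [bytes](void* ptr) {
      std::printf("freeing %zu bytes\n", bytes);
      std::free(ptr);
    });
    return 0;
  }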

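And a hedged usage sketch of the new `Allocator2` interface; `AllocMatrix` and the node-0 binding are hypothetical and only illustrate the call shapes of `Alloc`, `ShouldBind`, `QuantumBytes` and `BindMemory`:

  #include "util/allocator.h"

  namespace gcpp {

  // Hypothetical helper: allocates a float buffer aligned to QuantumBytes()
  // and binds a quantum-aligned prefix of it if binding is enabled.
  AlignedPtr2<float[]> AllocMatrix(const Allocator2& allocator, size_t rows,
                                   size_t cols) {
    AlignedPtr2<float[]> buf = allocator.Alloc<float>(rows * cols);
    if (allocator.ShouldBind()) {
      // Real callers would shard across packages/clusters; binding everything
      // to node 0 here only shows the call shape.
      const size_t bytes = rows * cols * sizeof(float);
      const size_t rounded = bytes - (bytes % allocator.QuantumBytes());
      allocator.BindMemory(buf.get(), rounded, /*node=*/0);
    }
    return buf;
  }

  }  // namespace gcpp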
util/threading_context.cc (new file, 63 lines)

@@ -0,0 +1,63 @@
// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "util/threading_context.h"
#include <memory>
#include <mutex> // NOLINT
namespace gcpp {
static ThreadingArgs s_args;
// Cannot use magic static because that does not support `Invalidate`, hence
// allocate manually.
static std::unique_ptr<ThreadingContext2> s_ctx;
static std::mutex s_ctx_mutex;
/*static*/ void ThreadingContext2::SetArgs(const ThreadingArgs& args) {
s_ctx_mutex.lock();
HWY_ASSERT(!s_ctx); // Ensure not already initialized, else this is too late.
s_args = args;
s_ctx_mutex.unlock();
}
/*static*/ ThreadingContext2& ThreadingContext2::Get() {
// We do not bother with double-checked locking because it requires an
// atomic pointer, but we prefer to use unique_ptr for simplicity. Also,
// callers can cache the result and call less often.
s_ctx_mutex.lock();
if (HWY_UNLIKELY(!s_ctx)) {
s_ctx = std::make_unique<ThreadingContext2>(PrivateToken());
}
s_ctx_mutex.unlock();
return *s_ctx;
}
/*static*/ void ThreadingContext2::ThreadHostileInvalidate() {
// Deliberately avoid taking the lock so that tsan can warn if this is
// called concurrently with other calls to `Get`.
s_ctx.reset();
}
// WARNING: called with `s_ctx_mutex` held. Calling `SetArgs` or `Get` would
// deadlock.
ThreadingContext2::ThreadingContext2(ThreadingContext2::PrivateToken)
: topology(BoundedSlice(s_args.skip_packages, s_args.max_packages),
BoundedSlice(s_args.skip_clusters, s_args.max_clusters),
BoundedSlice(s_args.skip_lps, s_args.max_lps)),
allocator(topology, s_args.bind != Tristate::kFalse),
pools(topology, allocator, s_args.max_threads, s_args.pin) {}
} // namespace gcpp

util/threading_context.h (new file, 128 lines)

@@ -0,0 +1,128 @@
// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_CONTEXT_H_
#define THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_CONTEXT_H_
// Separate component to ensure `threading.cc` does not have access to
// `ThreadingContext`, because that could deadlock.
#include <stddef.h>
#include <stdint.h>
// IWYU pragma: begin_exports
#include "util/allocator.h"
#include "util/args.h"
#include "util/basics.h" // Tristate
#include "util/threading.h"
#include "util/topology.h"
// IWYU pragma: end_exports
namespace gcpp {
// Optional arguments for `ThreadingContext` from the command line.
class ThreadingArgs : public ArgsBase<ThreadingArgs> {
public:
ThreadingArgs(int argc, char* argv[]) { InitAndParse(argc, argv); }
ThreadingArgs() { Init(); };
// For BoundedTopology:
size_t skip_packages;
size_t max_packages;
size_t skip_clusters;
size_t max_clusters;
size_t skip_lps;
size_t max_lps;
Tristate bind;
// For NestedPools:
size_t max_threads; // divided among the detected clusters
Tristate pin; // pin threads?
Tristate spin; // use spin waits?
template <class Visitor>
void ForEach(const Visitor& visitor) {
// These can be used to partition CPU sockets/packages and their
// clusters/CCXs across several program instances. The default is to use
// all available resources.
visitor(skip_packages, "skip_packages", size_t{0},
"Index of the first socket to use; default 0 = unlimited.", 2);
visitor(max_packages, "max_packages", size_t{0},
"Maximum number of sockets to use; default 0 = unlimited.", 2);
visitor(skip_clusters, "skip_clusters", size_t{0},
"Index of the first CCX to use; default 0 = unlimited.", 2);
visitor(max_clusters, "max_clusters", size_t{0},
"Maximum number of CCXs to use; default 0 = unlimited.", 2);
// These are only used when CPU topology is unknown.
visitor(skip_lps, "skip_lps", size_t{0},
"Index of the first LP to use; default 0 = unlimited.", 2);
visitor(max_lps, "max_lps", size_t{0},
"Maximum number of LPs to use; default 0 = unlimited.", 2);
// The exact meaning is more subtle: see the comment at NestedPools ctor.
visitor(max_threads, "num_threads", size_t{0},
"Maximum number of threads to use; default 0 = unlimited.", 2);
visitor(pin, "pin", Tristate::kDefault,
"Pin threads? -1 = auto, 0 = no, 1 = yes.", 2);
visitor(spin, "spin", Tristate::kDefault,
"Use spin waits? -1 = auto, 0 = no, 1 = yes.", 2);
visitor(bind, "bind", Tristate::kDefault,
"Bind memory to sockets? -1 = auto, 0 = no, 1 = yes.", 2);
}
};
// Lazily-initialized singleton with support for passing in arguments from
// `ThreadingArgs` and re-initializing with different arguments.
class ThreadingContext2 {
struct PrivateToken {}; // avoids constructing directly
public:
// If not called, default arguments are used when `Get` initializes the
// singleton. Must not be called after `Get`, unless after a call to
// `ThreadHostileInvalidate`, because otherwise initialization already
// happened and the arguments would have no effect. Thread-safe, though this
// is expected to be called early in the program, before threading starts.
static void SetArgs(const ThreadingArgs& args);
// Returns a reference to the singleton after initializing it if necessary.
// When initializing, uses the args passed to `SetArgs`, or defaults.
//
// It is safe to call this concurrently with other `Get`, but not with
// `SetArgs`, because that will warn if called after this, nor with
// `ThreadHostileInvalidate`, because that will invalidate the reference which
// callers of this may still be using. Such usage only occurs in tests,
// hence we prefer not to pull `std::shared_ptr` into the interface.
//
// To reduce overhead, callers should cache the result and call less often.
static ThreadingContext2& Get();
// Invalidates the singleton before or after a call to `Get`. This allows
// changing the arguments between tests. Callers must again call `Get`
// afterwards to obtain an instance. WARNING: must not be called concurrently
// with other calls to `Get` and usages of its return value.
static void ThreadHostileInvalidate();
explicit ThreadingContext2(PrivateToken); // only called via `Get`.
BoundedTopology topology;
Allocator2 allocator;
NestedPools pools;
};
} // namespace gcpp
#endif // THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_CONTEXT_H_
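A short usage sketch based on the comments above; the argument values and the `Example` function are illustrative only:

  #include "util/threading_context.h"

  namespace gcpp {

  void Example() {
    // Optional: override defaults. Must happen before the first Get().
    ThreadingArgs args;
    args.max_threads = 8;
    args.pin = Tristate::kFalse;
    ThreadingContext2::SetArgs(args);

    // The first Get() constructs the singleton using the args set above.
    ThreadingContext2& ctx = ThreadingContext2::Get();
    NestedPools& pools = ctx.pools;               // thread pools per cluster
    const Allocator2& allocator = ctx.allocator;  // page/cache-line sizes
    (void)pools;
    (void)allocator;

    // Tests that want different args can invalidate and start over, but only
    // while no other thread is using the previous instance.
    ThreadingContext2::ThreadHostileInvalidate();
  }

  }  // namespace gcpp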

util/topology.cc

@@ -138,13 +138,13 @@ BoundedTopology::Cluster::Cluster(const LPS& enabled_lps,
}
if (HWY_UNLIKELY(private_kib_ != tcluster.private_kib)) {
warned = true;
-HWY_WARN("lp %zu private_kib %zu != cluster %zu.", lp, private_kib_,
-         tcluster.private_kib);
+HWY_WARN("lp %zu private_kib %zu != cluster %u.", lp, private_kib_,
+         static_cast<unsigned>(tcluster.private_kib));
}
if (HWY_UNLIKELY(shared_kib_ != tcluster.shared_kib)) {
warned = true;
-HWY_WARN("lp %zu shared_kib %zu != cluster %zu.", lp, shared_kib_,
-         tcluster.shared_kib);
+HWY_WARN("lp %zu shared_kib %zu != cluster %u.", lp, shared_kib_,
+         static_cast<unsigned>(tcluster.shared_kib));
}
} // !warned
}