Add new singleton Allocator2 instead of monostate

Not yet used.

Also fix format-string warning in topology.cc.

PiperOrigin-RevId: 745166210
Jan Wassenberg 2025-04-08 09:00:18 -07:00 committed by Copybara-Service
parent 4e6aa36e9b
commit 5d4f7e0f7e
5 changed files with 520 additions and 7 deletions
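For context on "instead of monostate": a minimal sketch, not part of this commit, contrasting the two patterns (names are illustrative). The actual `ThreadingContext2` below deliberately avoids a C++ magic static so that it can also support `ThreadHostileInvalidate`.

  #include <cstddef>

  // Monostate: all state is static, so every main() must remember to call
  // Init() before the first use.
  struct MonostateAllocator {
    static void Init() { line_bytes = 64; }  // explicit initialization required
    static inline size_t line_bytes = 0;     // C++17 inline static member
  };

  // Lazily-initialized singleton: the first Get() constructs the instance,
  // so no caller has to remember an explicit Init().
  class SingletonAllocator {
   public:
    static SingletonAllocator& Get() {
      static SingletonAllocator instance;  // constructed on first use
      return instance;
    }
    size_t LineBytes() const { return line_bytes_; }
   private:
    SingletonAllocator() : line_bytes_(64) {}
    size_t line_bytes_;
  };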

util/allocator.cc

@@ -15,12 +15,12 @@
#include "util/allocator.h"
#include <stdint.h>
#include <stdio.h>
#include "util/basics.h" // MaybeCheckInitialized
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
-#include "hwy/contrib/thread_pool/futex.h"
#include "hwy/contrib/thread_pool/topology.h"
#include "hwy/per_target.h" // VectorBytes
@@ -46,13 +46,32 @@
#endif // GEMMA_BIND
#if GEMMA_BIND && HWY_OS_LINUX
#include <atomic>
#include "hwy/contrib/thread_pool/futex.h"
#endif
#if HWY_OS_LINUX
#include <unistd.h> // sysconf
#if GEMMA_BIND
// `move_pages` requires anonymous/private mappings, hence mmap.
#include <sys/mman.h>
#include <sys/syscall.h>
#include <cerrno>
#include <vector>
-#endif // GEMMA_BIND && HWY_OS_LINUX
+#endif // GEMMA_BIND
#elif HWY_OS_WIN
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef VC_EXTRALEAN
#define VC_EXTRALEAN
#endif
#include <Windows.h>
#elif HWY_OS_APPLE
#include <sys/sysctl.h>
#endif // HWY_OS_LINUX
namespace gcpp {
namespace {
@@ -68,14 +87,47 @@ size_t DetectLineBytes() {
size_t DetectPageSize() {
#if HWY_OS_LINUX
-size_t page_bytes = static_cast<size_t>(sysconf(_SC_PAGESIZE));
+const long ret = sysconf(_SC_PAGESIZE);  // NOLINT(runtime/int)
+HWY_ASSERT(ret != -1);
+const size_t page_bytes = static_cast<size_t>(ret);
HWY_ASSERT(page_bytes <= (4 << 20));
return page_bytes;
#elif HWY_OS_WIN
SYSTEM_INFO info;
GetSystemInfo(&info);
return info.dwPageSize;
#elif HWY_OS_APPLE
uint64_t data = 0;
size_t len = sizeof(data);
HWY_ASSERT(sysctlbyname("vm.pagesize", &data, &len, nullptr, 0) == 0);
return data;
#else
return 0;
#endif
}
size_t DetectTotalMiB(size_t page_bytes) {
(void)page_bytes;
#if HWY_OS_LINUX
const long ret = sysconf(_SC_PHYS_PAGES); // NOLINT(runtime/int)
HWY_ASSERT(ret != -1);
return static_cast<size_t>(ret) * page_bytes >> 20;
#elif HWY_OS_WIN
MEMORYSTATUSEX ms = {sizeof(MEMORYSTATUSEX)};
HWY_ASSERT(GlobalMemoryStatusEx(&ms) != 0);
return ms.ullTotalPhys >> 20;
#elif HWY_OS_APPLE
int mib[2] = {CTL_HW, HW_MEMSIZE};
uint64_t data = 0;
size_t len = sizeof(data);
HWY_ASSERT(sysctl(mib, sizeof(mib) / sizeof(*mib), &data, &len, nullptr, 0) ==
0);
return data >> 20;
#else
#error "Port"
#endif
}
} // namespace
static size_t line_bytes_;
@@ -305,4 +357,123 @@ bool Allocator::BindMemory(void* ptr, size_t bytes, size_t node) {
bool Allocator::BindMemory(void*, size_t, size_t) { return false; }
#endif // GEMMA_BIND && HWY_OS_LINUX
Allocator2::Allocator2(const BoundedTopology& topology, bool enable_bind) {
line_bytes_ = DetectLineBytes();
vector_bytes_ = hwy::VectorBytes();
step_bytes_ = HWY_MAX(line_bytes_, vector_bytes_);
base_page_bytes_ = DetectPageSize();
quantum_bytes_ = step_bytes_; // may be overwritten below
const BoundedTopology::Cluster& cluster = topology.GetCluster(0, 0);
if (const hwy::Cache* caches = hwy::DataCaches()) {
l1_bytes_ = caches[1].size_kib << 10;
l2_bytes_ = caches[2].size_kib << 10;
l3_bytes_ = (caches[3].size_kib << 10) * caches[3].cores_sharing;
} else { // Unknown, make reasonable assumptions.
l1_bytes_ = 32 << 10;
l2_bytes_ = (cluster.PrivateKiB() ? cluster.PrivateKiB() : 256) << 10;
}
if (l3_bytes_ == 0) {
l3_bytes_ = (cluster.SharedKiB() ? cluster.SharedKiB() : 1024) << 10;
}
total_mib_ = DetectTotalMiB(base_page_bytes_);
// Prerequisites for binding:
// - supported by the OS (currently Linux only),
// - the page size is known and 'reasonably small', preferably a small
// fraction of MatMul row/col sizes, which for 27B are up to 144 KiB;
// - we successfully detected topology and there are multiple nodes;
// - there are multiple packages, because we shard by package_idx.
if constexpr (GEMMA_BIND) {
if ((base_page_bytes_ != 0 && base_page_bytes_ <= 16 * 1024) &&
topology.NumNodes() > 1 && topology.NumPackages() > 1) {
if (enable_bind) {
// Ensure pages meet the alignment requirements of `AllocBytes`.
HWY_ASSERT(base_page_bytes_ >= quantum_bytes_);
quantum_bytes_ = base_page_bytes_;
// Ensure MaxQuantum() is an upper bound.
HWY_ASSERT(MaxQuantum<uint8_t>() >= Quantum<uint8_t>());
should_bind_ = true;
} else {
HWY_WARN(
"Multiple sockets but binding disabled. This reduces speed; "
"set or remove enable_bind to avoid this warning.");
}
}
}
HWY_DASSERT(quantum_bytes_ % step_bytes_ == 0);
quantum_step_mask_ = quantum_bytes_ / step_bytes_ - 1;
}
size_t Allocator2::FreeMiB() const {
#if HWY_OS_LINUX
const long ret = sysconf(_SC_AVPHYS_PAGES); // NOLINT(runtime/int)
HWY_ASSERT(ret != -1);
return static_cast<size_t>(ret) * base_page_bytes_ >> 20;
#elif HWY_OS_WIN
MEMORYSTATUSEX ms = {sizeof(MEMORYSTATUSEX)};
HWY_ASSERT(GlobalMemoryStatusEx(&ms) != 0);
return ms.ullAvailVirtual >> 20;
#elif HWY_OS_APPLE
uint64_t free = 0, inactive = 0, speculative = 0;
size_t len = sizeof(free);
sysctlbyname("vm.page_free_count", &free, &len, nullptr, 0);
sysctlbyname("vm.page_inactive_count", &inactive, &len, nullptr, 0);
sysctlbyname("vm.page_speculative_count", &speculative, &len, nullptr, 0);
return (free + inactive + speculative) * base_page_bytes_ >> 20;
#else
#error "Port"
#endif
}
Allocator2::PtrAndDeleter Allocator2::AllocBytes(size_t bytes) const {
// If we are not binding, the Highway allocator is cheaper than `mmap`, and
// defends against 2K aliasing.
if (!should_bind_) {
// Perf warning if Highway's alignment is less than we want.
if (HWY_ALIGNMENT < QuantumBytes()) {
HWY_WARN(
"HWY_ALIGNMENT %d < QuantumBytes %zu: either vector or cache lines "
"are huge, enable GEMMA_BIND to avoid this warning.",
HWY_ALIGNMENT, QuantumBytes());
}
auto p = hwy::AllocateAligned<uint8_t>(bytes);
// The `hwy::AlignedFreeUniquePtr` deleter is unfortunately specific to the
// alignment scheme in aligned_allocator.cc and does not work for
// already-aligned pointers as returned by `mmap`, hence we wrap the Highway
// pointer in our own deleter.
return PtrAndDeleter{p.release(), DeleterFunc2([](void* ptr) {
hwy::FreeAlignedBytes(ptr, nullptr, nullptr);
})};
}
// Binding, or large vector/cache line size: use platform-specific allocator.
#if HWY_OS_LINUX && !defined(__ANDROID_API__)
// `move_pages` is documented to require an anonymous/private mapping or
// `MAP_SHARED`. A normal allocation might not suffice, so we use `mmap`.
// `Init` verified that the page size is a multiple of `QuantumBytes()`.
const int prot = PROT_READ | PROT_WRITE;
const int flags = MAP_ANONYMOUS | MAP_PRIVATE;
const int fd = -1;
void* p = mmap(0, bytes, prot, flags, fd, off_t{0});
if (p == MAP_FAILED) p = nullptr;
return PtrAndDeleter{p, DeleterFunc2([bytes](void* ptr) {
HWY_ASSERT(munmap(ptr, bytes) == 0);
})};
#elif HWY_OS_WIN
const size_t alignment = HWY_MAX(vector_bytes_, line_bytes_);
return PtrAndDeleter{_aligned_malloc(bytes, alignment),
DeleterFunc2([](void* ptr) { _aligned_free(ptr); })};
#else
return PtrAndDeleter{nullptr, DeleterFunc2()};
#endif
}
bool Allocator2::BindMemory(void* ptr, size_t bytes, size_t node) const {
return Allocator::BindMemory(ptr, bytes, node);
}
} // namespace gcpp
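To make the constructor's size relationships concrete, a worked example with assumed detection results (64-byte cache lines and vectors, 4 KiB base pages, binding enabled); these numbers are illustrative, not from any particular machine:

  #include <cstddef>

  constexpr size_t kLineBytes = 64;    // assumed DetectLineBytes()
  constexpr size_t kVectorBytes = 64;  // assumed hwy::VectorBytes()
  constexpr size_t kPageBytes = 4096;  // assumed DetectPageSize()

  // step_bytes_ = HWY_MAX(line_bytes_, vector_bytes_)
  constexpr size_t kStepBytes =
      kLineBytes > kVectorBytes ? kLineBytes : kVectorBytes;
  // With binding enabled, quantum_bytes_ is raised from step to page size.
  constexpr size_t kQuantumBytes = kPageBytes;
  // quantum_step_mask_ = quantum_bytes_ / step_bytes_ - 1
  constexpr size_t kQuantumStepMask = kQuantumBytes / kStepBytes - 1;

  static_assert(kStepBytes == 64, "work granularity");
  static_assert(kQuantumBytes % kStepBytes == 0, "invariant checked by the ctor");
  static_assert(kQuantumStepMask == 63, "4096 / 64 - 1");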

util/allocator.h

@@ -21,6 +21,7 @@
#include <stddef.h>
#include <stdint.h>
#include <functional>
// IWYU pragma: begin_exports
#include <memory> // std::unique_ptr
@@ -330,6 +331,156 @@ RowPtr<T> RowPtrFromBatch(RowVectorBatch<T>& row_vectors) {
return RowPtr<T>(row_vectors.All(), row_vectors.Cols(), row_vectors.Stride());
}
// Custom deleter for types without a dtor, but where the deallocation requires
// state, e.g. a lambda with *by-value* capture.
class DeleterFunc2 {
public:
// `MatOwnerT` requires this to be default-constructible.
DeleterFunc2() = default;
template <class Closure>
DeleterFunc2(const Closure& free_closure) : free_func_(free_closure) {}
template <typename T>
void operator()(T* p) const {
free_func_(const_cast<hwy::RemoveConst<T>*>(p));
}
private:
std::function<void(void*)> free_func_;
};
// Wrapper that also calls the destructor for each element being deallocated.
class DeleterDtor2 {
public:
DeleterDtor2() {}
DeleterDtor2(size_t num, DeleterFunc2 free) : num_(num), free_(free) {}
template <typename T>
void operator()(T* p) const {
for (size_t i = 0; i < num_; ++i) {
p[i].~T();
}
free_(p);
}
private:
size_t num_;
DeleterFunc2 free_;
};
// Unique (move-only) pointer to aligned POD T, which can be an array or class.
template <typename T>
using AlignedPtr2 = std::unique_ptr<T, DeleterFunc2>;
// Unique (move-only) pointer to an aligned array of non-POD T.
template <typename T>
using AlignedClassPtr2 = std::unique_ptr<T, DeleterDtor2>;
// Allocation, binding, and row accessors all depend on the sizes of memory
// pages and cache lines. To avoid having to pass `Allocator2&` everywhere, we
// wrap this in a singleton. A monostate requires explicit initialization,
// which we prefer to avoid because there are many main() functions.
class Allocator2 {
public:
// Must be called at least once before any other function. Not thread-safe,
// hence only call this from the main thread.
// TODO: remove enable_bind once Gemma tensors support binding.
Allocator2(const BoundedTopology& topology, bool enable_bind);
// Bytes per cache line, or a reasonable guess if unknown. Used to choose
// ranges such that there will be no false sharing.
size_t LineBytes() const { return line_bytes_; }
// Bytes per full vector. Used to compute loop steps.
size_t VectorBytes() const { return vector_bytes_; }
// Work granularity that avoids false sharing and partial vectors.
// = HWY_MAX(LineBytes(), VectorBytes())
size_t StepBytes() const { return step_bytes_; }
// File size multiple required for memory mapping.
size_t BasePageBytes() const { return base_page_bytes_; }
// Either StepBytes or BasePageBytes if NUMA.
size_t QuantumBytes() const { return quantum_bytes_; }
template <typename T>
size_t Quantum() const {
return QuantumBytes() / sizeof(T);
}
// Upper bound on `Quantum()`, for stack allocations.
template <typename T>
static constexpr size_t MaxQuantum() {
return 4096 / sizeof(T);
}
// = QuantumBytes() / StepBytes() - 1
size_t QuantumStepMask() const { return quantum_step_mask_; }
// L1 and L2 are typically per core.
size_t L1Bytes() const { return l1_bytes_; }
size_t L2Bytes() const { return l2_bytes_; }
// Clusters often share an L3. We return the total size per package.
size_t L3Bytes() const { return l3_bytes_; }
size_t TotalMiB() const { return total_mib_; }
size_t FreeMiB() const;
// Returns pointer aligned to `QuantumBytes()`.
template <typename T>
AlignedPtr2<T[]> Alloc(size_t num) const {
const size_t bytes = num * sizeof(T);
// Fail if the `bytes = num * sizeof(T)` computation overflowed.
HWY_ASSERT(bytes / sizeof(T) == num);
PtrAndDeleter pd = AllocBytes(bytes);
return AlignedPtr2<T[]>(static_cast<T*>(pd.p), pd.deleter);
}
// Same as Alloc, but calls constructor(s) with `args` and the deleter will
// call destructor(s).
template <typename T, class... Args>
AlignedClassPtr2<T> AllocClasses(size_t num, Args&&... args) const {
const size_t bytes = num * sizeof(T);
// Fail if the `bytes = num * sizeof(T)` computation overflowed.
HWY_ASSERT(bytes / sizeof(T) == num);
PtrAndDeleter pd = AllocBytes(bytes);
T* p = static_cast<T*>(pd.p);
for (size_t i = 0; i < num; ++i) {
new (p + i) T(std::forward<Args>(args)...);
}
return AlignedClassPtr2<T>(p, DeleterDtor2(num, pd.deleter));
}
// Returns whether `BindMemory` can/should be called, i.e. we have page-level
// control over memory placement and multiple packages and NUMA nodes.
bool ShouldBind() const { return should_bind_; }
// Attempts to move(!) `[p, p + bytes)` to the given NUMA node, which is
// typically `BoundedTopology::GetCluster(package_idx, cluster_idx).node`.
// Writes zeros to SOME of the memory. Only call if `ShouldBind()`.
// `p` and `bytes` must be multiples of `QuantumBytes()`.
bool BindMemory(void* p, size_t bytes, size_t node) const;
private:
// Type-erased so this can be implemented in allocator.cc.
struct PtrAndDeleter {
void* p;
DeleterFunc2 deleter;
};
PtrAndDeleter AllocBytes(size_t bytes) const;
size_t line_bytes_;
size_t vector_bytes_;
size_t step_bytes_;
size_t base_page_bytes_;
size_t quantum_bytes_;
size_t quantum_step_mask_;
size_t l1_bytes_ = 0;
size_t l2_bytes_ = 0;
size_t l3_bytes_ = 0;
size_t total_mib_;
bool should_bind_ = false;
};
} // namespace gcpp
#endif // THIRD_PARTY_GEMMA_CPP_UTIL_ALLOCATOR_H_
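The by-value-capture idea behind `DeleterFunc2` can be shown standalone with `std::unique_ptr` and `std::function`; this sketch is not part of the commit and uses plain `malloc` for brevity:

  #include <cstddef>
  #include <cstdio>
  #include <cstdlib>
  #include <functional>
  #include <memory>

  // A deleter that owns its state by value, like DeleterFunc2 above.
  using FreeFn = std::function<void(void*)>;

  int main() {
    const std::size_t bytes = 1024;
    void* raw = std::malloc(bytes);
    // The lambda captures `bytes` by value, so the deleter carries its state
    // with it and needs no external bookkeeping when it runs.
    std::unique_ptr<void, FreeFn> p(raw, [bytes](void* ptr) {
      std::printf("freeing %zu bytes\n", bytes);
      std::free(ptr);
    });
    return 0;
  }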

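And a hedged usage sketch of the new `Allocator2` interface; `AllocMatrix` and the node-0 binding are hypothetical and only illustrate the call shapes of `Alloc`, `ShouldBind`, `QuantumBytes` and `BindMemory`:

  #include "util/allocator.h"

  namespace gcpp {

  // Hypothetical helper: allocates a float buffer aligned to QuantumBytes()
  // and binds a quantum-aligned prefix of it if binding is enabled.
  AlignedPtr2<float[]> AllocMatrix(const Allocator2& allocator, size_t rows,
                                   size_t cols) {
    AlignedPtr2<float[]> buf = allocator.Alloc<float>(rows * cols);
    if (allocator.ShouldBind()) {
      // Real callers would shard across packages/clusters; binding everything
      // to node 0 here only shows the call shape.
      const size_t bytes = rows * cols * sizeof(float);
      const size_t rounded = bytes - (bytes % allocator.QuantumBytes());
      allocator.BindMemory(buf.get(), rounded, /*node=*/0);
    }
    return buf;
  }

  }  // namespace gcpp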
util/threading_context.cc (new file, 63 lines)

@@ -0,0 +1,63 @@
// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "util/threading_context.h"
#include <memory>
#include <mutex> // NOLINT
namespace gcpp {
static ThreadingArgs s_args;
// Cannot use magic static because that does not support `Invalidate`, hence
// allocate manually.
static std::unique_ptr<ThreadingContext2> s_ctx;
static std::mutex s_ctx_mutex;
/*static*/ void ThreadingContext2::SetArgs(const ThreadingArgs& args) {
s_ctx_mutex.lock();
HWY_ASSERT(!s_ctx); // Ensure not already initialized, else this is too late.
s_args = args;
s_ctx_mutex.unlock();
}
/*static*/ ThreadingContext2& ThreadingContext2::Get() {
// We do not bother with double-checked locking because it requires an
// atomic pointer, but we prefer to use unique_ptr for simplicity. Also,
// callers can cache the result and call less often.
s_ctx_mutex.lock();
if (HWY_UNLIKELY(!s_ctx)) {
s_ctx = std::make_unique<ThreadingContext2>(PrivateToken());
}
s_ctx_mutex.unlock();
return *s_ctx;
}
/*static*/ void ThreadingContext2::ThreadHostileInvalidate() {
// Deliberately avoid taking the lock so that tsan can warn if this is
// called concurrently with other calls to `Get`.
s_ctx.reset();
}
// WARNING: called with `s_ctx_mutex` held. Calling `SetArgs` or `Get` would
// deadlock.
ThreadingContext2::ThreadingContext2(ThreadingContext2::PrivateToken)
: topology(BoundedSlice(s_args.skip_packages, s_args.max_packages),
BoundedSlice(s_args.skip_clusters, s_args.max_clusters),
BoundedSlice(s_args.skip_lps, s_args.max_lps)),
allocator(topology, s_args.bind != Tristate::kFalse),
pools(topology, allocator, s_args.max_threads, s_args.pin) {}
} // namespace gcpp

util/threading_context.h (new file, 128 lines)

@@ -0,0 +1,128 @@
// Copyright 2025 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_CONTEXT_H_
#define THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_CONTEXT_H_
// Separate component to ensure `threading.cc` does not have access to
// `ThreadingContext`, because that could deadlock.
#include <stddef.h>
#include <stdint.h>
// IWYU pragma: begin_exports
#include "util/allocator.h"
#include "util/args.h"
#include "util/basics.h" // Tristate
#include "util/threading.h"
#include "util/topology.h"
// IWYU pragma: end_exports
namespace gcpp {
// Optional arguments for `ThreadingContext` from the command line.
class ThreadingArgs : public ArgsBase<ThreadingArgs> {
public:
ThreadingArgs(int argc, char* argv[]) { InitAndParse(argc, argv); }
ThreadingArgs() { Init(); };
// For BoundedTopology:
size_t skip_packages;
size_t max_packages;
size_t skip_clusters;
size_t max_clusters;
size_t skip_lps;
size_t max_lps;
Tristate bind;
// For NestedPools:
size_t max_threads; // divided among the detected clusters
Tristate pin; // pin threads?
Tristate spin; // use spin waits?
template <class Visitor>
void ForEach(const Visitor& visitor) {
// These can be used to partition CPU sockets/packages and their
// clusters/CCXs across several program instances. The default is to use
// all available resources.
visitor(skip_packages, "skip_packages", size_t{0},
"Index of the first socket to use; default 0 = unlimited.", 2);
visitor(max_packages, "max_packages", size_t{0},
"Maximum number of sockets to use; default 0 = unlimited.", 2);
visitor(skip_clusters, "skip_clusters", size_t{0},
"Index of the first CCX to use; default 0 = unlimited.", 2);
visitor(max_clusters, "max_clusters", size_t{0},
"Maximum number of CCXs to use; default 0 = unlimited.", 2);
// These are only used when CPU topology is unknown.
visitor(skip_lps, "skip_lps", size_t{0},
"Index of the first LP to use; default 0 = unlimited.", 2);
visitor(max_lps, "max_lps", size_t{0},
"Maximum number of LPs to use; default 0 = unlimited.", 2);
// The exact meaning is more subtle: see the comment at NestedPools ctor.
visitor(max_threads, "num_threads", size_t{0},
"Maximum number of threads to use; default 0 = unlimited.", 2);
visitor(pin, "pin", Tristate::kDefault,
"Pin threads? -1 = auto, 0 = no, 1 = yes.", 2);
visitor(spin, "spin", Tristate::kDefault,
"Use spin waits? -1 = auto, 0 = no, 1 = yes.", 2);
visitor(bind, "bind", Tristate::kDefault,
"Bind memory to sockets? -1 = auto, 0 = no, 1 = yes.", 2);
}
};
// Lazily-initialized singleton with support for passing in arguments from
// `ThreadingArgs` and re-initializing with different arguments.
class ThreadingContext2 {
struct PrivateToken {}; // avoids constructing directly
public:
// If not called, default arguments are used when `Get` initializes the
// singleton. Must not be called after `Get`, unless after a call to
// `ThreadHostileInvalidate`, because otherwise initialization already
// happened and the arguments would have no effect. Thread-safe, though this
// is expected to be called early in the program, before threading starts.
static void SetArgs(const ThreadingArgs& args);
// Returns a reference to the singleton after initializing it if necessary.
// When initializing, uses the args passed to `SetArgs`, or defaults.
//
// It is safe to call this concurrently with other `Get`, but not with
// `SetArgs`, because that will warn if called after this, nor with
// `ThreadHostileInvalidate`, because that will invalidate the reference which
// callers of this may still be using. Such usage only occurs in tests,
// hence we prefer not to pull `std::shared_ptr` into the interface.
//
// To reduce overhead, callers should cache the result and call less often.
static ThreadingContext2& Get();
// Invalidates the singleton before or after a call to `Get`. This allows
// changing the arguments between tests. Callers must again call `Get`
// afterwards to obtain an instance. WARNING: must not be called concurrently
// with other calls to `Get` and usages of its return value.
static void ThreadHostileInvalidate();
explicit ThreadingContext2(PrivateToken); // only called via `Get`.
BoundedTopology topology;
Allocator2 allocator;
NestedPools pools;
};
} // namespace gcpp
#endif // THIRD_PARTY_GEMMA_CPP_UTIL_THREADING_CONTEXT_H_
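A short usage sketch based on the comments above; the argument values and the `Example` function are illustrative only:

  #include "util/threading_context.h"

  namespace gcpp {

  void Example() {
    // Optional: override defaults. Must happen before the first Get().
    ThreadingArgs args;
    args.max_threads = 8;
    args.pin = Tristate::kFalse;
    ThreadingContext2::SetArgs(args);

    // The first Get() constructs the singleton using the args set above.
    ThreadingContext2& ctx = ThreadingContext2::Get();
    NestedPools& pools = ctx.pools;               // thread pools per cluster
    const Allocator2& allocator = ctx.allocator;  // page/cache-line sizes
    (void)pools;
    (void)allocator;

    // Tests that want different args can invalidate and start over, but only
    // while no other thread is using the previous instance.
    ThreadingContext2::ThreadHostileInvalidate();
  }

  }  // namespace gcpp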

util/topology.cc

@@ -138,13 +138,13 @@ BoundedTopology::Cluster::Cluster(const LPS& enabled_lps,
}
if (HWY_UNLIKELY(private_kib_ != tcluster.private_kib)) {
warned = true;
-HWY_WARN("lp %zu private_kib %zu != cluster %zu.", lp, private_kib_,
-         tcluster.private_kib);
+HWY_WARN("lp %zu private_kib %zu != cluster %u.", lp, private_kib_,
+         static_cast<unsigned>(tcluster.private_kib));
}
if (HWY_UNLIKELY(shared_kib_ != tcluster.shared_kib)) {
warned = true;
-HWY_WARN("lp %zu shared_kib %zu != cluster %zu.", lp, shared_kib_,
-         tcluster.shared_kib);
+HWY_WARN("lp %zu shared_kib %zu != cluster %u.", lp, shared_kib_,
+         static_cast<unsigned>(tcluster.shared_kib));
}
} // !warned
}