mirror of https://github.com/google/gemma.cpp.git
tune pool kSpin mode in threading_context
Previously, this happened concurrently with the matmul autotune, which could lead to incorrect outcomes.

threading: de-singleton Pinning (no longer stores affinity); pass PoolWorkerMapping; fix Pool dtor order.

Also enable the SPR target (Zen4 is AMD-only) and update the Highway version for the renamed Thread() -> GlobalIdx().

PiperOrigin-RevId: 816223017
parent 9dc802c7aa
commit 035273c184
@@ -114,6 +114,7 @@ cc_library(
         "@highway//:hwy",
         "@highway//:hwy_test_util",
         "@highway//:profiler",
+        "@highway//:thread_pool",
     ],
 )

@@ -22,7 +22,7 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

-FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 1d16731233de45a365b43867f27d0a5f73925300 EXCLUDE_FROM_ALL)
+FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 9781a1698ee0756ef1eaaf96930113ed7cb6d3ee EXCLUDE_FROM_ALL)
 FetchContent_MakeAvailable(highway)

 ## Note: absl needs to be installed by sentencepiece. This will only happen if

@@ -18,7 +18,7 @@ bazel_dep(name = "google_benchmark", version = "1.8.5")
 # Require a more recent version.
 git_override(
     module_name = "highway",
-    commit = "1d16731233de45a365b43867f27d0a5f73925300",
+    commit = "9781a1698ee0756ef1eaaf96930113ed7cb6d3ee",
     remote = "https://github.com/google/highway",
 )

@@ -452,7 +452,7 @@ FetchContent_MakeAvailable(sentencepiece)
 FetchContent_Declare(gemma GIT_REPOSITORY https://github.com/google/gemma.cpp GIT_TAG origin/main)
 FetchContent_MakeAvailable(gemma)

-FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 92d327e841d78e11ae888757a3e16d291951cf64)
+FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 9781a1698ee0756ef1eaaf96930113ed7cb6d3ee)
 FetchContent_MakeAvailable(highway)
 ```

@@ -45,10 +45,11 @@ namespace gcpp {
 // as NEON_WITHOUT_AES. Also skip SVE because SVE2_128 and SVE_256 cover most.
 #define GEMMA_DISABLED_TARGETS (HWY_SCALAR | HWY_NEON | HWY_SVE)
 #elif HWY_ARCH_X86
-// Skip anything older than Haswell (2013); also use Zen4 for recent CPUs,
-// because we do not use anything added by SPR (e.g. FP16) nor AVX 10.2.
+// Skip anything older than Haswell (2013); use Zen4/SPR for recent CPUs.
+// Although we do not use SPR's F16, Zen4 is only enabled for AMD. We do not
+// yet use any AVX 10.2 features.
 #define GEMMA_DISABLED_TARGETS \
-  (HWY_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | HWY_AVX3_SPR | HWY_AVX10_2)
+  (HWY_SCALAR | HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | HWY_AVX10_2)
 #endif  // HWY_ARCH_*

 #endif  // GEMMA_DISABLED_TARGETS

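The change above is a pure mask edit: targets are bit flags, and enabling SPR simply means removing HWY_AVX3_SPR from the disabled set. A self-contained illustration with hypothetical flag values (the real HWY_* constants are defined by Highway):

```cpp
#include <cstdint>

// Hypothetical stand-ins for Highway's target bits; values are illustrative.
constexpr uint64_t kScalar = 1u << 0;
constexpr uint64_t kSSE4 = 1u << 1;
constexpr uint64_t kAVX3_SPR = 1u << 2;

// After this commit, the SPR bit is no longer part of the disabled mask.
constexpr uint64_t kDisabled = kScalar | kSSE4;
static_assert((kDisabled & kAVX3_SPR) == 0, "SPR stays enabled");
```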
@@ -84,7 +84,7 @@ namespace gcpp {
 namespace HWY_NAMESPACE {

 void CallSoftmax(Logits logits, hwy::Profiler& p) {
-  Softmax(logits, p, hwy::Profiler::Thread());
+  Softmax(logits, p, hwy::Profiler::GlobalIdx());
 }

 }  // namespace HWY_NAMESPACE

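The same one-line substitution recurs throughout the diffs below. The pattern, as seen at these call sites (a usage sketch, not Highway's declaration):

```cpp
// Per the commit message, Highway renamed Thread() to GlobalIdx(); the index
// presumably identifies a worker globally across nested pools rather than
// being a plain thread id.
const size_t worker = hwy::Profiler::GlobalIdx();  // was hwy::Profiler::Thread()
Softmax(logits, p, worker);
```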
@@ -18,7 +18,7 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)

 include(FetchContent)
-FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 1d16731233de45a365b43867f27d0a5f73925300)
+FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 9781a1698ee0756ef1eaaf96930113ed7cb6d3ee)
 FetchContent_MakeAvailable(highway)
 FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 9045b2f60fa2b323dfac0eaef8fc17565036f9f9)
 FetchContent_MakeAvailable(sentencepiece)

@@ -18,7 +18,7 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)

 include(FetchContent)
-FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 1d16731233de45a365b43867f27d0a5f73925300)
+FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG 9781a1698ee0756ef1eaaf96930113ed7cb6d3ee)
 FetchContent_MakeAvailable(highway)
 FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c)
 FetchContent_MakeAvailable(sentencepiece)

@@ -376,7 +376,6 @@ void HandleGenerateContentStreaming(ServerState& state, const httplib::Request&

           // Ensure all data is sent
           sink.done();
-
           return false;  // End streaming

         } catch (const std::exception& e) {

@@ -254,7 +254,7 @@ static HWY_INLINE void ComputeQKV(size_t num_tokens, const size_t layer_idx,
                                   MatMulEnv& env) {
   static const auto zone = env.ctx.profiler.AddZone(
       "Gen.Attention.ComputeQKV", hwy::ProfilerFlags::kInclusive);
-  PROFILER_ZONE3(env.ctx.profiler, hwy::Profiler::Thread(), zone);
+  PROFILER_ZONE3(env.ctx.profiler, hwy::Profiler::GlobalIdx(), zone);

   const hwy::Divisor div_qbatch(qbatch.Size());
   const size_t num_interleaved = num_tokens * div_qbatch.GetDivisor();

@@ -330,7 +330,7 @@ static HWY_INLINE void SumHeads(const LayerWeightsPtrs& layer,
                                 MatMulEnv& env) {
   static const auto zone = env.ctx.profiler.AddZone(
       "Gen.Attention.SumHeads", hwy::ProfilerFlags::kInclusive);
-  PROFILER_ZONE3(env.ctx.profiler, hwy::Profiler::Thread(), zone);
+  PROFILER_ZONE3(env.ctx.profiler, hwy::Profiler::GlobalIdx(), zone);
   const LayerConfig& layer_config = layer.layer_config;
   (void)layer_config;  // For HWY_DASSERT
   // att_weights and att_out are concatenated heads, each of length

@@ -350,7 +350,7 @@ void GemmaAttention(size_t num_tokens, const size_t layer_idx,
                     MatMulEnv& env, int flags) {
   static const auto zone =
       env.ctx.profiler.AddZone("Gen.Attention", hwy::ProfilerFlags::kInclusive);
-  PROFILER_ZONE3(env.ctx.profiler, hwy::Profiler::Thread(), zone);
+  PROFILER_ZONE3(env.ctx.profiler, hwy::Profiler::GlobalIdx(), zone);

   const LayerConfig& layer_config = layer.layer_config;
   HWY_DASSERT(!layer_config.IsMHA());  // No longer supported.

@@ -155,7 +155,7 @@ static inline void FFWNoVit(const LayerWeightsPtrs& layer,
                             Activations& activations, MatMulEnv& env) {
   static const auto zone =
       env.ctx.profiler.AddZone("Gen.FFW", hwy::ProfilerFlags::kInclusive);
-  PROFILER_ZONE3(env.ctx.profiler, hwy::Profiler::Thread(), zone);
+  PROFILER_ZONE3(env.ctx.profiler, hwy::Profiler::GlobalIdx(), zone);
   const LayerConfig& layer_config = layer.layer_config;

   HWY_DASSERT(!layer_config.ff_biases);  // Only used in Vit.

@@ -139,7 +139,7 @@ EmbedMMToken(int token, size_t x_row, size_t pos, size_t pos_in_prompt,
              size_t image_token_position = 0) {
   static const auto zone =
       ctx.profiler.AddZone("Gen.Embed", hwy::ProfilerFlags::kInclusive);
-  PROFILER_ZONE3(ctx.profiler, hwy::Profiler::Thread(), zone);
+  PROFILER_ZONE3(ctx.profiler, hwy::Profiler::GlobalIdx(), zone);

   // Image tokens just need to be copied.
   if (model_config.wrapping == PromptWrapping::GEMMA_VLM &&

@@ -335,7 +335,8 @@ void PrefillVit(const ModelConfig& model_config, const WeightsPtrs& weights,
   // Apply soft embedding norm before input projection.
   CallUpcasted(&weights.mm_embed_norm, [&](const auto* weights_t) {
     RMSNormInplace(weights_t->PackedScale1(), activations.x.Row(0),
-                   vit_model_dim, env.ctx.profiler, hwy::Profiler::Thread());
+                   vit_model_dim, env.ctx.profiler,
+                   hwy::Profiler::GlobalIdx());
   });
 }

@@ -19,7 +19,6 @@
 #include <stdio.h>

 #include <algorithm>  // std::sort
-#include <atomic>
 #include <memory>
 #include <optional>
 #include <vector>

@@ -29,7 +28,6 @@
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/contrib/thread_pool/topology.h"
-#include "hwy/profiler.h"

 namespace gcpp {

@@ -41,13 +39,11 @@ static void SortByDescendingSize(std::vector<T>& groups) {
                     [](const T& a, const T& b) { return a.Size() > b.Size(); });
 }

-// Singleton, holds the original process affinity and the pinning status.
-class Pinning {
-  static bool InContainer() {
-    return false; }
-
- public:
-  void SetPolicy(Tristate pin) {
+static bool InContainer() {
+  return false;  // placeholder for container detection, do not remove
+}
+
+PinningPolicy::PinningPolicy(Tristate pin) {
   if (pin == Tristate::kDefault) {
     // Pinning is unreliable inside containers because the hypervisor might
     // periodically change our affinity mask, or other processes might also

@@ -55,28 +51,27 @@ class Pinning {
     pin = InContainer() ? Tristate::kFalse : Tristate::kTrue;
   }
   want_pin_ = (pin == Tristate::kTrue);
-    any_error_.clear();
-  }
+}

-  // If want_pin_, tries to pin each worker in `pool` to an LP in `cluster`,
-  // and sets `any_error_` if any fails.
-  void MaybePin(const BoundedTopology& topology, size_t pkg_idx,
-                size_t cluster_idx, const BoundedTopology::Cluster& cluster,
-                hwy::ThreadPool& pool) {
+// If `pinning.Want()`, tries to pin each worker in `pool` to an LP in
+// `cluster`, and calls `pinning.NotifyFailed()` if any fails.
+void MaybePin(const BoundedTopology& topology, size_t pkg_idx,
+              size_t cluster_idx, const BoundedTopology::Cluster& cluster,
+              PinningPolicy& pinning, hwy::ThreadPool& pool) {
   const std::vector<size_t> lps = cluster.LPVector();
   HWY_ASSERT(pool.NumWorkers() <= lps.size());
   pool.Run(0, pool.NumWorkers(), [&](uint64_t task, size_t thread) {
     HWY_ASSERT(task == thread);  // each worker has one task

     char buf[16];  // Linux limitation
-    const int bytes_written = snprintf(
-        buf, sizeof(buf), "P%zu X%02zu C%03d",
-        topology.SkippedPackages() + pkg_idx,
-        topology.SkippedClusters() + cluster_idx, static_cast<int>(task));
+    const int bytes_written = snprintf(buf, sizeof(buf), "P%zu X%02zu C%03d",
+                                       topology.SkippedPackages() + pkg_idx,
+                                       topology.SkippedClusters() + cluster_idx,
+                                       static_cast<int>(task));
     HWY_ASSERT(bytes_written < static_cast<int>(sizeof(buf)));
     hwy::SetThreadName(buf, 0);  // does not support varargs

-    if (HWY_LIKELY(want_pin_)) {
+    if (HWY_LIKELY(pinning.Want())) {
       if (HWY_UNLIKELY(!hwy::PinThreadToLogicalProcessor(lps[task]))) {
         // Apple does not support pinning, hence do not warn there.
         if (!HWY_OS_APPLE) {

@@ -84,42 +79,20 @@ class Pinning {
                 static_cast<int>(task), pool.NumWorkers(), lps[task],
                 lps.size());
         }
-        (void)any_error_.test_and_set();
+        pinning.NotifyFailed();
       }
     }
   });
-  }
-
-  // Called ONCE after all MaybePin because it invalidates the error status.
-  bool AllPinned(const char** pin_string) {
-    // If !want_pin_, MaybePin will return without setting any_error_, but in
-    // that case we still want to return false to avoid spinning.
-    // .test() was only added in C++20, so we use .test_and_set() instead.
-    const bool all_pinned = want_pin_ && !any_error_.test_and_set();
-    *pin_string = all_pinned ? "pinned"
-                  : want_pin_ ? "pinning failed"
-                              : "pinning skipped";
-    return all_pinned;
-  }
-
- private:
-  std::atomic_flag any_error_ = ATOMIC_FLAG_INIT;
-  bool want_pin_;  // set in SetPolicy
-};  // Pinning
-
-// Singleton saves global affinity across all BoundedTopology instances because
-// pinning overwrites it.
-static Pinning& GetPinning() {
-  static Pinning pinning;
-  return pinning;
-}
 }

 static PoolPtr MakePool(const Allocator& allocator, size_t num_workers,
+                        hwy::PoolWorkerMapping mapping,
                         std::optional<size_t> node = std::nullopt) {
   // `ThreadPool` expects the number of threads to create, which is one less
   // than the number of workers, but avoid underflow if zero.
   const size_t num_threads = num_workers == 0 ? 0 : num_workers - 1;
-  PoolPtr ptr = allocator.AllocClasses<hwy::ThreadPool>(1, num_threads);
+  PoolPtr ptr =
+      allocator.AllocClasses<hwy::ThreadPool>(1, num_threads, mapping);
   const size_t bytes =
       hwy::RoundUpTo(sizeof(hwy::ThreadPool), allocator.QuantumBytes());
   if (node.has_value() && allocator.ShouldBind()) {

@@ -142,10 +115,11 @@ static size_t DivideMaxAcross(const size_t max, const size_t instances) {

 NestedPools::NestedPools(const BoundedTopology& topology,
                          const Allocator& allocator, size_t max_threads,
-                         Tristate pin) {
-  GetPinning().SetPolicy(pin);
+                         Tristate pin)
+    : pinning_(pin) {
   packages_.resize(topology.NumPackages());
-  all_packages_ = MakePool(allocator, packages_.size());
+  all_packages_ =
+      MakePool(allocator, packages_.size(), hwy::PoolWorkerMapping());
   const size_t max_workers_per_package =
       DivideMaxAcross(max_threads, packages_.size());
   // Each worker in all_packages_, including the main thread, will be the

@@ -153,11 +127,11 @@ NestedPools::NestedPools(const BoundedTopology& topology,
   // `cluster.lps` if `pin`.
   all_packages_->Run(0, packages_.size(), [&](uint64_t pkg_idx, size_t thread) {
     HWY_ASSERT(pkg_idx == thread);  // each thread has one task
-    packages_[pkg_idx] =
-        Package(topology, allocator, pkg_idx, max_workers_per_package);
+    packages_[pkg_idx] = Package(topology, allocator, pinning_, pkg_idx,
+                                 max_workers_per_package);
   });

-  all_pinned_ = GetPinning().AllPinned(&pin_string_);
+  all_pinned_ = pinning_.AllPinned(&pin_string_);

   // For mapping package/cluster/thread to noncontiguous TLS indices, in case
   // cluster/thread counts differ.

@@ -172,8 +146,6 @@ NestedPools::NestedPools(const BoundedTopology& topology,
   HWY_ASSERT(max_clusters_per_package_ <= 64);
   HWY_ASSERT(max_workers_per_cluster_ >= 1);
   HWY_ASSERT(max_workers_per_cluster_ <= 256);
-
-  hwy::Profiler::Get().SetMaxThreads(MaxWorkers());
 }

 // `max_or_zero` == 0 means no limit.

@@ -182,15 +154,22 @@ static inline size_t CapIfNonZero(size_t num, size_t max_or_zero) {
 }

 NestedPools::Package::Package(const BoundedTopology& topology,
-                              const Allocator& allocator, size_t pkg_idx,
+                              const Allocator& allocator,
+                              PinningPolicy& pinning, size_t pkg_idx,
                               size_t max_workers_per_package) {
   // Pre-allocate because elements are set concurrently.
   clusters_.resize(topology.NumClusters(pkg_idx));
   const size_t max_workers_per_cluster =
       DivideMaxAcross(max_workers_per_package, clusters_.size());

-  all_clusters_ = MakePool(allocator, clusters_.size(),
-                           topology.GetCluster(pkg_idx, 0).Node());
+  const BoundedTopology::Cluster& cluster0 = topology.GetCluster(pkg_idx, 0);
+  // Core 0 of each cluster. The second argument is the cluster size, not
+  // number of clusters. We ensure that it is the same for all clusters so that
+  // the `GlobalIdx` computation is consistent within and across clusters.
+  const hwy::PoolWorkerMapping all_clusters_mapping(hwy::kAllClusters,
+                                                    cluster0.Size());
+  all_clusters_ = MakePool(allocator, clusters_.size(), all_clusters_mapping,
+                           cluster0.Node());
   // Parallel so we also pin the calling worker in `all_clusters` to
   // `cluster.lps`.
   all_clusters_->Run(

@@ -198,11 +177,13 @@ NestedPools::Package::Package(const BoundedTopology& topology,
       HWY_ASSERT(cluster_idx == thread);  // each thread has one task
       const BoundedTopology::Cluster& cluster =
           topology.GetCluster(pkg_idx, cluster_idx);
+      HWY_ASSERT(cluster.Size() == cluster0.Size());
       clusters_[cluster_idx] = MakePool(
           allocator, CapIfNonZero(cluster.Size(), max_workers_per_cluster),
+          hwy::PoolWorkerMapping(cluster_idx, cluster.Size()),
           cluster.Node());
       // Pin workers AND the calling thread from `all_clusters`.
-      GetPinning().MaybePin(topology, pkg_idx, cluster_idx, cluster,
-                            *clusters_[cluster_idx]);
+      MaybePin(topology, pkg_idx, cluster_idx, cluster, pinning,
+               *clusters_[cluster_idx]);
     });
 }

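The consistency comment above implies a specific computation: with equal-size clusters, a worker's global index follows from its cluster index and in-cluster index. A minimal sketch of that arithmetic (inferred from the comment, not taken from Highway's implementation):

```cpp
#include <cstddef>

// Assumed layout: PoolWorkerMapping(cluster_idx, cluster_size) gives local
// worker `local_idx` the global index cluster_idx * cluster_size + local_idx.
constexpr size_t GlobalIdx(size_t cluster_idx, size_t cluster_size,
                           size_t local_idx) {
  return cluster_idx * cluster_size + local_idx;
}

// Worker 3 of cluster 2, with 8-wide clusters, occupies global slot 19; the
// HWY_ASSERT(cluster.Size() == cluster0.Size()) above keeps this well-defined.
static_assert(GlobalIdx(2, 8, 3) == 19, "consistent within and across clusters");
```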
@@ -19,6 +19,7 @@
 #include <stddef.h>
 #include <stdint.h>

+#include <atomic>
 #include <vector>

 // IWYU pragma: begin_exports

@@ -40,6 +41,30 @@ namespace gcpp {
 // moving because it is a typedef to `std::unique_ptr`.
 using PoolPtr = AlignedClassPtr<hwy::ThreadPool>;

+class PinningPolicy {
+ public:
+  explicit PinningPolicy(Tristate pin);
+
+  bool Want() const { return want_pin_; }
+  void NotifyFailed() { (void)any_error_.test_and_set(); }
+
+  // Called ONCE after all MaybePin because it invalidates the error status.
+  bool AllPinned(const char** pin_string) {
+    // If !want_pin_, MaybePin will return without setting any_error_, but in
+    // that case we still want to return false to avoid spinning.
+    // .test() was only added in C++20, so we use .test_and_set() instead.
+    const bool all_pinned = want_pin_ && !any_error_.test_and_set();
+    *pin_string = all_pinned ? "pinned"
+                  : want_pin_ ? "pinning failed"
+                              : "pinning skipped";
+    return all_pinned;
+  }
+
+ private:
+  std::atomic_flag any_error_ = ATOMIC_FLAG_INIT;
+  bool want_pin_;  // set in SetPolicy
+};  // PinningPolicy
+
 // Creates a hierarchy of thread pools according to `BoundedTopology`: one with
 // a thread per enabled package; for each of those, one with a thread per
 // enabled cluster (CCX/shared L3), and for each of those, the remaining

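A usage sketch of the new class, stitched together from the call sites elsewhere in this diff (the loop and `lps` are simplified stand-ins for MaybePin's internals):

```cpp
PinningPolicy pinning(Tristate::kDefault);  // kDefault: pin unless in a container
if (pinning.Want()) {
  for (size_t lp : lps) {  // hypothetical list of logical processors
    if (!hwy::PinThreadToLogicalProcessor(lp)) pinning.NotifyFailed();
  }
}
const char* pin_string;
// Call exactly once: a second call would observe the flag set by the first.
const bool all_pinned = pinning.AllPinned(&pin_string);
```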
@@ -56,7 +81,12 @@ using PoolPtr = AlignedClassPtr<hwy::ThreadPool>;
 // Useful when there are tasks which should be parallelized by workers sharing a
 // cache, or on the same NUMA node. In both cases, individual pools have lower
 // barrier synchronization latency than one large pool. However, to utilize all
-// cores, call sites will have to use nested parallel-for loops.
+// cores, call sites will have to use nested parallel-for loops as in
+// `HierarchicalParallelFor`. To allow switching modes easily, prefer using the
+// `ParallelFor` abstraction in threading_context.h).
+//
+// Note that this was previously intended to use all cores, but we are now
+// moving toward also allowing concurrent construction with subsets of cores.
 class NestedPools {
  public:
   // Neither move nor copy.

@@ -151,7 +181,8 @@ class NestedPools {
   public:
    Package() = default;  // for vector
    Package(const BoundedTopology& topology, const Allocator& allocator,
-           size_t pkg_idx, size_t max_workers_per_package);
+           PinningPolicy& pinning, size_t pkg_idx,
+           size_t max_workers_per_package);

    size_t NumClusters() const { return clusters_.size(); }
    size_t MaxWorkersPerCluster() const {

@@ -184,8 +215,10 @@ class NestedPools {
     }

    private:
-    std::vector<PoolPtr> clusters_;
+    // Must be freed after `clusters_` because it reserves threads which are
+    // the main threads of `clusters_`.
     PoolPtr all_clusters_;
+    std::vector<PoolPtr> clusters_;
   };  // Package

   void SetWaitMode(hwy::PoolWaitMode wait_mode) {

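The swap matters because C++ destroys non-static data members in reverse declaration order: declaring `all_clusters_` before `clusters_` guarantees the per-cluster pools are torn down while the `all_clusters_` pool, whose workers serve as their main threads, still exists. A self-contained demonstration with toy types (not the real pool classes):

```cpp
#include <cstdio>

struct Member {
  explicit Member(const char* name) : name_(name) {}
  ~Member() { std::printf("destroying %s\n", name_); }
  const char* name_;
};

struct Package {
  Member all_clusters_{"all_clusters_"};  // declared first => destroyed last
  Member clusters_{"clusters_"};          // destroyed first
};

int main() {
  Package p;
  // On scope exit, prints:
  //   destroying clusters_
  //   destroying all_clusters_
  return 0;
}
```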
@@ -195,6 +228,7 @@ class NestedPools {
     }
   }

+  PinningPolicy pinning_;
   bool all_pinned_;
   const char* pin_string_;

@@ -21,6 +21,7 @@
 #include <vector>

 #include "hwy/aligned_allocator.h"
+#include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/profiler.h"
 #include "hwy/tests/test_util.h"  // RandomState

@@ -28,7 +29,11 @@ namespace gcpp {

 // Invokes `pool.Run` with varying task counts until auto-tuning completes, or
 // an upper bound just in case.
-static void TunePool(hwy::ThreadPool& pool) {
+static void TunePool(hwy::PoolWaitMode wait_mode, hwy::ThreadPool& pool) {
+  pool.SetWaitMode(wait_mode);
+
+  // TODO(janwas): re-enable after investigating potential deadlock.
+#if 0
   const size_t num_workers = pool.NumWorkers();
   // pool.Run would just be a serial loop without auto-tuning, so skip.
   if (num_workers == 1) return;

@@ -69,6 +74,22 @@ static void TunePool(hwy::ThreadPool& pool) {
     HWY_ASSERT(total == prev_total + expected);
     prev_total += expected;
   }
+#endif
+}
+
+static void TunePools(hwy::PoolWaitMode wait_mode, NestedPools& pools) {
+  TunePool(wait_mode, pools.AllPackages());
+  for (size_t pkg_idx = 0; pkg_idx < pools.NumPackages(); ++pkg_idx) {
+    hwy::ThreadPool& clusters = pools.AllClusters(pkg_idx);
+    TunePool(wait_mode, clusters);
+
+    // Run in parallel because Turin CPUs have 16, and in real usage, we often
+    // run all at the same time.
+    clusters.Run(0, clusters.NumWorkers(),
+                 [&](uint64_t cluster_idx, size_t /*thread*/) {
+                   TunePool(wait_mode, pools.Cluster(pkg_idx, cluster_idx));
+                 });
+  }
 }

 ThreadingContext::ThreadingContext(const ThreadingArgs& args)

@@ -80,18 +101,9 @@ ThreadingContext::ThreadingContext(const ThreadingArgs& args)
       allocator(topology, cache_info, args.bind != Tristate::kFalse),
       pools(topology, allocator, args.max_threads, args.pin) {
   PROFILER_ZONE("Startup.ThreadingContext autotune");
-  TunePool(pools.AllPackages());
-  for (size_t pkg_idx = 0; pkg_idx < pools.NumPackages(); ++pkg_idx) {
-    hwy::ThreadPool& clusters = pools.AllClusters(pkg_idx);
-    TunePool(clusters);
-
-    // Run in parallel because Turin CPUs have 16, and in real usage, we often
-    // run all at the same time.
-    clusters.Run(0, clusters.NumWorkers(),
-                 [&](uint64_t cluster_idx, size_t /*thread*/) {
-                   TunePool(pools.Cluster(pkg_idx, cluster_idx));
-                 });
-  }
+  TunePools(hwy::PoolWaitMode::kSpin, pools);
+  // kBlock is the default, hence set/tune it last.
+  TunePools(hwy::PoolWaitMode::kBlock, pools);
 }

 }  // namespace gcpp

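From the caller's perspective, both wait modes are now fully auto-tuned before the constructor returns (a sketch of the caller's view; the ThreadingArgs fields come from this repo's flag parsing):

```cpp
ThreadingArgs args;          // pin, bind, max_threads etc. as configured
ThreadingContext ctx(args);  // ctor tunes kSpin pools, then kBlock (the default)
// The matmul autotune that runs later therefore no longer overlaps with pool
// tuning, which the commit message identifies as the source of incorrect
// outcomes.
```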
@@ -41,7 +41,7 @@ class ThreadingArgs : public ArgsBase<ThreadingArgs> {

   // For BoundedTopology:
   size_t skip_packages;
-  size_t max_packages = 1;
+  size_t max_packages = 1;  // some users assign 1 to this, hence non-const.
   size_t skip_clusters;
   size_t max_clusters;
   size_t skip_lps;