diff --git a/util/threading.cc b/util/threading.cc
index 1671187..26a45fd 100644
--- a/util/threading.cc
+++ b/util/threading.cc
@@ -45,35 +45,48 @@ class Pinning {
   return false;  }
 
  public:
-  // Returns set of LPs available for use. Subsequent calls return the same
-  // set as the first, because pinning overwrites the main thread's affinity.
+  // Returns set of LPs available for use. Cached once non-empty (later calls
+  // ignore `lp_slice`) because pinning overwrites the main thread's affinity.
   // Thread-hostile, not called concurrently.
-  LPS EnabledLPs() {
-    if (original_affinity_.Any()) return original_affinity_;
+  LPS EnabledLPs(const BoundedSlice& lp_slice) {
+    if (enabled_lps_.Any()) return enabled_lps_;
 
-    // Regardless of topology, ignore LPs disabled via OS, taskset, or numactl.
-    LPS enabled_lps;
-    if (HWY_UNLIKELY(!GetThreadAffinity(enabled_lps))) {
+    LPS affinity;
+    if (HWY_LIKELY(GetThreadAffinity(affinity))) {
+      // To honor taskset/numactl *and* the user's `lp_slice`, we interpret
+      // the latter as a slice of the 1-bits of `affinity`. Note that this
+      // can be used to exclude hyperthreads because Linux groups LPs by
+      // sibling index. For example, the first `num_cores` are not siblings.
+      const size_t detected = affinity.Count();
+      size_t enabled_idx = 0;
+      affinity.Foreach([&](size_t lp) {
+        if (lp_slice.Contains(detected, enabled_idx)) {
+          enabled_lps_.Set(lp);
+        }
+        ++enabled_idx;
+      });
+    } else {
       const size_t num_lps = hwy::TotalLogicalProcessors();
-      HWY_WARN("unknown OS affinity, considering all %zu LPs enabled.",
-               num_lps);
+      HWY_WARN("unknown OS affinity, max %zu LPs and slice %zu.", num_lps,
+               lp_slice.Num(num_lps));
       for (size_t lp = 0; lp < num_lps; ++lp) {
-        enabled_lps.Set(lp);
+        if (lp_slice.Contains(num_lps, lp)) {
+          enabled_lps_.Set(lp);
+        }
       }
     }
 
     // Without threading support, only keep the first enabled LP; it might still
     // make sense to pin the main thread to avoid migrations.
     if (HWY_UNLIKELY(!hwy::HaveThreadingSupport())) {
-      HWY_ASSERT(enabled_lps.Any());
-      const size_t lp = enabled_lps.First();
-      enabled_lps = LPS();
-      enabled_lps.Set(lp);
+      HWY_ASSERT(enabled_lps_.Any());
+      const size_t lp = enabled_lps_.First();
+      enabled_lps_ = LPS();
+      enabled_lps_.Set(lp);
       HWY_WARN("Warning, threads not supported, using only the main thread.");
     }
 
-    original_affinity_ = enabled_lps;
-    return enabled_lps;
+    return enabled_lps_;
   }
 
   void SetPolicy(Tristate pin) {
@@ -128,7 +141,7 @@ class Pinning {
  private:
   std::atomic_flag any_error_ = ATOMIC_FLAG_INIT;
   bool want_pin_;  // set in SetPolicy
-  LPS original_affinity_;
+  LPS enabled_lps_;
 };  // Pinning
 
 // Singleton saves global affinity across all BoundedTopology instances because
@@ -141,7 +154,7 @@ static Pinning& GetPinning() {
 BoundedTopology::BoundedTopology(BoundedSlice package_slice,
                                  BoundedSlice cluster_slice,
                                  BoundedSlice lp_slice) {
-  const LPS enabled_lps = GetPinning().EnabledLPs();
+  const LPS enabled_lps = GetPinning().EnabledLPs(lp_slice);
 
 #if !GEMMA_DISABLE_TOPOLOGY
   if (HWY_LIKELY(!topology_.packages.empty())) {
@@ -152,7 +165,7 @@ BoundedTopology::BoundedTopology(BoundedSlice package_slice,
   // Topology unknown or no packages with enabled LPs: create a single
   // package with one cluster, and one node.
   if (HWY_UNLIKELY(NumPackages() == 0)) {
-    InitFromSlice(enabled_lps, lp_slice);
+    InitFromLPs(enabled_lps);
   }
 
   HWY_ASSERT(NumPackages() != 0 && NumClusters(0) != 0 && NumNodes() != 0);
@@ -214,9 +227,9 @@ constexpr bool kSplitLargeClusters = false;
 constexpr size_t kMaxClusters = 8;
 constexpr size_t kMaxLPsPerCluster = 6;
 
-// Topology is unknown, rely on OS affinity and user-specified slice.
-BoundedTopology::Package::Package(const LPS& enabled_lps,
-                                  BoundedSlice lp_slice) {
+// Topology is unknown, use only the given LPs which derive from OS affinity
+// and `lp_slice`.
+BoundedTopology::Package::Package(const LPS& enabled_lps) {
   LPS clusters_lps[kMaxClusters];
   const size_t num_clusters =
       kSplitLargeClusters
@@ -224,16 +237,9 @@ BoundedTopology::Package::Package(const LPS& enabled_lps,
                     hwy::DivCeil(enabled_lps.Count(), kMaxLPsPerCluster))
           : 1;
 
-  // Interpret `lp_slice` as a slice of the 1-bits of `enabled_lps`, so
-  // we honor both the OS affinity and the user-specified slice. Note that
-  // this can be used to exclude hyperthreads because Linux groups LPs by
-  // sibling index. For example, the first `num_cores` are not siblings.
-  const size_t detected = enabled_lps.Count();
   size_t enabled_idx = 0;
   enabled_lps.Foreach([&](size_t lp) {
-    if (lp_slice.Contains(detected, enabled_idx)) {
-      clusters_lps[enabled_idx % num_clusters].Set(lp);
-    }
+    clusters_lps[enabled_idx % num_clusters].Set(lp);
     ++enabled_idx;
   });
 
@@ -386,9 +392,8 @@ void BoundedTopology::InitFromTopology(const LPS& enabled_lps,
 
 #endif  // !GEMMA_DISABLE_TOPOLOGY
 
-void BoundedTopology::InitFromSlice(const LPS& enabled_lps,
-                                    BoundedSlice lp_slice) {
-  packages_.push_back(Package(enabled_lps, lp_slice));
+void BoundedTopology::InitFromLPs(const LPS& enabled_lps) {
+  packages_.push_back(Package(enabled_lps));
 
   snprintf(topology_string_, sizeof(topology_string_), "LPs=%zu",
            GetCluster(0, 0).Size());
@@ -433,7 +438,7 @@ NestedPools::NestedPools(size_t max_threads, Tristate pin,
       0, all_packages_->NumWorkers(), [&](uint64_t pkg_idx, size_t thread) {
         HWY_ASSERT(pkg_idx == thread);  // each thread has one task
         packages_[pkg_idx] =
-            Package(topology_, pkg_idx, max_workers_per_package, lp_slice);
+            Package(topology_, pkg_idx, max_workers_per_package);
       });
 
   all_pinned_ = GetPinning().AllPinned(&pin_string_);
@@ -454,8 +459,7 @@ NestedPools::NestedPools(size_t max_threads, Tristate pin,
 }
 
 NestedPools::Package::Package(const BoundedTopology& topology, size_t pkg_idx,
-                              size_t max_workers_per_package,
-                              BoundedSlice lp_slice) {
+                              size_t max_workers_per_package) {
   // Pre-allocate because elements are set concurrently.
   clusters_.resize(topology.NumClusters(pkg_idx));
   const size_t max_workers_per_cluster =
diff --git a/util/threading.h b/util/threading.h
index c2db6ba..1b3ad41 100644
--- a/util/threading.h
+++ b/util/threading.h
@@ -165,7 +165,7 @@ class BoundedTopology {
 
  private:
   struct Package {
-    Package(const LPS& enabled_lps, BoundedSlice lp_slice);
+    explicit Package(const LPS& enabled_lps);
     Package(const LPS& enabled_lps, const hwy::Topology& topology,
             size_t pkg_idx, BoundedSlice cluster_slice);
 
@@ -177,7 +177,7 @@ class BoundedTopology {
 
   void InitFromTopology(const LPS& enabled_lps, BoundedSlice package_slice,
                         BoundedSlice cluster_slice);
-  void InitFromSlice(const LPS& enabled_lps, BoundedSlice lp_slice);
+  void InitFromLPs(const LPS& enabled_lps);
 
 #if !GEMMA_DISABLE_TOPOLOGY
   hwy::Topology topology_;
@@ -304,7 +304,7 @@ class NestedPools {
    public:
     Package() = default;  // for vector
     Package(const BoundedTopology& topology, size_t pkg_idx,
-            size_t max_workers_per_package, BoundedSlice lp_slice);
+            size_t max_workers_per_package);
 
     size_t NumClusters() const { return clusters_.size(); }
     size_t MaxWorkersPerCluster() const {