gemma.cpp/util/topology.h

// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef THIRD_PARTY_GEMMA_CPP_UTIL_TOPOLOGY_H_
#define THIRD_PARTY_GEMMA_CPP_UTIL_TOPOLOGY_H_

#include <stddef.h>
#include <stdint.h>

#include <vector>

// IWYU pragma: begin_exports
#include "hwy/base.h"  // HWY_ASSERT
#include "hwy/contrib/thread_pool/topology.h"
// IWYU pragma: end_exports

#ifndef GEMMA_DISABLE_TOPOLOGY
#define GEMMA_DISABLE_TOPOLOGY 0
#endif  // !GEMMA_DISABLE_TOPOLOGY

namespace gcpp {

// A slice of a 1D integer range such as the indices of packages or clusters.
// This allows assigning them to multiple instances of our binary.
class BoundedSlice {
 public:
  // Defaults to "use all detected".
  BoundedSlice(size_t skip = 0, size_t max = 0) : skip_(skip), max_(max) {}

  size_t Begin() const { return skip_; }
  size_t Max() const { return max_; }

  // STL-style one past the end.
  size_t End(size_t detected) const {
    return (max_ == 0) ? detected : HWY_MIN(detected, skip_ + max_);
  }

  // Number of elements in the slice.
  size_t Num(size_t detected) const { return End(detected) - Begin(); }

  bool Contains(size_t detected, size_t idx) const {
    return Begin() <= idx && idx < End(detected);
  }

  template <class Func>
  void Foreach(const char* name, size_t detected, const Func& func) {
    if (Begin() >= detected) {
      HWY_ABORT("Invalid skip=%zu for %s, detected=%zu", skip_, name, detected);
    }
    for (size_t i = Begin(); i < End(detected); ++i) {
      func(i);
    }
  }

 private:
  // How many to skip, or equivalently, index of the first to use. It is an
  // error if this is >= `detected`, because that would leave none for this
  // instance to use.
  size_t skip_;

  // Upper bound on the number to use, or zero if no limit.
  size_t max_;
};

// "LP" is a logical processor, a 0-based index passed to the OS.
using LPS = hwy::LogicalProcessorSet;

// Wraps hwy::Topology and only keeps the subset of packages and clusters
// apportioned by BoundedSlice, further limited by the OS affinity mask.
// NOTE: if topology is unknown or the OS affinity is too restrictive, we fall
// back to a single package and cluster.
class BoundedTopology {
 public:
  // `package_slice` must have `Max() == 1`. Others default to "use all".
  BoundedTopology(BoundedSlice package_slice,
                  BoundedSlice cluster_slice = BoundedSlice(),
                  BoundedSlice lp_slice = BoundedSlice());

  size_t NumNodes() const { return nodes_.Count(); }
  const char* TopologyString() const { return topology_string_; }

  class Cluster {
   public:
    explicit Cluster(const LPS& lps);
    Cluster(const LPS& enabled_lps,
            const std::vector<hwy::Topology::LP>& all_lps,
            const hwy::Topology::Cluster& tcluster);

    size_t NumWorkers() const { return num_workers_; }

    // Returns vector with all enabled LPs, used for pinning.
    std::vector<size_t> LPVector() const {
      std::vector<size_t> lps;
      lps.reserve(lps_.Count());
      lps_.Foreach([&lps](size_t lp) { lps.push_back(lp); });
      return lps;
    }

    const LPS& LPSet() const { return lps_; }
    size_t Node() const { return node_; }
    size_t PrivateKiB() const { return private_kib_; }
    size_t SharedKiB() const { return shared_kib_; }

   private:
    // Enabled LPs; if topology is known, only the ones in this cluster.
    LPS lps_;
    // How many workers in the per-cluster pool. If 0, this Cluster is removed.
    size_t num_workers_ = 0;
    // NUMA node, set from hwy::Topology::LP::node.
    size_t node_ = 0;
    // L2 cache size in KiB, or 0 if unknown.
    size_t private_kib_ = 0;
    // L3 cache size in KiB, or 0 if unknown.
    size_t shared_kib_ = 0;
  };  // Cluster

  size_t NumClusters() const { return clusters_.size(); }
  const Cluster& GetCluster(size_t cluster_idx) const {
    HWY_ASSERT(cluster_idx < clusters_.size());
    return clusters_[cluster_idx];
  }

  // In case we are running with a subset of packages/clusters, these are added
  // to the package/cluster indices for purposes of the thread name, so that
  // they are distinct.
  size_t SkippedPackages() const { return package_slice_.Begin(); }
  size_t SkippedClusters() const { return cluster_slice_.Begin(); }

 private:
  void SplitLargeCluster(const LPS& enabled_lps,
                         hwy::Topology::Cluster tcluster);
  bool InitFromTopology(const LPS& enabled_lps);
  void InitFromLPs(const LPS& enabled_lps);

#if !GEMMA_DISABLE_TOPOLOGY
  hwy::Topology topology_;
#endif
  BoundedSlice package_slice_;  // Within the entire detected topology.
  BoundedSlice cluster_slice_;
  std::vector<Cluster> clusters_;
  char topology_string_[96];
  LPS nodes_;
};

}  // namespace gcpp

#endif  // THIRD_PARTY_GEMMA_CPP_UTIL_TOPOLOGY_H_