gemma.cpp/util/allocator.h

// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_GEMMA_CPP_UTIL_ALLOCATOR_H_
#define THIRD_PARTY_GEMMA_CPP_UTIL_ALLOCATOR_H_

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>  // fprintf

#include <cstdlib>  // std::aligned_alloc / _aligned_malloc

// IWYU pragma: begin_exports
#include "util/threading.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
// IWYU pragma: end_exports
#ifndef GEMMA_NUMA
// The check below requires two #if, hence start with 0 and redefine to 1.
#define GEMMA_NUMA 0
// To avoid a dependency on libnuma, use syscalls directly. We require six
// arguments, which has been supported by glibc since around 2010.
#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
#if HWY_OS_LINUX && __GLIBC_PREREQ(2, 11)
#undef GEMMA_NUMA
#define GEMMA_NUMA 1
#endif
#endif
#endif // GEMMA_NUMA
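
// Note: because of the #ifndef above, GEMMA_NUMA may also be predefined by the
// build, e.g. via -DGEMMA_NUMA=0 to force-disable the NUMA code paths
// (illustrative flag usage; any prior definition takes precedence).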

namespace gcpp {

using ByteStorageT = hwy::AlignedFreeUniquePtr<uint8_t[]>;

template <typename T>
ByteStorageT AllocateSizeof() {
  return hwy::AllocateAligned<uint8_t>(sizeof(T));
}
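
// Illustrative sketch, not part of the API: `AllocateSizeof` reserves aligned,
// type-erased bytes for a struct; callers then construct the object into the
// returned storage. `ExampleWeights` is a hypothetical payload type.
inline ByteStorageT ExampleAllocateWeights() {
  struct ExampleWeights {  // hypothetical stand-in for a weights/config struct
    float layer0[256];
  };
  return AllocateSizeof<ExampleWeights>();
}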

// Owns dynamically-allocated aligned memory for a batch of row vectors.
// This can be seen as a (batch_size x len) matrix.
template <typename T>
class RowVectorBatch {
 public:
  // Default ctor for Activations ctor.
  RowVectorBatch() : batch_size_(0), len_(0) {}
  // Main ctor, called from Activations::Allocate.
  RowVectorBatch(size_t batch_size, size_t len)
      : batch_size_(batch_size), len_(len) {
    mem_ = hwy::AllocateAligned<T>(batch_size * len);
  }

  // Move-only
  RowVectorBatch(RowVectorBatch&) noexcept = delete;
  RowVectorBatch& operator=(RowVectorBatch&) noexcept = delete;
  RowVectorBatch(RowVectorBatch&&) noexcept = default;
  RowVectorBatch& operator=(RowVectorBatch&&) noexcept = default;

  size_t BatchSize() const { return batch_size_; }
  size_t Len() const { return len_; }

  // Returns the given row vector of length `Len()`.
  T* Batch(size_t batch_idx) {
    HWY_DASSERT(batch_idx < batch_size_);
    return mem_.get() + batch_idx * len_;
  }
  const T* Batch(size_t batch_idx) const {
    HWY_DASSERT(batch_idx < batch_size_);
    return mem_.get() + batch_idx * len_;
  }

  // For MatMul or other operations that process the entire batch at once.
  T* All() { return mem_.get(); }
  const T* Const() const { return mem_.get(); }
  size_t NumBytes() const { return batch_size_ * len_ * sizeof(T); }

 private:
  hwy::AlignedFreeUniquePtr<T[]> mem_;
  size_t batch_size_;  // rows in the matrix
  size_t len_;         // columns in the matrix = vector length
};
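
// Illustrative sketch, not part of the API: allocates a 4 x 256 batch of float
// row vectors (sizes are arbitrary) and zero-initializes each row.
inline RowVectorBatch<float> ExampleActivations() {
  RowVectorBatch<float> activations(/*batch_size=*/4, /*len=*/256);
  for (size_t i = 0; i < activations.BatchSize(); ++i) {
    float* HWY_RESTRICT row = activations.Batch(i);  // row vector of Len()
    for (size_t j = 0; j < activations.Len(); ++j) row[j] = 0.0f;
  }
  return activations;  // move-only type; returned via move
}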

// Stateful in order to know whether to bind to NUMA nodes. `Monostate` for
// convenience - avoids passing around a reference.
class Allocator {
 public:
  static void Init(const BoundedTopology& topology) {
    bytes_per_page_ = DetectPageSize();
    HWY_ASSERT(bytes_per_page_ <= (4 << 20));

    // NUMA only makes sense if:
    // - the page size is known and 'reasonably small', preferably less than
    //   a fraction of MatMul row/col sizes, which for 27B are up to 144 KiB;
    // - we successfully detected topology and there are multiple nodes;
    // - there are multiple packages, because we shard by package_idx.
    use_numa_ = (bytes_per_page_ != 0 && bytes_per_page_ <= 16 * 1024) &&
                topology.NumNodes() > 1 && topology.NumPackages() > 1;

    // TODO: remove once tensors are page-aligned.
    use_numa_ = false;
    fprintf(stderr, "Warning: disabling use_numa_\n");

    alignment_ = use_numa_ ? bytes_per_page_ : HWY_ALIGNMENT;
  }

  static bool UseNUMA() { return use_numa_; }

  // BindTensor requires row pointers and lengths be a multiple of this.
  static size_t Alignment() { return alignment_; }

  template <typename T>
  static hwy::AlignedFreeUniquePtr<T[]> Alloc(size_t num) {
    // For non-NUMA, use the Highway allocator because it defends against 2k
    // aliasing.
    if (!use_numa_) return hwy::AllocateAligned<T>(num);

    constexpr size_t kSize = sizeof(T);
    // Ensure the `bytes = num * kSize` computation did not overflow.
    constexpr bool kIsPow2 = (kSize & (kSize - 1)) == 0;
    constexpr size_t kBits = hwy::detail::ShiftCount(kSize);
    static_assert(!kIsPow2 || (1ull << kBits) == kSize, "ShiftCount has a bug");
    const size_t bytes = kIsPow2 ? num << kBits : num * kSize;
    const size_t check = kIsPow2 ? bytes >> kBits : bytes / kSize;
    if (check != num) {
      return hwy::AlignedFreeUniquePtr<T[]>();  // overflowed
    }

    // AlignedFreeUniquePtr has a deleter that can call an arbitrary `free`, but
    // with an extra opaque pointer, which we discard via `call_free`.
#if defined(__ANDROID_API__) && __ANDROID_API__ < 28
    const auto call_free = [](void* ptr, void*) { std::free(ptr); };
    void* mem = nullptr;
    int err = posix_memalign(&mem, Alignment(), bytes);
    HWY_ASSERT(err == 0);
    T* p = static_cast<T*>(mem);
#elif HWY_OS_WIN
    const auto call_free = [](void* ptr, void*) { _aligned_free(ptr); };
    T* p = static_cast<T*>(_aligned_malloc(bytes, Alignment()));
#else
    const auto call_free = [](void* ptr, void*) { std::free(ptr); };
    T* p = static_cast<T*>(std::aligned_alloc(Alignment(), bytes));
#endif
    return hwy::AlignedFreeUniquePtr<T[]>(
        p, hwy::AlignedFreer(call_free, nullptr));
  }

 private:
  static size_t DetectPageSize();

  // Required for BindMemory. Usually 4K, but can differ on Arm.
  static size_t bytes_per_page_;
  static bool use_numa_;
  static size_t alignment_;
};
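
// Illustrative sketch, not part of the API: one-time initialization from a
// caller-provided topology, followed by an aligned allocation. The element
// type and count are arbitrary examples; the result is empty on overflow.
inline hwy::AlignedFreeUniquePtr<float[]> ExampleInitAndAlloc(
    const BoundedTopology& topology, size_t num) {
  Allocator::Init(topology);  // typically called once at startup
  return Allocator::Alloc<float>(num);
}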

// Used in MatMul and allocator.cc. Defined here because it depends on
// Allocator::Alignment().
template <class Func>
void StaticPartitionRowsAndCols(NestedPools& nested, size_t rows, size_t cols,
                                size_t bytes_per_element, const Func& func) {
  // Both rows and cols must be a multiple of the alignment to avoid
  // touching remote pages.
  const size_t multiple = Allocator::Alignment() / bytes_per_element;

  // Static partitioning of columns across packages. We assume that column
  // sharding is more expensive, hence we distribute columns across packages,
  // of which there are usually only one or two. For MatMul, the final result
  // is the sum of each package's partial dot products.
  hwy::ThreadPool& all_packages = nested.AllPackages();
  const size_t num_packages = all_packages.NumWorkers();
  const size_t cols_per_package =
      hwy::RoundUpTo(hwy::DivCeil(cols, num_packages), multiple);
  const size_t col_tasks = hwy::DivCeil(cols, cols_per_package);
  HWY_ASSERT(col_tasks <= num_packages);
  all_packages.Run(
      0, col_tasks, [&](uint64_t package_idx, size_t package_thread) {
        HWY_ASSERT(package_idx == package_thread);  // one task per worker
        const size_t col_begin = package_idx * cols_per_package;
        const size_t col_end = HWY_MIN(col_begin + cols_per_package, cols);

        // Static partitioning of rows across the package's clusters. We
        // assume that row sharding is cheaper. In MatMul, results can indeed
        // be computed independently for each row of B.
        hwy::ThreadPool& all_clusters = nested.AllClusters(package_idx);
        const size_t num_clusters = all_clusters.NumWorkers();
        const size_t rows_per_cluster =
            hwy::RoundUpTo(hwy::DivCeil(rows, num_clusters), multiple);
        const size_t row_tasks = hwy::DivCeil(rows, rows_per_cluster);
        HWY_ASSERT(row_tasks <= num_clusters);
        all_clusters.Run(
            0, row_tasks, [&](uint64_t cluster_idx, size_t cluster_thread) {
              HWY_ASSERT(cluster_idx == cluster_thread);  // one task per worker
              // For binding to NUMA node.
              const size_t node = nested.Node(package_idx, cluster_idx);
              // Older CPUs that predate chiplets typically have only one
              // cluster, so callers should also parallelize using this
              // per-cluster pool.
              hwy::ThreadPool& cluster =
                  nested.Cluster(package_idx, cluster_idx);
              // This plus the worker from `cluster->Run` is the TLS index.
              const size_t worker_offset =
                  nested.WorkerOffset(package_idx, cluster_idx);
              const size_t row_begin = cluster_idx * rows_per_cluster;
              const size_t row_end =
                  HWY_MIN(row_begin + rows_per_cluster, rows);
              func(node, cluster, worker_offset, row_begin, row_end, col_begin,
                   col_end);
            });
      });
}
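
// Illustrative sketch, not part of the API: partitions a (rows x cols) fp32
// tensor and receives one callback per (package, cluster) tile. Assumes
// Allocator::Init has already been called so Alignment() is nonzero; the
// `Example*` name is hypothetical.
inline void ExamplePartition(NestedPools& nested, size_t rows, size_t cols) {
  StaticPartitionRowsAndCols(
      nested, rows, cols, /*bytes_per_element=*/sizeof(float),
      [&](size_t node, hwy::ThreadPool& cluster, size_t worker_offset,
          size_t row_begin, size_t row_end, size_t col_begin, size_t col_end) {
        (void)node;           // NUMA node, e.g. for binding this tile's memory
        (void)worker_offset;  // base index for per-worker (TLS) state
        // Parallelize over the tile's rows using the per-cluster pool.
        cluster.Run(row_begin, row_end, [&](uint64_t row, size_t thread) {
          (void)row;
          (void)thread;
          (void)col_begin;  // a real caller would process columns
          (void)col_end;    // [col_begin, col_end) of `row` here
        });
      });
}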

void BindTensor(NestedPools& nested, size_t rows, size_t cols,
                size_t bytes_per_col, void* ptr);

}  // namespace gcpp

#endif  // THIRD_PARTY_GEMMA_CPP_UTIL_ALLOCATOR_H_