mirror of https://github.com/google/gemma.cpp.git
Cleanup: split CacheInfo from Allocator, MatMul helper functions
Lift DecompressA out of the main autotuner to prevent interference.
Also use kMaxMR / kNR constants instead of extra args.
Fix: only require vector alignment, not cache alignment.

PiperOrigin-RevId: 804333769
This commit is contained in:
parent 6e52a835c6
commit 06e5da1e22
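For orientation, a minimal sketch of what call sites look like after this split (a hypothetical helper, not code from this commit; assumes the gcpp namespace and the headers touched below): cache and vector geometry is now read from ctx.cache_info, while page size, total memory and NUMA binding remain on ctx.allocator.

    #include <cstdio>

    #include "util/threading_context.h"  // gcpp::ThreadingContext

    // Hypothetical helper: prints the values a caller reads from each class.
    void ShowSplit(const gcpp::ThreadingContext& ctx) {
      // Cache/vector geometry now comes from CacheInfo...
      std::printf("line=%zu vector=%zu step=%zu L1=%zu L2=%zu L3=%zu\n",
                  ctx.cache_info.LineBytes(), ctx.cache_info.VectorBytes(),
                  ctx.cache_info.StepBytes(), ctx.cache_info.L1Bytes(),
                  ctx.cache_info.L2Bytes(), ctx.cache_info.L3Bytes());
      // ...while page size, total memory and binding stay on Allocator.
      std::printf("base_page=%zu total_MiB=%zu\n",
                  ctx.allocator.BasePageBytes(), ctx.allocator.TotalMiB());
    }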
@@ -241,8 +241,8 @@ void ShowConfig(const LoaderArgs& loader, const ThreadingArgs& threading,
           dt, cpu100, static_cast<int>(threading.bind),
           ctx.topology.TopologyString(), ctx.pools.PinString(),
           CacheString().c_str(), hwy::TargetName(hwy::DispatchedTarget()),
-          ctx.allocator.VectorBytes() * 8, CompiledConfig(), PROFILER_ENABLED,
-          ctx.allocator.TotalMiB());
+          ctx.cache_info.VectorBytes() * 8, CompiledConfig(),
+          PROFILER_ENABLED, ctx.allocator.TotalMiB());
 }
 }

ops/matmul-inl.h (141 lines changed)
@@ -21,7 +21,7 @@

 #include "compression/types.h"
 #include "ops/matmul.h"  // IWYU pragma: export
-#include "util/allocator.h"
+#include "util/allocator.h"  // CacheInfo
 #include "util/basics.h"
 #include "util/mat.h"
 #include "util/threading_context.h"
@@ -566,7 +566,7 @@ class MMKernel {
 };

 // Miscellaneous stateless helper functions.
-struct MMImpl {
+class MMImpl {
   // Returns existing entry for the given key or -1.
   static HWY_INLINE intptr_t IndexOfKey(MMKeys::Key key, const MMKeys& keys) {
     const hwy::Span<const uint64_t> all_keys = keys.Keys();
@@ -596,6 +596,63 @@ struct MMImpl {
     return -1;
   }

+ public:
+  static MMPerKey& FindOrAddPerKey(size_t M, size_t K, size_t N,
+                                   size_t vector_bytes,
+                                   MatMulEnv::PerCluster& per_cluster) {
+    const MMKeys::Key key = MMKeys::KeyFromDims(M, K, N);
+    intptr_t index = MMImpl::IndexOfKey(key, per_cluster.keys);
+    // First time we see this shape/key.
+    if (HWY_UNLIKELY(index < 0)) {
+      per_cluster.keys.Append(key, vector_bytes);
+
+      // Invalidates `MMAutoTune::Best()`.
+      std::vector<MMPerKey>& per_keys = per_cluster.per_key;
+      index = per_keys.size();
+      per_keys.push_back(MMPerKey());
+    }
+    return per_cluster.per_key[index];
+  }
+
+  static void NotifyAutotuneResult(size_t M, size_t K, size_t N, double t0,
+                                   const MMConfig& cfg, MatMulEnv& env,
+                                   MMAutoTune<MMConfig>& tuner) {
+    const uint64_t t1 =
+        env.have_timer_stop ? hwy::timer::Stop() : hwy::timer::Start();
+    const double min_elapsed = static_cast<double>(tuner.NotifyTicks(t1 - t0)) /
+                               hwy::platform::InvariantTicksPerSecond();
+    const double flops = 2 * M * K * N / min_elapsed;  // * 2 for FMA
+    if (HWY_UNLIKELY(env.print_measurement && tuner.ShouldPrint())) {
+      fprintf(stderr, "%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu\n", flops * 1E-9,
+              min_elapsed * 1E3, cfg.MR(), cfg.MC(), cfg.KC(), cfg.NC(),
+              StringFromOrder(cfg.Order()), cfg.InnerTasks());
+    }
+    if (HWY_UNLIKELY(env.print_best && tuner.Best())) {
+      const auto ratio = [&tuner](uint64_t ticks) -> double {
+        return static_cast<double>(ticks) /
+               static_cast<double>(tuner.BestTicks());
+      };
+      const MMConfig& best = *tuner.Best();
+      fprintf(stderr,
+              "\n%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu,%.2f,%.2f\n",
+              M, K, N, flops * 1E-9, min_elapsed * 1E3, best.MR(), best.MC(),
+              best.KC(), best.NC(), StringFromOrder(best.Order()),
+              best.InnerTasks(), ratio(tuner.WorstMinTicks()),
+              ratio(tuner.FirstConfigTicks()));
+    }
+  }
+
+  static void EnsureAligned(const MatPtr& A, const size_t vector_bytes) {
+    // Ensure A rows are vector-aligned. Neither `Stride` nor `IsPacked` are
+    // reliable: the latter returns true for single rows, and the former may
+    // match `Cols` if the width matches the padding.
+    // Note that B is packed in matmul_test, but otherwise generally padded.
+    HWY_ASSERT(hwy::IsAligned(A.RowBytes(0), vector_bytes));
+    if (A.Rows() > 1) {
+      HWY_ASSERT(hwy::IsAligned(A.RowBytes(1), vector_bytes));
+    }
+  }
+
   static size_t Worker(const MMArgs& args) {
     return args.options.cluster_idx *
            args.env->ctx.pools.MaxWorkersPerCluster();
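NotifyAutotuneResult above folds the timing and reporting that previously lived inline in MatMul. A standalone sketch of just the GFLOP/s arithmetic it uses (assumes Highway's hwy/timer.h; the function name is illustrative):

    #include <cstddef>
    #include <cstdint>

    #include "hwy/timer.h"  // hwy::platform::InvariantTicksPerSecond

    // Same formula as above: seconds from invariant ticks, then 2*M*K*N ops
    // (2 because an FMA counts as multiply + add), scaled to GFLOP/s.
    double GFlops(size_t M, size_t K, size_t N, uint64_t elapsed_ticks) {
      const double seconds = static_cast<double>(elapsed_ticks) /
                             hwy::platform::InvariantTicksPerSecond();
      return 2.0 * static_cast<double>(M) * K * N / seconds * 1E-9;
    }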
@@ -753,14 +810,14 @@ struct MMImpl {
 // loops over the inner KC and MC. Member variables avoid long argument lists.
 class MMState {
  public:
-  MMState(const Extents2D A, const size_t B_rows, const MMArgs& args,
+  MMState(size_t M, size_t K, size_t N, const MMArgs& args,
           const MMConfig& config)
       : args_(args),
-        range_n_(0, B_rows),
+        range_n_(0, N),
         mr_(config.MR()),
-        ranges_mc_(config.RangesOfMC(A.rows)),
-        ranges_kc_(config.RangesOfKC(A.cols)),
-        ranges_nc_(config.RangesOfNC(B_rows)),
+        ranges_mc_(config.RangesOfMC(M)),
+        ranges_kc_(config.RangesOfKC(K)),
+        ranges_nc_(config.RangesOfNC(N)),
         order_(config.Order()),
         inner_tasks_(config.InnerTasks()) {}

@@ -783,7 +840,7 @@ class MMState {
   // Compute size of per-worker storage for `kNR` row ranges of B. Stack
   // allocation avoids passing a worker index.
   static constexpr size_t B_stride_max_ =
-      kMaxKC + 2 * Allocator::MaxLineBytes() / sizeof(BF16);
+      kMaxKC + 2 * CacheInfo::MaxLineBytes() / sizeof(BF16);
   static constexpr size_t B_storage_max_ = kNR * B_stride_max_;

   // Granularity of `ForN`. B rows produce C columns, so we
@@ -1056,88 +1113,48 @@ HWY_NOINLINE MMPerKey* MatMul(const MatPtrT<TA>& A, const MatPtrT<TB>& B,
                     options.cluster_idx * env.ctx.pools.MaxWorkersPerCluster(),
                     zone);

-  const Allocator& allocator = env.ctx.allocator;
   HWY_DASSERT(options.cluster_idx < env.row_ptrs.size());
-  MatMulEnv::PerCluster& per_cluster = env.per_cluster[options.cluster_idx];
   RowPtrs<TC> C_rows =
       GetOrSetTempRowPtrs(C, env.row_ptrs[options.cluster_idx]);

   const size_t M = A.Rows();
   const size_t K = A.Cols();
   const size_t N = B.Rows();
-  const MMKeys::Key key = MMKeys::KeyFromDims(M, K, N);
-  intptr_t index = MMImpl::IndexOfKey(key, per_cluster.keys);
-  // First time we see this shape/key.
-  if (HWY_UNLIKELY(index < 0)) {
-    per_cluster.keys.Append(key, allocator);
-
-    // invalidates `MMAutoTune::Best()`
-    std::vector<MMPerKey>& per_keys = per_cluster.per_key;
-    index = per_keys.size();
-    per_keys.push_back(MMPerKey());
-  }
-  MMPerKey& per_key = per_cluster.per_key[index];
+
+  const CacheInfo& cache = env.ctx.cache_info;
+  MMPerKey& per_key = MMImpl::FindOrAddPerKey(
+      M, K, N, cache.VectorBytes(), env.per_cluster[options.cluster_idx]);
   MMAutoTune<MMConfig>& tuner = per_key.autotune;

   const MMArgs args(env, per_key, static_cast<double>(A.Scale()) * B.Scale(),
                     add, options);
   if (HWY_LIKELY(tuner.Best())) {
-    const MMState state(A.Extents(), B.Rows(), args, *tuner.Best());
+    const MMState state(M, K, N, args, *tuner.Best());
     const StridedViewBF A_view = MMImpl::MaybeDecompressA(A, args);
     state.DispatchParallelism(A_view, B, C_rows);
     return &per_key;
   }

-  // From here, CPU time is negligible except DoMatMul.
-
-  // First call: enumerate all feasible configs.
+  // Autotuning, first call: enumerate all feasible configs.
   if (HWY_UNLIKELY(!tuner.HasCandidates())) {
-    // Ensure matrix dimensions match each other.
+    // Ensure matrix dimensions match each other (off the hot path).
     HWY_ASSERT(K == B.Cols());
     HWY_ASSERT(M <= kMaxBatchSize);
     HWY_ASSERT(K <= MMStorage::kMaxK);
     HWY_ASSERT(N % kNR == 0);
-    // Ensure A rows are vector-aligned. Neither `Stride` nor `IsPacked` are
-    // reliable: the latter returns true for single rows, and the former may
-    // match `Cols` if the width matches the padding.
-    // Note that B is packed in matmul_test, but otherwise generally padded.
-    HWY_ASSERT(hwy::IsAligned(A.Row(0), env.ctx.allocator.LineBytes()));
-    if (A.Rows() > 1) {
-      HWY_ASSERT(hwy::IsAligned(A.Row(1), env.ctx.allocator.LineBytes()));
-    }
-
-    tuner.SetCandidates(MMCandidates(allocator, M, K, N, sizeof(TC), kMaxMR,
-                                     kNR, env.print_config));
+    MMImpl::EnsureAligned(A, cache.VectorBytes());
+    tuner.SetCandidates(
+        MMCandidates(cache, M, K, N, sizeof(TC), env.print_config));
   }

+  // (Also auto-tunes, hence outside the timed section to prevent interference.)
+  const StridedViewBF A_view = MMImpl::MaybeDecompressA(A, args);
+
   const MMConfig& cfg = tuner.NextConfig();
   const uint64_t t0 = hwy::timer::Start();
-  MMState state(A.Extents(), B.Rows(), args, cfg);
-  const StridedViewBF A_view = MMImpl::MaybeDecompressA(A, args);
+  MMState state(M, K, N, args, cfg);
   state.DispatchParallelism(A_view, B, C_rows);
-  const uint64_t t1 =
-      env.have_timer_stop ? hwy::timer::Stop() : hwy::timer::Start();
-  const double min_elapsed = static_cast<double>(tuner.NotifyTicks(t1 - t0)) /
-                             hwy::platform::InvariantTicksPerSecond();
-  const double flops = 2 * M * K * N / min_elapsed;  // * 2 for FMA
-  if (HWY_UNLIKELY(env.print_measurement && tuner.ShouldPrint())) {
-    fprintf(stderr, "%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu\n", flops * 1E-9,
-            min_elapsed * 1E3, cfg.MR(), cfg.MC(), cfg.KC(), cfg.NC(),
-            StringFromOrder(cfg.Order()), cfg.InnerTasks());
-  }
-  if (HWY_UNLIKELY(env.print_best && tuner.Best())) {
-    const auto ratio = [per_key](uint64_t ticks) -> double {
-      return static_cast<double>(ticks) /
-             static_cast<double>(per_key.autotune.BestTicks());
-    };
-    const MMConfig& best = *tuner.Best();
-    fprintf(stderr,
-            "\n%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu,%.2f,%.2f\n", M,
-            K, N, flops * 1E-9, min_elapsed * 1E3, best.MR(), best.MC(),
-            best.KC(), best.NC(), StringFromOrder(best.Order()),
-            best.InnerTasks(), ratio(tuner.WorstMinTicks()),
-            ratio(tuner.FirstConfigTicks()));
-  }
-
+  MMImpl::NotifyAutotuneResult(M, K, N, t0, cfg, env, tuner);
   return &per_key;
 }
@@ -62,22 +62,19 @@ size_t PrevDivisor(const size_t begin, const size_t end, const size_t dim,
 // and holds most of their arguments in member variables.
 class GenerateCandidates {
  public:
-  GenerateCandidates(const Allocator& allocator, size_t M, size_t K, size_t N,
-                     size_t sizeof_TC, size_t max_mr, size_t nr,
-                     bool print_config)
-      : allocator_(allocator),
+  GenerateCandidates(const CacheInfo& cache, size_t M, size_t K, size_t N,
+                     size_t sizeof_TC, bool print_config)
+      : cache_(cache),
         M_(M),
         K_(K),
         N_(N),
         sizeof_TC_(sizeof_TC),
-        max_mr_(max_mr),
-        nr_(nr),
         // These influence kc/nc, but are also stored in `MMConfig` for
         // `RangesOf*`. Must be a vector multiple. The previous/next cache line
         // is likely still in L1, but we expect K > 1000 and might as well round
         // up to the line size. Both A and B are BF16.
-        kc_multiple_(HWY_MIN(K, allocator.LineBytes() / sizeof(BF16))),
-        nc_multiple_(allocator.StepBytes() / sizeof_TC),
+        kc_multiple_(HWY_MIN(K, cache.LineBytes() / sizeof(BF16))),
+        nc_multiple_(cache.StepBytes() / sizeof_TC),
         print_config_(print_config) {}

   std::vector<MMConfig> operator()() const {
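With the Allocator parameter and the max_mr/nr arguments gone, enumerating candidates only needs the CacheInfo and the problem shape. A hedged sketch of a caller (dimensions are placeholders; assumes the gcpp namespace):

    #include <cstdio>
    #include <vector>

    #include "ops/matmul.h"              // gcpp::MMCandidates, gcpp::MMConfig
    #include "util/threading_context.h"  // gcpp::ThreadingContext

    // Hypothetical helper: lists the tile configs the autotuner would try.
    void PrintCandidates(const gcpp::ThreadingContext& ctx) {
      const std::vector<gcpp::MMConfig> candidates =
          gcpp::MMCandidates(ctx.cache_info, /*M=*/128, /*K=*/2048, /*N=*/4096,
                             /*sizeof_TC=*/sizeof(float), /*print_config=*/false);
      for (const gcpp::MMConfig& cfg : candidates) {
        std::printf("mr=%zu mc=%zu kc=%zu nc=%zu inner_tasks=%zu\n", cfg.MR(),
                    cfg.MC(), cfg.KC(), cfg.NC(), cfg.InnerTasks());
      }
    }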
@@ -127,10 +124,10 @@ class GenerateCandidates {
     SizeVec all_mr;
     all_mr.reserve(3);
     // AVX2's 16 registers are not enough for four rows, but SSE4 may benefit.
-    if (M_ >= max_mr_ && !is_avx2) all_mr.push_back(max_mr_);
+    if (M_ >= kMaxMR && !is_avx2) all_mr.push_back(kMaxMR);
     // Allow for AVX-512 but not SSE4 (for which 4 are usually better). Also
     // enable if not enough rows for 4.
-    if (M_ >= 2 && (M_ < max_mr_ || (!is_sse && !is_wasm))) {
+    if (M_ >= 2 && (M_ < kMaxMR || (!is_sse && !is_wasm))) {
       all_mr.push_back(size_t{2});
     }
     // Even SSE4 usually prefers 2 rows; only enable for single rows.
@@ -172,8 +169,8 @@ class GenerateCandidates {
     // size. This results in an overestimate, and the loop below will propose
     // the next few smaller values for the autotuner to evaluate.
     const size_t bytes_ab =
-        allocator_.L1Bytes() * (sizeof(BF16) + sizeof(SfpStream));
-    const size_t col_bytes = rows_a * sizeof(BF16) + nr_ * sizeof(BF16);
+        cache_.L1Bytes() * (sizeof(BF16) + sizeof(SfpStream));
+    const size_t col_bytes = rows_a * sizeof(BF16) + kNR * sizeof(BF16);
     size_t kc_max = hwy::DivCeil(bytes_ab, col_bytes);
     kc_max = RoundDownWithFloor(HWY_MIN(kc_max, kMaxKC), kc_multiple_);
     kc_max = HWY_MIN(kc_max, K_);
@@ -213,14 +210,14 @@ class GenerateCandidates {
   SizeVec MC(size_t mr, size_t kc, MMOrder order) const {
     // Typically 12-24K. The B rows are pinned in L1, but also occupy L2 because
     // it is typically inclusive.
-    const size_t bytes_b = nr_ * kc * (sizeof(SfpStream) + sizeof(BF16));
+    const size_t bytes_b = kNR * kc * (sizeof(SfpStream) + sizeof(BF16));

     // Choose the largest feasible `mc_max` (A/C rows) to maximize reuse of the
     // packed B. We want `mc * kc` elements of A to fit in L2, alongside
     // `bytes_b` plus `mc` cache lines because resident-A updates `mc` rows of
     // partial.
-    const size_t bytes_per_mc = kc * sizeof(BF16) + allocator_.LineBytes();
-    size_t mc_max = hwy::DivCeil(allocator_.L2Bytes() - bytes_b, bytes_per_mc);
+    const size_t bytes_per_mc = kc * sizeof(BF16) + cache_.LineBytes();
+    size_t mc_max = hwy::DivCeil(cache_.L2Bytes() - bytes_b, bytes_per_mc);
     mc_max = HWY_MIN(mc_max, kMaxBatchSize);
     HWY_DASSERT(mc_max != 0);
     mc_max = HWY_MIN(mc_max, M_);
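A worked instance of the L2 bound above, using illustrative sizes (1 MiB L2, 64-byte lines, kNR = 16, kc = 256; SfpStream is 1 byte and BF16 is 2 bytes), to show the order of magnitude of mc_max before the kMaxBatchSize/M_ clamps:

    #include <cstddef>
    #include <cstdio>

    int main() {
      const size_t l2_bytes = 1u << 20;  // assumed 1 MiB L2
      const size_t line_bytes = 64;      // assumed cache line size
      const size_t nr = 16, kc = 256;    // assumed kNR and kc
      // bytes_b = kNR * kc * (sizeof(SfpStream) + sizeof(BF16))
      const size_t bytes_b = nr * kc * (1 + 2);
      // bytes_per_mc = kc * sizeof(BF16) + LineBytes()
      const size_t bytes_per_mc = kc * 2 + line_bytes;
      // mc_max = DivCeil(L2Bytes() - bytes_b, bytes_per_mc)
      const size_t mc_max = (l2_bytes - bytes_b + bytes_per_mc - 1) / bytes_per_mc;
      std::printf("mc_max = %zu rows of A reused against the packed B\n", mc_max);
      return 0;
    }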
@@ -261,7 +258,7 @@ class GenerateCandidates {
     // Otherwise, leave it unbounded.
     if (M_ > mr) {
       const size_t bytes_per_nc = (kc * sizeof(BF16) + mc * sizeof_TC_);
-      nc_max = HWY_MIN(hwy::DivCeil(allocator_.L3Bytes(), bytes_per_nc), N_);
+      nc_max = HWY_MIN(hwy::DivCeil(cache_.L3Bytes(), bytes_per_nc), N_);
     }
     HWY_DASSERT(nc_max != 0);
     nc_max = RoundDownWithFloor(nc_max, nc_multiple_);
@@ -328,15 +325,12 @@ class GenerateCandidates {
     return inner_tasks;
   }

-  const Allocator& allocator_;
+  const CacheInfo& cache_;
   const size_t M_;
   const size_t K_;
   const size_t N_;
   const size_t sizeof_TC_;

-  const size_t max_mr_;
-  const size_t nr_;
-
   const size_t kc_multiple_;
   const size_t nc_multiple_;

@@ -346,12 +340,10 @@ class GenerateCandidates {
 }  // namespace

 // Facade to avoid exposing `GenerateCandidates` in the header.
-std::vector<MMConfig> MMCandidates(const Allocator& allocator, size_t M,
-                                   size_t K, size_t N, size_t sizeof_TC,
-                                   size_t max_mr, size_t nr,
+std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t M, size_t K,
+                                   size_t N, size_t sizeof_TC,
                                    bool print_config) {
-  return GenerateCandidates(allocator, M, K, N, sizeof_TC, max_mr, nr,
-                            print_config)();
+  return GenerateCandidates(cache, M, K, N, sizeof_TC, print_config)();
 }

 MatMulEnv::MatMulEnv(ThreadingContext& ctx) : ctx(ctx) {
ops/matmul.h (12 lines changed)
@@ -477,9 +477,9 @@ class MMConfig {
 static_assert(sizeof(MMConfig) == 32);  // for faster indexing
 #pragma pack(pop)

-std::vector<MMConfig> MMCandidates(const Allocator& allocator, size_t M,
-                                   size_t K, size_t N, size_t sizeof_TC,
-                                   size_t max_mr, size_t nr, bool print_config);
+std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t M, size_t K,
+                                   size_t N, size_t sizeof_TC,
+                                   bool print_config);

 // State machine for choosing the best `TConfig`, which is `MMConfig` for the
 // main MatMul autotuner.
@@ -619,11 +619,11 @@ class MMKeys {
   }

   // Must only be called if not already present in `Keys()`.
-  void Append(Key key, const Allocator& allocator) {
+  void Append(Key key, size_t vector_bytes) {
     // Dynamic allocation because the test checks many more dimensions than
     // would be reasonable to pre-allocate. DIY for alignment and padding.
     if (HWY_UNLIKELY(num_unique_ >= capacity_)) {
-      const size_t NU64 = allocator.VectorBytes() / sizeof(Key);
+      const size_t NU64 = vector_bytes / sizeof(Key);
       // Start at one vector so the size is always a multiple of N.
       if (HWY_UNLIKELY(capacity_ == 0)) {
         capacity_ = hwy::DivCeil(NU64, 2);  // will be doubled below
@@ -704,7 +704,7 @@ struct MMArgs {
         scale(scale),
         add(add),
         options(options),
-        line_bytes(env.ctx.allocator.LineBytes()) {}
+        line_bytes(env.ctx.cache_info.LineBytes()) {}

   MatMulEnv* env;
   MMPerKey* per_key;
@@ -130,7 +130,7 @@ size_t DetectTotalMiB(size_t page_bytes) {

 }  // namespace

-Allocator::Allocator(const BoundedTopology& topology, bool enable_bind) {
+CacheInfo::CacheInfo(const BoundedTopology& topology) {
   line_bytes_ = DetectLineBytes();
   // Ensure MaxLineBytes() is an upper bound.
   HWY_ASSERT(MaxLineBytes() >= LineBytes());
@@ -138,8 +138,6 @@ Allocator::Allocator(const BoundedTopology& topology, bool enable_bind) {
   vector_bytes_ = hwy::VectorBytes();

   step_bytes_ = HWY_MAX(line_bytes_, vector_bytes_);
-  base_page_bytes_ = DetectPageSize();
-  quantum_bytes_ = step_bytes_;  // may overwrite below

   const BoundedTopology::Cluster& cluster = topology.GetCluster(0, 0);
   if (const hwy::Cache* caches = hwy::DataCaches()) {
@@ -153,8 +151,14 @@ Allocator::Allocator(const BoundedTopology& topology, bool enable_bind) {
   if (l3_bytes_ == 0) {
     l3_bytes_ = (cluster.SharedKiB() ? cluster.SharedKiB() : 1024) << 10;
   }
+}

-  total_mib_ = DetectTotalMiB(base_page_bytes_);
+Allocator::Allocator(const BoundedTopology& topology,
+                     const CacheInfo& cache_info, bool enable_bind)
+    : line_bytes_(cache_info.LineBytes()),
+      base_page_bytes_(DetectPageSize()),
+      total_mib_(DetectTotalMiB(base_page_bytes_)) {
+  quantum_bytes_ = cache_info.StepBytes();  // may overwrite below

   // Prerequisites for binding:
   // - supported by the OS (currently Linux only),
@@ -77,27 +77,49 @@ using AlignedPtr = std::unique_ptr<T, DeleterFunc>;
 template <typename T>
 using AlignedClassPtr = std::unique_ptr<T, DeleterDtor>;

-// Both allocation, binding, and row accessors depend on the sizes of memory
-// pages and cache lines. To avoid having to pass `Allocator&` everywhere, we
-// wrap this in a singleton. A monostate requires explicit initialization,
-// which we prefer to avoid because there are many main() functions.
-class Allocator {
+// Holds cache line size/capacity and vector size. Stored in `ThreadingContext`.
+class CacheInfo {
  public:
-  // Must be called at least once before any other function. Not thread-safe,
-  // hence only call this from the main thread.
-  Allocator(const BoundedTopology& topology, bool enable_bind);
+  CacheInfo(const BoundedTopology& topology);

   // Bytes per cache line, or a reasonable guess if unknown. Used to choose
   // ranges such that there will be no false sharing.
   size_t LineBytes() const { return line_bytes_; }
   // Upper bound on `LineBytes()`, for stack allocations.
   static constexpr size_t MaxLineBytes() { return 256; }

   // Bytes per full vector. Used to compute loop steps.
   size_t VectorBytes() const { return vector_bytes_; }
   // Work granularity that avoids false sharing and partial vectors.
   // = HWY_MAX(LineBytes(), VectorBytes())
   size_t StepBytes() const { return step_bytes_; }

+  // L1 and L2 are typically per core.
+  size_t L1Bytes() const { return l1_bytes_; }
+  size_t L2Bytes() const { return l2_bytes_; }
+  // Clusters often share an L3. We return the total size per package.
+  size_t L3Bytes() const { return l3_bytes_; }
+
+ private:
+  size_t line_bytes_;
+  size_t vector_bytes_;
+  size_t step_bytes_;
+
+  size_t l1_bytes_ = 0;
+  size_t l2_bytes_ = 0;
+  size_t l3_bytes_ = 0;
+};
+
+// NUMA-aware allocation and memory binding. Stored in `ThreadingContext`.
+class Allocator {
+ public:
+  Allocator(const BoundedTopology& topology, const CacheInfo& cache_info,
+            bool enable_bind);
+
+  // Used by `AllocateFor`, which only takes an `Allocator` argument,
+  // hence copy from `CacheInfo`.
+  size_t LineBytes() const { return line_bytes_; }
+
   // File size multiple required for memory mapping. Also used when binding
   // memory to NUMA nodes (see `BindB/BindC`).
   size_t BasePageBytes() const { return base_page_bytes_; }
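As a usage sketch for the new class (the helper name is illustrative, not from the repo): rounding a row stride up to StepBytes() yields rows that start on both a cache-line and a vector boundary, which is what the "avoids false sharing and partial vectors" comment refers to.

    #include <cstddef>

    #include "util/allocator.h"  // gcpp::CacheInfo

    // Round a row stride up to the work granularity described above.
    size_t PaddedRowBytes(const gcpp::CacheInfo& cache, size_t row_bytes) {
      const size_t step = cache.StepBytes();  // HWY_MAX(LineBytes(), VectorBytes())
      return (row_bytes + step - 1) / step * step;
    }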
@@ -105,12 +127,6 @@ class Allocator {
   // Desired allocator alignment: Either StepBytes, or BasePageBytes if NUMA.
   size_t QuantumBytes() const { return quantum_bytes_; }

-  // L1 and L2 are typically per core.
-  size_t L1Bytes() const { return l1_bytes_; }
-  size_t L2Bytes() const { return l2_bytes_; }
-  // Clusters often share an L3. We return the total size per package.
-  size_t L3Bytes() const { return l3_bytes_; }
-
   size_t TotalMiB() const { return total_mib_; }
   size_t FreeMiB() const;

@@ -159,18 +175,11 @@ class Allocator {
   bool BindMemory(void* p, size_t bytes, size_t node) const;

  private:
-  size_t line_bytes_;
-  size_t vector_bytes_;
-  size_t step_bytes_;
-  size_t base_page_bytes_;
+  const size_t line_bytes_;
+  const size_t base_page_bytes_;
+  const size_t total_mib_;
   size_t quantum_bytes_;

-  size_t l1_bytes_ = 0;
-  size_t l2_bytes_ = 0;
-  size_t l3_bytes_ = 0;
-
-  size_t total_mib_;
-
   bool should_bind_ = false;
 };

@@ -76,7 +76,8 @@ ThreadingContext::ThreadingContext(const ThreadingArgs& args)
     topology(BoundedSlice(args.skip_packages, args.max_packages),
              BoundedSlice(args.skip_clusters, args.max_clusters),
              BoundedSlice(args.skip_lps, args.max_lps)),
-      allocator(topology, args.bind != Tristate::kFalse),
+      cache_info(topology),
+      allocator(topology, cache_info, args.bind != Tristate::kFalse),
       pools(topology, allocator, args.max_threads, args.pin) {
   PROFILER_ZONE("Startup.ThreadingContext autotune");
   TunePool(pools.AllPackages());
@@ -105,7 +105,10 @@ struct ThreadingContext {
   // will be 1 regardless of the actual system topology.
   BoundedTopology topology;

-  // Ctor depends on `topology` for deciding whether to enable NUMA.
+  // Ctor depends on `topology` for per-cluster cache sizes.
+  CacheInfo cache_info;
+
+  // Ctor depends on `topology` (for NUMA) and `cache_info` (for step size).
   Allocator allocator;

   // Per-package/cluster/within cluster pools of threads, matching `topology`.