mirror of https://github.com/google/gemma.cpp.git
1.01x speedup: improved autotune
Group M=4..7 into the same config. Add configs for power-of-two sizes. Allow odd mc to enable a single range for odd M. io.cc: warning fix (cast). IsBlock -> !IsOneMC. benchmark_helper: print the best config for verbosity 3, all configs for verbosity 4. ops_test: remove unused includes. PiperOrigin-RevId: 824475104
This commit is contained in:
parent
8198e7104a
commit
86200ce224
|
|
@ -47,9 +47,8 @@ GemmaEnv::GemmaEnv(const LoaderArgs& loader, const ThreadingArgs& threading,
|
|||
ShowConfig(loader, threading, inference, config, gemma_.WeightReadMode(),
|
||||
ctx_);
|
||||
}
|
||||
if (inference.verbosity >= 3) {
|
||||
env_.print_config = env_.print_best = true;
|
||||
}
|
||||
if (inference.verbosity >= 3) env_.print_best = true;
|
||||
if (inference.verbosity >= 4) env_.print_config = true;
|
||||
|
||||
runtime_config_ = {
|
||||
.max_generated_tokens = inference.max_generated_tokens,
|
||||
|
|
|
|||
6
io/io.cc
6
io/io.cc
|
|
@ -110,7 +110,8 @@ class FilePosix : public File {
|
|||
HWY_WARN(
|
||||
"Read failure at pos %zu within size %zu with offset %zu and "
|
||||
"errno %d\n",
|
||||
pos, size, offset, errno);
|
||||
static_cast<size_t>(pos), static_cast<size_t>(size),
|
||||
static_cast<size_t>(offset), errno);
|
||||
break;
|
||||
}
|
||||
pos += bytes_read;
|
||||
|
|
@ -130,7 +131,8 @@ class FilePosix : public File {
|
|||
HWY_WARN(
|
||||
"Write failure at pos %zu within size %zu with offset %zu and "
|
||||
"errno %d\n",
|
||||
pos, size, offset, errno);
|
||||
static_cast<size_t>(pos), static_cast<size_t>(size),
|
||||
static_cast<size_t>(offset), errno);
|
||||
break;
|
||||
}
|
||||
pos += bytes_written;
|
||||
|
|
|
|||
|
|
@ -837,10 +837,11 @@ class MMImpl {
|
|||
hwy::platform::InvariantTicksPerSecond();
|
||||
const double flops = 2 * M * K * N * num_B / min_elapsed; // * 2 for FMA
|
||||
if (HWY_UNLIKELY(env.print_measurement && tuner.ShouldPrint())) {
|
||||
fprintf(stderr, "%zu,%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu\n",
|
||||
M, K, N, num_B, flops * 1E-9, min_elapsed * 1E3, cfg.MR(),
|
||||
cfg.MC(), cfg.KC(), cfg.NC(), StringFromOrder(cfg.Order()),
|
||||
cfg.InnerTasks());
|
||||
fprintf(
|
||||
stderr,
|
||||
"%4zu,%4zu,%4zu,B%zu,%7.1f,%.2f ms, MR%zu,%4zu,%4zu,%5zu,%-7s,%zu\n",
|
||||
M, K, N, num_B, flops * 1E-9, min_elapsed * 1E3, cfg.MR(), cfg.MC(),
|
||||
cfg.KC(), cfg.NC(), StringFromOrder(cfg.Order()), cfg.InnerTasks());
|
||||
}
|
||||
if (HWY_UNLIKELY(env.print_best && tuner.Best())) {
|
||||
const auto ratio = [&tuner](uint64_t ticks) -> double {
|
||||
|
|
@ -850,7 +851,8 @@ class MMImpl {
|
|||
const MMConfig& best = *tuner.Best();
|
||||
fprintf(
|
||||
stderr,
|
||||
"\n%zu,%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu,%.2f,%.2f\n",
|
||||
"\n%4zu,%4zu,%4zu,B%zu,%7.1f,%.2f ms, MR%zu,%4zu,%4zu,%5zu,%-7s,%zu, "
|
||||
"%.2fx,%.2fx\n",
|
||||
M, K, N, num_B, flops * 1E-9, min_elapsed * 1E3, best.MR(), best.MC(),
|
||||
best.KC(), best.NC(), StringFromOrder(best.Order()),
|
||||
best.InnerTasks(), ratio(tuner.WorstMinTicks()),
|
||||
|
|
@ -906,8 +908,8 @@ class MMLoops {
|
|||
const auto zone = args.env.ctx.profiler_zones.Get(Zones::kMMNT);
|
||||
HWY_DASSERT(args.ranges_mc.NumTasks() == 1);
|
||||
HWY_DASSERT(args.ranges_kc.NumTasks() == 1);
|
||||
const IndexRange& range_mc = args.ranges_mc.Range(0);
|
||||
const IndexRange& range_kc = args.ranges_kc.Range(0);
|
||||
const IndexRange& range_mc = args.ranges_mc.Range(0); // whole M
|
||||
const IndexRange& range_kc = args.ranges_kc.Range(0); // whole K
|
||||
|
||||
parallel.ForN(
|
||||
args.env.ctx, args.range_n, MultipleN(sizeof(TC), args.line_bytes),
|
||||
|
|
@ -941,7 +943,7 @@ class MMLoops {
|
|||
const MMArgs& args) {
|
||||
const auto zone = args.env.ctx.profiler_zones.Get(Zones::kMMNT_K);
|
||||
HWY_DASSERT(args.ranges_mc.NumTasks() == 1);
|
||||
const IndexRange& range_mc = args.ranges_mc.Range(0);
|
||||
const IndexRange& range_mc = args.ranges_mc.Range(0); // whole M
|
||||
|
||||
parallel.ForN(args.env.ctx, args.range_n,
|
||||
MultipleN(sizeof(TC), args.line_bytes), args.inner_tasks,
|
||||
|
|
@ -977,7 +979,7 @@ class MMLoops {
|
|||
const MMArgs& args) {
|
||||
const auto zone = args.env.ctx.profiler_zones.Get(Zones::kMMNT_MT);
|
||||
HWY_DASSERT(args.ranges_kc.NumTasks() == 1);
|
||||
const IndexRange& range_kc = args.ranges_kc.Range(0);
|
||||
const IndexRange& range_kc = args.ranges_kc.Range(0); // whole K
|
||||
|
||||
parallel.ForRangesMC_NC(
|
||||
args.env.ctx, args.ranges_mc, args.ranges_nc, args.options.cluster_idx,
|
||||
|
|
@ -1158,8 +1160,9 @@ HWY_NOINLINE MMPerKey* TwoMatMul(const MatPtrT<BF16>& A, const MatPtrT<TB>& B1,
|
|||
HWY_ASSERT(K <= MMEntireA::kMaxK);
|
||||
HWY_ASSERT(N % kNR == 0);
|
||||
MMImpl::EnsureAligned(A, cache.VectorBytes());
|
||||
tuner.SetCandidates(
|
||||
MMCandidates(cache, M, K, N, num_B, sizeof(BF16), env.print_config));
|
||||
const size_t max_M = MMKeys::BucketM(M);
|
||||
tuner.SetCandidates(MMCandidates(cache, max_M, K, N, num_B, sizeof(BF16),
|
||||
env.print_config));
|
||||
}
|
||||
|
||||
const MMConfig& cfg = tuner.NextConfig();
|
||||
|
|
|
|||
152
ops/matmul.cc
152
ops/matmul.cc
|
|
@ -21,6 +21,7 @@
|
|||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "util/allocator.h"
|
||||
|
|
@ -46,7 +47,9 @@ size_t RoundDownWithFloor(size_t value, size_t multiple) {
|
|||
// multiple of `multiple`, or 0 if none exists.
|
||||
size_t PrevDivisor(const size_t begin, const size_t end, const size_t dim,
|
||||
const size_t multiple) {
|
||||
HWY_DASSERT(end != 0 && dim != 0 && multiple != 0);
|
||||
HWY_DASSERT(end != 0);
|
||||
HWY_DASSERT(dim != 0);
|
||||
HWY_DASSERT(multiple != 0);
|
||||
size_t prev = RoundDownWithFloor(end, multiple);
|
||||
// Avoid returning `end` if rounding down had no effect.
|
||||
if (prev == end) prev -= multiple;
|
||||
|
|
@ -62,10 +65,10 @@ size_t PrevDivisor(const size_t begin, const size_t end, const size_t dim,
|
|||
// and holds most of their arguments in member variables.
|
||||
class GenerateCandidates {
|
||||
public:
|
||||
GenerateCandidates(const CacheInfo& cache, size_t M, size_t K, size_t N,
|
||||
GenerateCandidates(const CacheInfo& cache, size_t max_M, size_t K, size_t N,
|
||||
size_t num_B, size_t sizeof_TC, bool print_config)
|
||||
: cache_(cache),
|
||||
M_(M),
|
||||
max_M_(max_M),
|
||||
K_(K),
|
||||
N_(N),
|
||||
num_B_(num_B),
|
||||
|
|
@ -89,14 +92,14 @@ class GenerateCandidates {
|
|||
for (size_t mc : MC(mr, kc, order)) {
|
||||
for (size_t nc : NC(mr, mc, kc, order)) {
|
||||
for (int inner_tasks : all_inner_tasks) {
|
||||
const MMConfig config(K_, N_, mr, mc, kc, nc, kc_multiple_,
|
||||
nc_multiple_, order, inner_tasks);
|
||||
const size_t M_tasks = config.RangesOfMC(M_).NumTasks();
|
||||
const MMConfig config(max_M_, K_, N_, mr, mc, kc, nc,
|
||||
kc_multiple_, nc_multiple_, order,
|
||||
inner_tasks);
|
||||
const size_t M_tasks = config.RangesOfMC(max_M_).NumTasks();
|
||||
const size_t K_tasks = config.RangesOfKC(K_).NumTasks();
|
||||
|
||||
// Blocks only make sense when there are multiple M tasks.
|
||||
if (IsBlock(order) != (M_tasks > 1)) continue;
|
||||
// Single KC only makes sense when there is a single K task.
|
||||
// Do not use single-MC/KC order if there are multiple.
|
||||
if (IsOneMC(order) != (M_tasks == 1)) continue;
|
||||
if (IsOneKC(order) != (K_tasks == 1)) continue;
|
||||
|
||||
candidates.push_back(config);
|
||||
|
|
@ -114,6 +117,25 @@ class GenerateCandidates {
|
|||
private:
|
||||
using SizeVec = std::vector<size_t>;
|
||||
|
||||
// Concatenate and print once because this can be called concurrently.
|
||||
void MaybePrintSizes(size_t dim, size_t max, const char* caption,
|
||||
const SizeVec& sizes) const {
|
||||
if (!print_config_ || sizes.empty()) return;
|
||||
std::string out("num_B ");
|
||||
out += std::to_string(num_B_);
|
||||
out += " (";
|
||||
out += std::to_string(dim);
|
||||
out += ", max ";
|
||||
out += std::to_string(max);
|
||||
out += ") ";
|
||||
out += caption;
|
||||
out += ": ";
|
||||
for (size_t size : sizes) {
|
||||
out += std::to_string(size) + " ";
|
||||
}
|
||||
fprintf(stderr, "%s\n", out.c_str());
|
||||
}
|
||||
|
||||
// How many rows of A per call to `MMKernel::LoopKC`. Lower values may
|
||||
// be better for SIMD targets with fewer registers.
|
||||
SizeVec MR() const {
|
||||
|
|
@ -125,14 +147,14 @@ class GenerateCandidates {
|
|||
SizeVec all_mr;
|
||||
all_mr.reserve(3);
|
||||
// AVX2's 16 registers are not enough for four rows, but SSE4 may benefit.
|
||||
if (M_ >= kMaxMR && !is_avx2) all_mr.push_back(kMaxMR);
|
||||
if (max_M_ >= kMaxMR && !is_avx2) all_mr.push_back(kMaxMR);
|
||||
// Allow for AVX-512 but not SSE4 (for which 4 are usually better). Also
|
||||
// enable if not enough rows for 4.
|
||||
if (M_ >= 2 && (M_ < kMaxMR || (!is_sse && !is_wasm))) {
|
||||
if (max_M_ >= 2 && (max_M_ < kMaxMR || (!is_sse && !is_wasm))) {
|
||||
all_mr.push_back(size_t{2});
|
||||
}
|
||||
// Even SSE4 usually prefers 2 rows; only enable for single rows.
|
||||
if (M_ == 1) all_mr.push_back(size_t{1});
|
||||
if (max_M_ == 1) all_mr.push_back(size_t{1});
|
||||
HWY_ASSERT(!all_mr.empty());
|
||||
return all_mr;
|
||||
}
|
||||
|
|
@ -143,18 +165,21 @@ class GenerateCandidates {
|
|||
for (size_t order_idx = 0;; ++order_idx) {
|
||||
const MMOrder order = static_cast<MMOrder>(order_idx);
|
||||
if (StringFromOrder(order) == nullptr) return orders; // done
|
||||
// 2D blocking is useless for a single row of M.
|
||||
if (IsBlock(order) && M_ <= mr) continue;
|
||||
// Multiple-MC is useless for a single row of M.
|
||||
if (!IsOneMC(order) && max_M_ <= mr) continue;
|
||||
// Conversely, N-only parallelism is uncompetitive for large M.
|
||||
if (!IsBlock(order) && M_ >= kMaxTilesM * mr) continue;
|
||||
if (IsOneMC(order) && max_M_ >= 8 * mr) continue;
|
||||
orders.push_back(order);
|
||||
}
|
||||
}
|
||||
|
||||
// The number of A and B columns to read between updating `C`.
|
||||
SizeVec KC(size_t mr, MMOrder order) const {
|
||||
// Must return the actual value: although ignored by `RangesOfKC`, this will
|
||||
// be used in MC() and NC().
|
||||
if (IsOneKC(order)) return SizeVec(1, K_);
|
||||
// `LoopKC` handles up to `mr` rows of A.
|
||||
const size_t rows_a = HWY_MIN(M_, mr);
|
||||
const size_t rows_a = HWY_MIN(max_M_, mr);
|
||||
|
||||
// After looping over `kc` columns, we write `mr x 4` outputs and 16 vector
|
||||
// `buf`. To amortize the write cost, we want to maximize `kc`. However, it
|
||||
|
|
@ -186,7 +211,7 @@ class GenerateCandidates {
|
|||
|
||||
// If we can afford a single K task, that's usually best; only try one
|
||||
// more. Otherwise, blocks may require smaller kc (more options).
|
||||
const size_t reps = (kc_max == K_) ? 1 : IsBlock(order) ? 3 : 2;
|
||||
const size_t reps = (kc_max == K_) ? 1 : IsOneMC(order) ? 2 : 3;
|
||||
|
||||
size_t prev = kc_max;
|
||||
for (size_t rep = 0; rep < reps; ++rep) {
|
||||
|
|
@ -196,19 +221,16 @@ class GenerateCandidates {
|
|||
}
|
||||
}
|
||||
|
||||
if (print_config_ && all_kc.size() > 1) {
|
||||
fprintf(stderr, "num_B %zu: KC: ", num_B_);
|
||||
for (size_t kc : all_kc) {
|
||||
fprintf(stderr, "%zu ", kc);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
MaybePrintSizes(K_, kc_max, "KC", all_kc);
|
||||
return all_kc;
|
||||
}
|
||||
|
||||
// The number of (L2 resident) A rows for `A2C0` to loop over.
|
||||
SizeVec MC(size_t mr, size_t kc, MMOrder order) const {
|
||||
// Must return the actual value: although ignored by `RangesOfMC`, this will
|
||||
// be used in NC().
|
||||
if (IsOneMC(order) || max_M_ <= mr) return SizeVec(1, max_M_);
|
||||
|
||||
// Typically 12-24K. The B rows are pinned in L1, but also occupy L2 because
|
||||
// it is typically inclusive.
|
||||
const size_t bytes_b = kNR * kc * (sizeof(SfpStream) + sizeof(BF16));
|
||||
|
|
@ -219,35 +241,45 @@ class GenerateCandidates {
|
|||
const size_t bytes_per_mc = kc * sizeof(BF16) + cache_.LineBytes();
|
||||
size_t mc_max = hwy::DivCeil(cache_.L2Bytes() - bytes_b, bytes_per_mc);
|
||||
mc_max = HWY_MIN(mc_max, HWY_MIN(kMaxBatchSize, kMaxMC));
|
||||
mc_max = HWY_MIN(mc_max, max_M_);
|
||||
HWY_DASSERT(mc_max != 0);
|
||||
mc_max = HWY_MIN(mc_max, M_);
|
||||
mc_max = hwy::RoundDownTo(mc_max, mr);
|
||||
|
||||
SizeVec all_mc(1, mc_max);
|
||||
// Larger MC is better for non-blocks, otherwise we want more small options,
|
||||
// especially for two B.
|
||||
const size_t reps = !IsBlock(order) ? 2 : (2 + num_B_);
|
||||
SizeVec all_mc;
|
||||
all_mc.reserve(6);
|
||||
|
||||
size_t prev = mc_max;
|
||||
for (size_t rep = 0; rep < reps; ++rep) {
|
||||
prev = PrevDivisor(1, prev, M_, mr);
|
||||
if (prev >= mc_max || prev == 0) break;
|
||||
const size_t rounded_M = HWY_MAX(mr, hwy::RoundDownTo(max_M_, mr));
|
||||
size_t prev = hwy::RoundDownTo(mc_max, mr);
|
||||
|
||||
// If mc_max is large enough, allow using the whole range without rounding
|
||||
// down (which may require two ranges).
|
||||
if (mc_max == max_M_ && (max_M_ % mr) != 0) {
|
||||
all_mc.push_back(max_M_);
|
||||
// The next option should be considerably smaller than `max_M_`.
|
||||
prev = HWY_MAX(mr, hwy::RoundDownTo(3 * prev / 4, mr));
|
||||
} else {
|
||||
all_mc.push_back(prev);
|
||||
}
|
||||
|
||||
// Blocks: largest is not useful.
|
||||
if (IsBlock(order) && all_mc.size() > 1) {
|
||||
all_mc.erase(all_mc.begin(), all_mc.begin() + 1);
|
||||
// We know `order` is multiple MC, where more/smaller values of `mc` are
|
||||
// helpful, especially for two B, hence add iterations.
|
||||
const size_t reps = 2 + num_B_;
|
||||
for (size_t rep = 0; rep < reps; ++rep) {
|
||||
prev = PrevDivisor(mr, prev, rounded_M, mr);
|
||||
if (prev == 0) break; // none found
|
||||
if (prev == mr) {
|
||||
if (all_mc.back() != prev) all_mc.push_back(prev);
|
||||
break;
|
||||
}
|
||||
if (prev <= mc_max / 8) break;
|
||||
all_mc.push_back(prev);
|
||||
}
|
||||
|
||||
if (print_config_ && all_mc.size() > 1) {
|
||||
fprintf(stderr, "num_B %zu: MC: ", num_B_);
|
||||
for (size_t mc : all_mc) {
|
||||
fprintf(stderr, "%zu ", mc);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
if (all_mc.size() <= 2) {
|
||||
if (max_M_ > mr) all_mc.push_back(max_M_ / 2);
|
||||
if (mc_max > mr) all_mc.push_back(mc_max / 2);
|
||||
}
|
||||
|
||||
MaybePrintSizes(max_M_, mc_max, "MC", all_mc);
|
||||
return all_mc;
|
||||
}
|
||||
|
||||
|
|
@ -257,7 +289,7 @@ class GenerateCandidates {
|
|||
// Only if there will be reuse of B: choose the largest `nc_max` (C cols)
|
||||
// such that `nc x kc` of B and `mc x nc` of `C` fit in L3. Otherwise,
|
||||
// leave it unbounded.
|
||||
if (M_ > mr) {
|
||||
if (max_M_ > mr) {
|
||||
const size_t bytes_per_nc = (kc * sizeof(BF16) + mc * sizeof_TC_);
|
||||
nc_max = HWY_MIN(hwy::DivCeil(cache_.L3Bytes(), bytes_per_nc), kMaxNC);
|
||||
}
|
||||
|
|
@ -271,8 +303,8 @@ class GenerateCandidates {
|
|||
nc_max = RoundDownWithFloor(N_ / 2, nc_multiple_);
|
||||
}
|
||||
|
||||
// Non-block calls ForNP, which ignores `range_nc` and uses `range_np`.
|
||||
if (!IsBlock(order)) return SizeVec(1, N_);
|
||||
// Single-MC calls `ForNP`, which ignores `range_nc`.
|
||||
if (IsOneMC(order)) return SizeVec(1, N_);
|
||||
|
||||
SizeVec all_nc(1, nc_max);
|
||||
|
||||
|
|
@ -282,7 +314,7 @@ class GenerateCandidates {
|
|||
// hence autotune a wider range of nc than the other dimensions.
|
||||
size_t reps = 9 + num_B_;
|
||||
// For small M, we can afford larger NC, hence allow fewer small options.
|
||||
if (M_ <= 2 * mr) reps -= 1;
|
||||
if (max_M_ <= 2 * mr) reps -= 1;
|
||||
|
||||
size_t prev = nc_max;
|
||||
for (size_t rep = 0; rep < reps; ++rep) {
|
||||
|
|
@ -302,14 +334,7 @@ class GenerateCandidates {
|
|||
all_nc.begin() + HWY_MIN(want_delete, max_delete));
|
||||
}
|
||||
|
||||
if (print_config_ && all_nc.size() > 1) {
|
||||
fprintf(stderr, "num_B %zu: NC: ", num_B_);
|
||||
for (size_t nc : all_nc) {
|
||||
fprintf(stderr, "%zu ", nc);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
MaybePrintSizes(N_, nc_max, "NC", all_nc);
|
||||
return all_nc;
|
||||
}
|
||||
|
||||
|
|
@ -319,8 +344,8 @@ class GenerateCandidates {
|
|||
std::vector<int> inner_tasks;
|
||||
inner_tasks.reserve(3);
|
||||
inner_tasks.push_back(1);
|
||||
// Blocks have one task per mc/nc range and ignore this parameter.
|
||||
if (!IsBlock(order)) {
|
||||
// Multiple-MC have one task per mc/nc range and ignore this parameter.
|
||||
if (IsOneMC(order)) {
|
||||
inner_tasks.push_back(2);
|
||||
inner_tasks.push_back(4);
|
||||
}
|
||||
|
|
@ -328,7 +353,7 @@ class GenerateCandidates {
|
|||
}
|
||||
|
||||
const CacheInfo& cache_;
|
||||
const size_t M_;
|
||||
const size_t max_M_;
|
||||
const size_t K_;
|
||||
const size_t N_;
|
||||
const size_t num_B_;
|
||||
|
|
@ -343,10 +368,11 @@ class GenerateCandidates {
|
|||
} // namespace
|
||||
|
||||
// Facade to avoid exposing `GenerateCandidates` in the header.
|
||||
std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t M, size_t K,
|
||||
size_t N, size_t num_B, size_t sizeof_TC,
|
||||
bool print_config) {
|
||||
return GenerateCandidates(cache, M, K, N, num_B, sizeof_TC, print_config)();
|
||||
std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t max_M,
|
||||
size_t K, size_t N, size_t num_B,
|
||||
size_t sizeof_TC, bool print_config) {
|
||||
return GenerateCandidates(cache, max_M, K, N, num_B, sizeof_TC,
|
||||
print_config)();
|
||||
}
|
||||
|
||||
MatMulEnv::MatMulEnv(ThreadingContext& ctx)
|
||||
|
|
|
|||
61
ops/matmul.h
61
ops/matmul.h
|
|
@ -331,8 +331,8 @@ void DispatchOrder(MMOrder order, const Func& func, Args&&... args) {
|
|||
}
|
||||
}
|
||||
|
||||
static inline bool IsBlock(MMOrder order) {
|
||||
return order == MMOrder::kNT_MT_K || order == MMOrder::kNT_MT;
|
||||
static inline bool IsOneMC(MMOrder order) {
|
||||
return order == MMOrder::kNT || order == MMOrder::kNT_K;
|
||||
}
|
||||
|
||||
static inline bool IsOneKC(MMOrder order) {
|
||||
|
|
@ -381,6 +381,8 @@ static inline const char* StringFromParA(MMParA par_a) {
|
|||
// `mc` := A rows such that `kc` columns fit in L2,
|
||||
// `nc` := B rows such that `kc` columns fit in L3 alongside `mc x nc` C.
|
||||
// Also includes loop order and task granularity.
|
||||
//
|
||||
// This is shared by multiple M which return the same `BucketM`.
|
||||
#pragma pack(push, 1)
|
||||
class MMConfig {
|
||||
public:
|
||||
|
|
@ -388,8 +390,8 @@ class MMConfig {
|
|||
// `mr` is the number of A rows per call to `MMKernel::LoopKC`.
|
||||
// `MMOrder` is how to parallelize the outer loops.
|
||||
// `inner_tasks` chooses the within-cluster task granularity in `ForN`.
|
||||
MMConfig(size_t K, size_t N, size_t mr, size_t mc, size_t kc, size_t nc,
|
||||
size_t kc_multiple, size_t nc_multiple, MMOrder order,
|
||||
MMConfig(size_t M, size_t K, size_t N, size_t mr, size_t mc, size_t kc,
|
||||
size_t nc, size_t kc_multiple, size_t nc_multiple, MMOrder order,
|
||||
int inner_tasks)
|
||||
: mr_(static_cast<uint32_t>(mr)),
|
||||
mc_(static_cast<uint32_t>(mc)),
|
||||
|
|
@ -401,12 +403,8 @@ class MMConfig {
|
|||
inner_tasks_(static_cast<uint8_t>(inner_tasks)),
|
||||
reserved_{} {
|
||||
HWY_DASSERT(mr == 1 || mr == 2 || mr == 4);
|
||||
if (mc % mr != 0) {
|
||||
HWY_WARN("mc %zu not a multiple of mr %zu", mc, mr);
|
||||
}
|
||||
// Do not warn for single-kc tasks; some models unfortunately have K which
|
||||
// are not multiples of `kc_multiple`.
|
||||
if (kc != K && (kc % kc_multiple) != 0) {
|
||||
// Some models have K which are not multiples of `kc_multiple`.
|
||||
if (!IsOneKC(order) && (kc % kc_multiple) != 0) {
|
||||
HWY_WARN("kc %zu not a multiple of kc_multiple %zu", kc, kc_multiple);
|
||||
}
|
||||
if (nc != N && (nc % nc_multiple) != 0) {
|
||||
|
|
@ -417,11 +415,21 @@ class MMConfig {
|
|||
}
|
||||
|
||||
// Splits M/N into blocks which are visited sequentially or in parallel.
|
||||
// K is always sequential, see `MMOrder`.
|
||||
IndexRangePartition RangesOfMC(size_t M) const {
|
||||
return MaxSizePartition(IndexRange(0, M), mc_, mr_);
|
||||
if (IsOneMC(order_)) {
|
||||
// Must have exactly one M range/tile, regardless of `mr_` and `mc_`.
|
||||
return IndexRangePartition(M);
|
||||
}
|
||||
const size_t mc = HWY_MIN(M, MC());
|
||||
const size_t mr = HWY_MIN(M, MR());
|
||||
return MaxSizePartition(IndexRange(0, M), mc, mr);
|
||||
}
|
||||
// K is either a single range, or a sequential loop.
|
||||
IndexRangePartition RangesOfKC(size_t K) const {
|
||||
if (IsOneKC(order_)) {
|
||||
// Must have exactly one K range/tile, regardless of `kc_`.
|
||||
return IndexRangePartition(K);
|
||||
}
|
||||
return MaxSizePartition(IndexRange(0, K), kc_, kc_multiple_);
|
||||
}
|
||||
IndexRangePartition RangesOfNC(size_t N) const {
|
||||
|
|
@ -448,7 +456,7 @@ class MMConfig {
|
|||
uint32_t kc_multiple_;
|
||||
MMOrder order_;
|
||||
uint8_t inner_tasks_;
|
||||
HWY_MAYBE_UNUSED uint8_t reserved_[6];
|
||||
HWY_MEMBER_VAR_MAYBE_UNUSED uint8_t reserved_[6];
|
||||
};
|
||||
static_assert(sizeof(MMConfig) == 32); // for faster indexing
|
||||
#pragma pack(pop)
|
||||
|
|
@ -557,26 +565,27 @@ class MMAutoTune {
|
|||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// Minimum M, in units of tile rows of height mr={1, 2, 4}, from which
|
||||
// `MMOrder::kNT[_K]` are no longer allowed. They require a single MC range,
|
||||
// but choosing the same config for a larger M can result in multiple MC ranges.
|
||||
// Thus M less than this must have unique keys/configs.
|
||||
HWY_INLINE_VAR constexpr size_t kMaxTilesM = 8;
|
||||
|
||||
// Map of previously seen dimensions to index via linear search.
|
||||
class MMKeys {
|
||||
// Group batch size into buckets to reduce #auto-tunes.
|
||||
static size_t BucketM(size_t M) {
|
||||
if (M < kMaxTilesM * kMaxMR) return M; // See kMaxTilesM above.
|
||||
if (M <= 128) return 128;
|
||||
return 512;
|
||||
}
|
||||
|
||||
public:
|
||||
using Key = uint64_t;
|
||||
// KeyFromDims will only return this if all dims are zero, which is invalid.
|
||||
static constexpr Key kPadding = 0;
|
||||
|
||||
// Returns the maximum permissible M in the bucket, for grouping batch sizes
|
||||
// into buckets to reduce #auto-tunes.
|
||||
static size_t BucketM(size_t M) {
|
||||
HWY_DASSERT(M != 0);
|
||||
// Small M: 1..3, 4..7, 8..15, etc. share the same config.
|
||||
if (M < 64) return M | (kMaxMR - 1);
|
||||
// Larger M use power of two buckets: 64..127, 128..255, etc.
|
||||
const size_t floor_log2_M =
|
||||
31 - hwy::Num0BitsAboveMS1Bit_Nonzero32(static_cast<uint32_t>(M));
|
||||
const size_t min_M = size_t{1} << floor_log2_M;
|
||||
HWY_DASSERT(min_M <= M && M < 2 * min_M);
|
||||
return 2 * min_M - 1;
|
||||
}
|
||||
|
||||
// Compresses the dimensions into a single Key for faster comparison.
|
||||
static Key KeyFromDims(size_t M, size_t K, size_t N, size_t num_B) {
|
||||
HWY_DASSERT(M < (Key{1} << 16)); // batch sizes are smaller
|
||||
|
|
|
|||
|
|
@ -14,7 +14,6 @@
|
|||
// limitations under the License.
|
||||
|
||||
#include "compression/types.h"
|
||||
#include "util/zones.h"
|
||||
#ifndef HWY_DISABLED_TARGETS
|
||||
#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
|
||||
#endif // HWY_DISABLED_TARGETS
|
||||
|
|
@ -38,7 +37,6 @@
|
|||
#include "util/mat.h" // MatStorageT
|
||||
#include "util/test_util.h"
|
||||
#include "util/threading_context.h"
|
||||
#include "hwy/profiler.h"
|
||||
#include "hwy/tests/hwy_gtest.h"
|
||||
|
||||
// clang-format off
|
||||
|
|
|
|||
|
|
@ -187,7 +187,9 @@ class NestedPools {
|
|||
// functions below.
|
||||
class IndexRangePartition {
|
||||
public:
|
||||
IndexRangePartition() = default; // for MMPartitions
|
||||
explicit IndexRangePartition(size_t single_task)
|
||||
: range_(0, single_task), task_size_(single_task), num_tasks_(1) {}
|
||||
|
||||
IndexRangePartition(const IndexRange& range, const size_t task_size)
|
||||
: range_(range), task_size_(static_cast<uint32_t>(task_size)) {
|
||||
const uint32_t num = static_cast<uint32_t>(range.Num());
|
||||
|
|
|
|||
Loading…
Reference in New Issue