1.01x speedup: improved autotune

Group M=4..7 into the same config. Add configs for power-of-two sizes.
Allow odd mc so that odd M can use a single MC range.

io.cc: warning fix (cast).
IsBlock -> !IsOneMC
benchmark_helper: print best config at verbosity 3, all configs at verbosity 4
ops_test: remove unused includes
PiperOrigin-RevId: 824475104
Jan Wassenberg, 2025-10-27 05:34:58 -07:00 (committed by Copybara-Service)
parent 8198e7104a, commit 86200ce224
7 changed files with 148 additions and 109 deletions


@@ -47,9 +47,8 @@ GemmaEnv::GemmaEnv(const LoaderArgs& loader, const ThreadingArgs& threading,
     ShowConfig(loader, threading, inference, config, gemma_.WeightReadMode(),
                ctx_);
   }
-  if (inference.verbosity >= 3) {
-    env_.print_config = env_.print_best = true;
-  }
+  if (inference.verbosity >= 3) env_.print_best = true;
+  if (inference.verbosity >= 4) env_.print_config = true;
   runtime_config_ = {
       .max_generated_tokens = inference.max_generated_tokens,


@@ -110,7 +110,8 @@ class FilePosix : public File {
         HWY_WARN(
             "Read failure at pos %zu within size %zu with offset %zu and "
            "errno %d\n",
-            pos, size, offset, errno);
+            static_cast<size_t>(pos), static_cast<size_t>(size),
+            static_cast<size_t>(offset), errno);
        break;
      }
      pos += bytes_read;
@@ -130,7 +131,8 @@ class FilePosix : public File {
        HWY_WARN(
            "Write failure at pos %zu within size %zu with offset %zu and "
            "errno %d\n",
-            pos, size, offset, errno);
+            static_cast<size_t>(pos), static_cast<size_t>(size),
+            static_cast<size_t>(offset), errno);
        break;
      }
      pos += bytes_written;
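A note on the cast fix: `%zu` requires an argument of exactly type `size_t`, while the file offsets here are plausibly a wider type such as `uint64_t` (their declarations are outside the hunk, so this is an assumption). A minimal illustration of the warning and the fix:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical stand-in for the file-position variables above, which may
  // be a 64-bit type that differs from size_t on some ABIs (e.g. 32-bit).
  const uint64_t pos = 42;
  // fprintf(stderr, "%zu", pos);  // -Wformat warning when size_t != uint64_t
  fprintf(stderr, "%zu\n", static_cast<size_t>(pos));  // matches %zu exactly
  return 0;
}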


@@ -837,10 +837,11 @@ class MMImpl {
                             hwy::platform::InvariantTicksPerSecond();
     const double flops = 2 * M * K * N * num_B / min_elapsed;  // * 2 for FMA
     if (HWY_UNLIKELY(env.print_measurement && tuner.ShouldPrint())) {
-      fprintf(stderr, "%zu,%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu\n",
-              M, K, N, num_B, flops * 1E-9, min_elapsed * 1E3, cfg.MR(),
-              cfg.MC(), cfg.KC(), cfg.NC(), StringFromOrder(cfg.Order()),
-              cfg.InnerTasks());
+      fprintf(
+          stderr,
+          "%4zu,%4zu,%4zu,B%zu,%7.1f,%.2f ms, MR%zu,%4zu,%4zu,%5zu,%-7s,%zu\n",
+          M, K, N, num_B, flops * 1E-9, min_elapsed * 1E3, cfg.MR(), cfg.MC(),
+          cfg.KC(), cfg.NC(), StringFromOrder(cfg.Order()), cfg.InnerTasks());
     }
     if (HWY_UNLIKELY(env.print_best && tuner.Best())) {
       const auto ratio = [&tuner](uint64_t ticks) -> double {
@@ -850,7 +851,8 @@ class MMImpl {
       const MMConfig& best = *tuner.Best();
       fprintf(
           stderr,
-          "\n%zu,%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu,%.2f,%.2f\n",
+          "\n%4zu,%4zu,%4zu,B%zu,%7.1f,%.2f ms, MR%zu,%4zu,%4zu,%5zu,%-7s,%zu, "
+          "%.2fx,%.2fx\n",
          M, K, N, num_B, flops * 1E-9, min_elapsed * 1E3, best.MR(), best.MC(),
          best.KC(), best.NC(), StringFromOrder(best.Order()),
          best.InnerTasks(), ratio(tuner.WorstMinTicks()),
@@ -906,8 +908,8 @@ class MMLoops {
     const auto zone = args.env.ctx.profiler_zones.Get(Zones::kMMNT);
     HWY_DASSERT(args.ranges_mc.NumTasks() == 1);
     HWY_DASSERT(args.ranges_kc.NumTasks() == 1);
-    const IndexRange& range_mc = args.ranges_mc.Range(0);
-    const IndexRange& range_kc = args.ranges_kc.Range(0);
+    const IndexRange& range_mc = args.ranges_mc.Range(0);  // whole M
+    const IndexRange& range_kc = args.ranges_kc.Range(0);  // whole K

     parallel.ForN(
         args.env.ctx, args.range_n, MultipleN(sizeof(TC), args.line_bytes),
@@ -941,7 +943,7 @@ class MMLoops {
                          const MMArgs& args) {
     const auto zone = args.env.ctx.profiler_zones.Get(Zones::kMMNT_K);
     HWY_DASSERT(args.ranges_mc.NumTasks() == 1);
-    const IndexRange& range_mc = args.ranges_mc.Range(0);
+    const IndexRange& range_mc = args.ranges_mc.Range(0);  // whole M

     parallel.ForN(args.env.ctx, args.range_n,
                   MultipleN(sizeof(TC), args.line_bytes), args.inner_tasks,
@@ -977,7 +979,7 @@ class MMLoops {
                           const MMArgs& args) {
     const auto zone = args.env.ctx.profiler_zones.Get(Zones::kMMNT_MT);
     HWY_DASSERT(args.ranges_kc.NumTasks() == 1);
-    const IndexRange& range_kc = args.ranges_kc.Range(0);
+    const IndexRange& range_kc = args.ranges_kc.Range(0);  // whole K

     parallel.ForRangesMC_NC(
         args.env.ctx, args.ranges_mc, args.ranges_nc, args.options.cluster_idx,
@@ -1158,8 +1160,9 @@ HWY_NOINLINE MMPerKey* TwoMatMul(const MatPtrT<BF16>& A, const MatPtrT<TB>& B1,
     HWY_ASSERT(K <= MMEntireA::kMaxK);
     HWY_ASSERT(N % kNR == 0);
     MMImpl::EnsureAligned(A, cache.VectorBytes());
-    tuner.SetCandidates(
-        MMCandidates(cache, M, K, N, num_B, sizeof(BF16), env.print_config));
+    const size_t max_M = MMKeys::BucketM(M);
+    tuner.SetCandidates(MMCandidates(cache, max_M, K, N, num_B, sizeof(BF16),
+                                     env.print_config));
   }
   const MMConfig& cfg = tuner.NextConfig();
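The tuner now receives the bucket's maximum M (`max_M`) instead of the exact batch size, so every M in a bucket shares one candidate list and, after tuning, one winning config. A toy model of that effect, with a hypothetical `BucketMToy` mimicking the small-M case of `MMKeys::BucketM` (assuming `kMaxMR == 4`) and a plain map standing in for the real key-to-tuner table:

#include <cstddef>
#include <cstdio>
#include <map>

size_t BucketMToy(size_t M) { return M < 64 ? (M | 3) : M; }

int main() {
  std::map<size_t, int> tuners;  // bucket top -> tuner id
  int next_id = 0;
  for (size_t M : {4, 5, 6, 7, 8}) {
    const size_t bucket = BucketMToy(M);
    if (tuners.find(bucket) == tuners.end()) tuners[bucket] = next_id++;
    printf("M=%zu -> bucket %zu -> tuner #%d\n", M, bucket, tuners[bucket]);
  }
  // M=4..7 all map to bucket 7 and reuse tuner #0; M=8 starts tuner #1.
}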


@@ -21,6 +21,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <string>
+#include <vector>
 #include "util/allocator.h"
@@ -46,7 +47,9 @@ size_t RoundDownWithFloor(size_t value, size_t multiple) {
 // multiple of `multiple`, or 0 if none exists.
 size_t PrevDivisor(const size_t begin, const size_t end, const size_t dim,
                    const size_t multiple) {
-  HWY_DASSERT(end != 0 && dim != 0 && multiple != 0);
+  HWY_DASSERT(end != 0);
+  HWY_DASSERT(dim != 0);
+  HWY_DASSERT(multiple != 0);
   size_t prev = RoundDownWithFloor(end, multiple);
   // Avoid returning `end` if rounding down had no effect.
   if (prev == end) prev -= multiple;
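Only the head of `PrevDivisor` appears in this hunk. A sketch of the remainder implied by its comment ("or 0 if none exists") — an assumption, not the actual implementation — would scan downward in steps of `multiple` for a divisor of `dim`:

#include <cstddef>

// Sketch: largest multiple of `multiple` in [begin, end) dividing `dim`.
size_t PrevDivisorSketch(size_t begin, size_t end, size_t dim,
                         size_t multiple) {
  size_t prev = (end / multiple) * multiple;  // round down, as above
  if (prev == end) prev -= multiple;  // ensure strictly less than `end`
  for (; prev >= begin && prev != 0; prev -= multiple) {
    if (dim % prev == 0) return prev;
  }
  return 0;  // none exists
}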
@@ -62,10 +65,10 @@ size_t PrevDivisor(const size_t begin, const size_t end, const size_t dim,
 // and holds most of their arguments in member variables.
 class GenerateCandidates {
  public:
-  GenerateCandidates(const CacheInfo& cache, size_t M, size_t K, size_t N,
+  GenerateCandidates(const CacheInfo& cache, size_t max_M, size_t K, size_t N,
                      size_t num_B, size_t sizeof_TC, bool print_config)
       : cache_(cache),
-        M_(M),
+        max_M_(max_M),
         K_(K),
         N_(N),
         num_B_(num_B),
@@ -89,14 +92,14 @@ class GenerateCandidates {
       for (size_t mc : MC(mr, kc, order)) {
         for (size_t nc : NC(mr, mc, kc, order)) {
           for (int inner_tasks : all_inner_tasks) {
-            const MMConfig config(K_, N_, mr, mc, kc, nc, kc_multiple_,
-                                  nc_multiple_, order, inner_tasks);
-            const size_t M_tasks = config.RangesOfMC(M_).NumTasks();
+            const MMConfig config(max_M_, K_, N_, mr, mc, kc, nc,
+                                  kc_multiple_, nc_multiple_, order,
+                                  inner_tasks);
+            const size_t M_tasks = config.RangesOfMC(max_M_).NumTasks();
             const size_t K_tasks = config.RangesOfKC(K_).NumTasks();
-            // Blocks only make sense when there are multiple M tasks.
-            if (IsBlock(order) != (M_tasks > 1)) continue;
-            // Single KC only makes sense when there is a single K task.
+            // Do not use single-MC/KC order if there are multiple.
+            if (IsOneMC(order) != (M_tasks == 1)) continue;
             if (IsOneKC(order) != (K_tasks == 1)) continue;
             candidates.push_back(config);
@@ -114,6 +117,25 @@ class GenerateCandidates {
  private:
   using SizeVec = std::vector<size_t>;

+  // Concatenate and print once because this can be called concurrently.
+  void MaybePrintSizes(size_t dim, size_t max, const char* caption,
+                       const SizeVec& sizes) const {
+    if (!print_config_ || sizes.empty()) return;
+    std::string out("num_B ");
+    out += std::to_string(num_B_);
+    out += " (";
+    out += std::to_string(dim);
+    out += ", max ";
+    out += std::to_string(max);
+    out += ") ";
+    out += caption;
+    out += ": ";
+    for (size_t size : sizes) {
+      out += std::to_string(size) + " ";
+    }
+    fprintf(stderr, "%s\n", out.c_str());
+  }
+
   // How many rows of A per call to `MMKernel::LoopKC`. Lower values may
   // be better for SIMD targets with fewer registers.
   SizeVec MR() const {
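Tracing the string construction above shows the exact output shape; the inputs below are hypothetical examples. The key point is that the whole line goes through one `fprintf`, so concurrent callers cannot interleave mid-line:

#include <cstdio>
#include <string>
#include <vector>

// Standalone replica of MaybePrintSizes for num_B = 1, dim = 4096,
// max = 256, caption "KC", sizes {256, 128}. Prints exactly:
// "num_B 1 (4096, max 256) KC: 256 128 " followed by a newline.
int main() {
  const size_t num_B = 1, dim = 4096, max_size = 256;
  const std::vector<size_t> sizes = {256, 128};
  std::string out("num_B ");
  out += std::to_string(num_B);
  out += " (" + std::to_string(dim) + ", max " + std::to_string(max_size);
  out += ") KC: ";
  for (size_t size : sizes) out += std::to_string(size) + " ";
  fprintf(stderr, "%s\n", out.c_str());  // single write, no interleaving
}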
@@ -125,14 +147,14 @@ class GenerateCandidates {
     SizeVec all_mr;
     all_mr.reserve(3);
     // AVX2's 16 registers are not enough for four rows, but SSE4 may benefit.
-    if (M_ >= kMaxMR && !is_avx2) all_mr.push_back(kMaxMR);
+    if (max_M_ >= kMaxMR && !is_avx2) all_mr.push_back(kMaxMR);
     // Allow for AVX-512 but not SSE4 (for which 4 are usually better). Also
     // enable if not enough rows for 4.
-    if (M_ >= 2 && (M_ < kMaxMR || (!is_sse && !is_wasm))) {
+    if (max_M_ >= 2 && (max_M_ < kMaxMR || (!is_sse && !is_wasm))) {
       all_mr.push_back(size_t{2});
     }
     // Even SSE4 usually prefers 2 rows; only enable for single rows.
-    if (M_ == 1) all_mr.push_back(size_t{1});
+    if (max_M_ == 1) all_mr.push_back(size_t{1});
     HWY_ASSERT(!all_mr.empty());
     return all_mr;
   }
@@ -143,18 +165,21 @@ class GenerateCandidates {
     for (size_t order_idx = 0;; ++order_idx) {
       const MMOrder order = static_cast<MMOrder>(order_idx);
       if (StringFromOrder(order) == nullptr) return orders;  // done
-      // 2D blocking is useless for a single row of M.
-      if (IsBlock(order) && M_ <= mr) continue;
+      // Multiple-MC is useless for a single row of M.
+      if (!IsOneMC(order) && max_M_ <= mr) continue;
       // Conversely, N-only parallelism is uncompetitive for large M.
-      if (!IsBlock(order) && M_ >= kMaxTilesM * mr) continue;
+      if (IsOneMC(order) && max_M_ >= 8 * mr) continue;
       orders.push_back(order);
     }
   }
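Both gates, evaluated for the new M=4..7 bucket (with a hypothetical mr = 4): multiple-MC orders survive because max_M > mr, and single-MC orders survive because max_M < 8 * mr, so both families compete during autotuning:

#include <cstddef>
#include <cstdio>

int main() {
  const size_t max_M = 7, mr = 4;  // the M=4..7 bucket from this commit
  const bool keep_multiple_mc = max_M > mr;    // gate on !IsOneMC orders
  const bool keep_single_mc = max_M < 8 * mr;  // gate on IsOneMC orders
  printf("multiple-MC kept: %d, single-MC kept: %d\n", keep_multiple_mc,
         keep_single_mc);  // prints 1, 1: both families are candidates
}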

   // The number of A and B columns to read between updating `C`.
   SizeVec KC(size_t mr, MMOrder order) const {
+    // Must return the actual value: although ignored by `RangesOfKC`, this
+    // will be used in MC() and NC().
+    if (IsOneKC(order)) return SizeVec(1, K_);
     // `LoopKC` handles up to `mr` rows of A.
-    const size_t rows_a = HWY_MIN(M_, mr);
+    const size_t rows_a = HWY_MIN(max_M_, mr);
     // After looping over `kc` columns, we write `mr x 4` outputs and 16 vector
     // `buf`. To amortize the write cost, we want to maximize `kc`. However, it
@@ -186,7 +211,7 @@ class GenerateCandidates {
     // If we can afford a single K task, that's usually best; only try one
     // more. Otherwise, blocks may require smaller kc (more options).
-    const size_t reps = (kc_max == K_) ? 1 : IsBlock(order) ? 3 : 2;
+    const size_t reps = (kc_max == K_) ? 1 : IsOneMC(order) ? 2 : 3;
     size_t prev = kc_max;
     for (size_t rep = 0; rep < reps; ++rep) {
@@ -196,19 +221,16 @@ class GenerateCandidates {
       }
     }
-    if (print_config_ && all_kc.size() > 1) {
-      fprintf(stderr, "num_B %zu: KC: ", num_B_);
-      for (size_t kc : all_kc) {
-        fprintf(stderr, "%zu ", kc);
-      }
-      fprintf(stderr, "\n");
-    }
+    MaybePrintSizes(K_, kc_max, "KC", all_kc);
     return all_kc;
   }

   // The number of (L2 resident) A rows for `A2C0` to loop over.
   SizeVec MC(size_t mr, size_t kc, MMOrder order) const {
+    // Must return the actual value: although ignored by `RangesOfMC`, this
+    // will be used in NC().
+    if (IsOneMC(order) || max_M_ <= mr) return SizeVec(1, max_M_);
     // Typically 12-24K. The B rows are pinned in L1, but also occupy L2
     // because it is typically inclusive.
     const size_t bytes_b = kNR * kc * (sizeof(SfpStream) + sizeof(BF16));
@@ -219,35 +241,45 @@ class GenerateCandidates {
     const size_t bytes_per_mc = kc * sizeof(BF16) + cache_.LineBytes();
     size_t mc_max = hwy::DivCeil(cache_.L2Bytes() - bytes_b, bytes_per_mc);
     mc_max = HWY_MIN(mc_max, HWY_MIN(kMaxBatchSize, kMaxMC));
-    mc_max = HWY_MIN(mc_max, M_);
-    mc_max = hwy::RoundDownTo(mc_max, mr);
-    SizeVec all_mc(1, mc_max);
-    // Larger MC is better for non-blocks, otherwise we want more small
-    // options, especially for two B.
-    const size_t reps = !IsBlock(order) ? 2 : (2 + num_B_);
-    size_t prev = mc_max;
-    for (size_t rep = 0; rep < reps; ++rep) {
-      prev = PrevDivisor(1, prev, M_, mr);
-      if (prev >= mc_max || prev == 0) break;
-      all_mc.push_back(prev);
-    }
-    // Blocks: largest is not useful.
-    if (IsBlock(order) && all_mc.size() > 1) {
-      all_mc.erase(all_mc.begin(), all_mc.begin() + 1);
-    }
-    if (print_config_ && all_mc.size() > 1) {
-      fprintf(stderr, "num_B %zu: MC: ", num_B_);
-      for (size_t mc : all_mc) {
-        fprintf(stderr, "%zu ", mc);
-      }
-      fprintf(stderr, "\n");
-    }
+    mc_max = HWY_MIN(mc_max, max_M_);
+    HWY_DASSERT(mc_max != 0);
+    SizeVec all_mc;
+    all_mc.reserve(6);
+    const size_t rounded_M = HWY_MAX(mr, hwy::RoundDownTo(max_M_, mr));
+    size_t prev = hwy::RoundDownTo(mc_max, mr);
+    // If mc_max is large enough, allow using the whole range without rounding
+    // down (which may require two ranges).
+    if (mc_max == max_M_ && (max_M_ % mr) != 0) {
+      all_mc.push_back(max_M_);
+      // The next option should be considerably smaller than `max_M_`.
+      prev = HWY_MAX(mr, hwy::RoundDownTo(3 * prev / 4, mr));
+    } else {
+      all_mc.push_back(prev);
+    }
+    // We know `order` is multiple MC, where more/smaller values of `mc` are
+    // helpful, especially for two B, hence add iterations.
+    const size_t reps = 2 + num_B_;
+    for (size_t rep = 0; rep < reps; ++rep) {
+      prev = PrevDivisor(mr, prev, rounded_M, mr);
+      if (prev == 0) break;  // none found
+      if (prev == mr) {
+        if (all_mc.back() != prev) all_mc.push_back(prev);
+        break;
+      }
+      if (prev <= mc_max / 8) break;
+      all_mc.push_back(prev);
+    }
+    if (all_mc.size() <= 2) {
+      if (max_M_ > mr) all_mc.push_back(max_M_ / 2);
+      if (mc_max > mr) all_mc.push_back(mc_max / 2);
+    }
+    MaybePrintSizes(max_M_, mc_max, "MC", all_mc);
     return all_mc;
   }
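A walkthrough of the first MC candidate above, assuming the L2 budget does not bind (mc_max == max_M). This is where the commit message's "allow odd mc" lands: the M=4..7 bucket has max_M = 7, and emitting the odd 7 itself later lets a single MC range cover all rows:

#include <cstddef>
#include <cstdio>

int main() {
  const size_t max_M = 7, mr = 4, mc_max = 7;  // hypothetical bucket values
  size_t first_mc;
  if (mc_max == max_M && (max_M % mr) != 0) {
    first_mc = max_M;               // whole odd range, not rounded to mr
  } else {
    first_mc = (mc_max / mr) * mr;  // hwy::RoundDownTo(mc_max, mr)
  }
  printf("first MC candidate: %zu\n", first_mc);  // prints 7
}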
@@ -257,7 +289,7 @@ class GenerateCandidates {
     // Only if there will be reuse of B: choose the largest `nc_max` (C cols)
     // such that `nc x kc` of B and `mc x nc` of `C` fit in L3. Otherwise,
     // leave it unbounded.
-    if (M_ > mr) {
+    if (max_M_ > mr) {
       const size_t bytes_per_nc = (kc * sizeof(BF16) + mc * sizeof_TC_);
       nc_max = HWY_MIN(hwy::DivCeil(cache_.L3Bytes(), bytes_per_nc), kMaxNC);
     }
@@ -271,8 +303,8 @@ class GenerateCandidates {
       nc_max = RoundDownWithFloor(N_ / 2, nc_multiple_);
     }
-    // Non-block calls ForNP, which ignores `range_nc` and uses `range_np`.
-    if (!IsBlock(order)) return SizeVec(1, N_);
+    // Single-MC calls `ForNP`, which ignores `range_nc`.
+    if (IsOneMC(order)) return SizeVec(1, N_);

     SizeVec all_nc(1, nc_max);
@@ -282,7 +314,7 @@ class GenerateCandidates {
     // hence autotune a wider range of nc than the other dimensions.
     size_t reps = 9 + num_B_;
     // For small M, we can afford larger NC, hence allow fewer small options.
-    if (M_ <= 2 * mr) reps -= 1;
+    if (max_M_ <= 2 * mr) reps -= 1;
     size_t prev = nc_max;
     for (size_t rep = 0; rep < reps; ++rep) {
@@ -302,14 +334,7 @@ class GenerateCandidates {
                    all_nc.begin() + HWY_MIN(want_delete, max_delete));
     }
-    if (print_config_ && all_nc.size() > 1) {
-      fprintf(stderr, "num_B %zu: NC: ", num_B_);
-      for (size_t nc : all_nc) {
-        fprintf(stderr, "%zu ", nc);
-      }
-      fprintf(stderr, "\n");
-    }
+    MaybePrintSizes(N_, nc_max, "NC", all_nc);
     return all_nc;
   }
@@ -319,8 +344,8 @@ class GenerateCandidates {
     std::vector<int> inner_tasks;
     inner_tasks.reserve(3);
     inner_tasks.push_back(1);
-    // Blocks have one task per mc/nc range and ignore this parameter.
-    if (!IsBlock(order)) {
+    // Multiple-MC have one task per mc/nc range and ignore this parameter.
+    if (IsOneMC(order)) {
       inner_tasks.push_back(2);
       inner_tasks.push_back(4);
     }
@@ -328,7 +353,7 @@ class GenerateCandidates {
   }

   const CacheInfo& cache_;
-  const size_t M_;
+  const size_t max_M_;
   const size_t K_;
   const size_t N_;
   const size_t num_B_;
@@ -343,10 +368,11 @@ class GenerateCandidates {
 }  // namespace

 // Facade to avoid exposing `GenerateCandidates` in the header.
-std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t M, size_t K,
-                                   size_t N, size_t num_B, size_t sizeof_TC,
-                                   bool print_config) {
-  return GenerateCandidates(cache, M, K, N, num_B, sizeof_TC, print_config)();
+std::vector<MMConfig> MMCandidates(const CacheInfo& cache, size_t max_M,
+                                   size_t K, size_t N, size_t num_B,
+                                   size_t sizeof_TC, bool print_config) {
+  return GenerateCandidates(cache, max_M, K, N, num_B, sizeof_TC,
+                            print_config)();
 }

 MatMulEnv::MatMulEnv(ThreadingContext& ctx)


@@ -331,8 +331,8 @@ void DispatchOrder(MMOrder order, const Func& func, Args&&... args) {
   }
 }

-static inline bool IsBlock(MMOrder order) {
-  return order == MMOrder::kNT_MT_K || order == MMOrder::kNT_MT;
+static inline bool IsOneMC(MMOrder order) {
+  return order == MMOrder::kNT || order == MMOrder::kNT_K;
 }

 static inline bool IsOneKC(MMOrder order) {
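For reference, the rename in self-contained form (the enumerator names appear in this diff; their declaration order here is an assumption). IsBlock(order) was true for the kNT_MT[_K] orders; IsOneMC is exactly its negation, the orders that use a single MC range:

enum class MMOrder { kNT, kNT_K, kNT_MT, kNT_MT_K };

constexpr bool IsOneMC(MMOrder order) {
  return order == MMOrder::kNT || order == MMOrder::kNT_K;
}
static_assert(IsOneMC(MMOrder::kNT) && !IsOneMC(MMOrder::kNT_MT),
              "IsOneMC(x) == !IsBlock(x) for every order");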
@@ -381,6 +381,8 @@ static inline const char* StringFromParA(MMParA par_a) {
 // `mc` := A rows such that `kc` columns fit in L2,
 // `nc` := B rows such that `kc` columns fit in L3 alongside `mc x nc` C.
 // Also includes loop order and task granularity.
+//
+// This is shared by multiple M which return the same `BucketM`.
 #pragma pack(push, 1)
 class MMConfig {
  public:
@@ -388,8 +390,8 @@ class MMConfig {
   // `mr` is the number of A rows per call to `MMKernel::LoopKC`.
   // `MMOrder` is how to parallelize the outer loops.
   // `inner_tasks` chooses the within-cluster task granularity in `ForN`.
-  MMConfig(size_t K, size_t N, size_t mr, size_t mc, size_t kc, size_t nc,
-           size_t kc_multiple, size_t nc_multiple, MMOrder order,
+  MMConfig(size_t M, size_t K, size_t N, size_t mr, size_t mc, size_t kc,
+           size_t nc, size_t kc_multiple, size_t nc_multiple, MMOrder order,
            int inner_tasks)
       : mr_(static_cast<uint32_t>(mr)),
         mc_(static_cast<uint32_t>(mc)),
@@ -401,12 +403,8 @@ class MMConfig {
         inner_tasks_(static_cast<uint8_t>(inner_tasks)),
         reserved_{} {
     HWY_DASSERT(mr == 1 || mr == 2 || mr == 4);
-    if (mc % mr != 0) {
-      HWY_WARN("mc %zu not a multiple of mr %zu", mc, mr);
-    }
-    // Do not warn for single-kc tasks; some models unfortunately have K which
-    // are not multiples of `kc_multiple`.
-    if (kc != K && (kc % kc_multiple) != 0) {
+    // Some models have K which are not multiples of `kc_multiple`.
+    if (!IsOneKC(order) && (kc % kc_multiple) != 0) {
       HWY_WARN("kc %zu not a multiple of kc_multiple %zu", kc, kc_multiple);
     }
     if (nc != N && (nc % nc_multiple) != 0) {
@@ -417,11 +415,21 @@ class MMConfig {
   }

   // Splits M/N into blocks which are visited sequentially or in parallel.
   // K is always sequential, see `MMOrder`.
   IndexRangePartition RangesOfMC(size_t M) const {
-    return MaxSizePartition(IndexRange(0, M), mc_, mr_);
+    if (IsOneMC(order_)) {
+      // Must have exactly one M range/tile, regardless of `mr_` and `mc_`.
+      return IndexRangePartition(M);
+    }
+    const size_t mc = HWY_MIN(M, MC());
+    const size_t mr = HWY_MIN(M, MR());
+    return MaxSizePartition(IndexRange(0, M), mc, mr);
   }

+  // K is either a single range, or a sequential loop.
   IndexRangePartition RangesOfKC(size_t K) const {
+    if (IsOneKC(order_)) {
+      // Must have exactly one K range/tile, regardless of `kc_`.
+      return IndexRangePartition(K);
+    }
     return MaxSizePartition(IndexRange(0, K), kc_, kc_multiple_);
   }

   IndexRangePartition RangesOfNC(size_t N) const {
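Why the dedicated single-range path matters: for odd M, any `mc` rounded down to `mr` splits M into two ranges, which the single-MC orders cannot execute. A toy split illustrating this (the splitting rule here is a simplification, not the real `MaxSizePartition`):

#include <cstddef>
#include <cstdio>

int main() {
  const size_t M = 7;  // odd M, e.g. the top of the M=4..7 bucket
  for (size_t mc : {size_t{4}, size_t{7}}) {
    size_t num_ranges = 0;
    for (size_t begin = 0; begin < M; begin += mc) ++num_ranges;
    // mc=4 -> 2 ranges ([0,4), [4,7)); odd mc=7 -> a single range [0,7).
    printf("mc=%zu -> %zu range(s) of M=%zu\n", mc, num_ranges, M);
  }
}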
@@ -448,7 +456,7 @@ class MMConfig {
   uint32_t kc_multiple_;
   MMOrder order_;
   uint8_t inner_tasks_;
-  HWY_MAYBE_UNUSED uint8_t reserved_[6];
+  HWY_MEMBER_VAR_MAYBE_UNUSED uint8_t reserved_[6];
 };
 static_assert(sizeof(MMConfig) == 32);  // for faster indexing
 #pragma pack(pop)
@@ -557,26 +565,27 @@ class MMAutoTune {
 //------------------------------------------------------------------------------

 // Minimum M, in units of tile rows of height mr={1, 2, 4}, from which
 // `MMOrder::kNT[_K]` are no longer allowed. They require a single MC range,
 // but choosing the same config for a larger M can result in multiple MC
 // ranges. Thus M less than this must have unique keys/configs.
 HWY_INLINE_VAR constexpr size_t kMaxTilesM = 8;

 // Map of previously seen dimensions to index via linear search.
 class MMKeys {
-  // Group batch size into buckets to reduce #auto-tunes.
-  static size_t BucketM(size_t M) {
-    if (M < kMaxTilesM * kMaxMR) return M;  // See kMaxTilesM above.
-    if (M <= 128) return 128;
-    return 512;
-  }
-
  public:
   using Key = uint64_t;
   // KeyFromDims will only return this if all dims are zero, which is invalid.
   static constexpr Key kPadding = 0;

+  // Returns the maximum permissible M in the bucket, for grouping batch sizes
+  // into buckets to reduce #auto-tunes.
+  static size_t BucketM(size_t M) {
+    HWY_DASSERT(M != 0);
+    // Small M: 1..3, 4..7, 8..15, etc. share the same config.
+    if (M < 64) return M | (kMaxMR - 1);
+    // Larger M use power of two buckets: 64..127, 128..255, etc.
+    const size_t floor_log2_M =
+        31 - hwy::Num0BitsAboveMS1Bit_Nonzero32(static_cast<uint32_t>(M));
+    const size_t min_M = size_t{1} << floor_log2_M;
+    HWY_DASSERT(min_M <= M && M < 2 * min_M);
+    return 2 * min_M - 1;
+  }
+
   // Compresses the dimensions into a single Key for faster comparison.
   static Key KeyFromDims(size_t M, size_t K, size_t N, size_t num_B) {
     HWY_DASSERT(M < (Key{1} << 16));  // batch sizes are smaller
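A standalone recomputation of the bucket tops, assuming `kMaxMR == 4` and substituting a portable loop for `hwy::Num0BitsAboveMS1Bit_Nonzero32`. Note that with `M | 3` the small-M buckets are 4 wide, so 8..11 and 12..15 are separate buckets:

#include <cstddef>
#include <cstdio>

size_t BucketM(size_t M) {
  if (M < 64) return M | (4 - 1);  // 4-wide buckets: 4..7 -> 7, 8..11 -> 11
  size_t min_M = 64;
  while (2 * min_M <= M) min_M *= 2;  // min_M = 2^floor(log2(M))
  return 2 * min_M - 1;               // 64..127 -> 127, 128..255 -> 255
}

int main() {
  for (size_t m : {1, 3, 4, 7, 8, 63, 64, 127, 128, 512}) {
    printf("M=%zu -> %zu\n", m, BucketM(m));  // e.g. M=512 -> 1023
  }
}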


@@ -14,7 +14,6 @@
 // limitations under the License.

 #include "compression/types.h"
-#include "util/zones.h"

 #ifndef HWY_DISABLED_TARGETS
 #define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
 #endif  // HWY_DISABLED_TARGETS
@@ -38,7 +37,6 @@
 #include "util/mat.h"  // MatStorageT
 #include "util/test_util.h"
 #include "util/threading_context.h"
-#include "hwy/profiler.h"
 #include "hwy/tests/hwy_gtest.h"

 // clang-format off


@@ -187,7 +187,9 @@ class NestedPools {
 // functions below.
 class IndexRangePartition {
  public:
   IndexRangePartition() = default;  // for MMPartitions
+  explicit IndexRangePartition(size_t single_task)
+      : range_(0, single_task), task_size_(single_task), num_tasks_(1) {}
   IndexRangePartition(const IndexRange& range, const size_t task_size)
       : range_(range), task_size_(static_cast<uint32_t>(task_size)) {
     const uint32_t num = static_cast<uint32_t>(range.Num());
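The contract of the new constructor, shown with a minimal stand-in (not the real class): exactly one task whose range spans [0, single_task), which is what `RangesOfMC`/`RangesOfKC` rely on for the one-MC/one-KC orders:

#include <cstddef>
#include <cstdio>

// Minimal stand-in mirroring the single-task constructor above.
struct ToyPartition {
  explicit ToyPartition(size_t single_task)
      : begin_(0), end_(single_task), num_tasks_(1) {}
  size_t NumTasks() const { return num_tasks_; }
  size_t begin_, end_, num_tasks_;
};

int main() {
  ToyPartition ranges(7);  // e.g. whole M for an IsOneMC order
  printf("tasks=%zu range=[%zu, %zu)\n", ranges.NumTasks(), ranges.begin_,
         ranges.end_);  // tasks=1 range=[0, 7)
}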