diff --git a/evals/benchmark_helper.cc b/evals/benchmark_helper.cc index 9e4c1b6..860e934 100644 --- a/evals/benchmark_helper.cc +++ b/evals/benchmark_helper.cc @@ -47,9 +47,8 @@ GemmaEnv::GemmaEnv(const LoaderArgs& loader, const ThreadingArgs& threading, ShowConfig(loader, threading, inference, config, gemma_.WeightReadMode(), ctx_); } - if (inference.verbosity >= 3) { - env_.print_config = env_.print_best = true; - } + if (inference.verbosity >= 3) env_.print_best = true; + if (inference.verbosity >= 4) env_.print_config = true; runtime_config_ = { .max_generated_tokens = inference.max_generated_tokens, diff --git a/io/io.cc b/io/io.cc index 9363b07..bd0d72b 100644 --- a/io/io.cc +++ b/io/io.cc @@ -110,7 +110,8 @@ class FilePosix : public File { HWY_WARN( "Read failure at pos %zu within size %zu with offset %zu and " "errno %d\n", - pos, size, offset, errno); + static_cast(pos), static_cast(size), + static_cast(offset), errno); break; } pos += bytes_read; @@ -130,7 +131,8 @@ class FilePosix : public File { HWY_WARN( "Write failure at pos %zu within size %zu with offset %zu and " "errno %d\n", - pos, size, offset, errno); + static_cast(pos), static_cast(size), + static_cast(offset), errno); break; } pos += bytes_written; diff --git a/ops/matmul-inl.h b/ops/matmul-inl.h index 96cd4f1..4b217a1 100644 --- a/ops/matmul-inl.h +++ b/ops/matmul-inl.h @@ -837,10 +837,11 @@ class MMImpl { hwy::platform::InvariantTicksPerSecond(); const double flops = 2 * M * K * N * num_B / min_elapsed; // * 2 for FMA if (HWY_UNLIKELY(env.print_measurement && tuner.ShouldPrint())) { - fprintf(stderr, "%zu,%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu\n", - M, K, N, num_B, flops * 1E-9, min_elapsed * 1E3, cfg.MR(), - cfg.MC(), cfg.KC(), cfg.NC(), StringFromOrder(cfg.Order()), - cfg.InnerTasks()); + fprintf( + stderr, + "%4zu,%4zu,%4zu,B%zu,%7.1f,%.2f ms, MR%zu,%4zu,%4zu,%5zu,%-7s,%zu\n", + M, K, N, num_B, flops * 1E-9, min_elapsed * 1E3, cfg.MR(), cfg.MC(), + cfg.KC(), cfg.NC(), StringFromOrder(cfg.Order()), cfg.InnerTasks()); } if (HWY_UNLIKELY(env.print_best && tuner.Best())) { const auto ratio = [&tuner](uint64_t ticks) -> double { @@ -850,7 +851,8 @@ class MMImpl { const MMConfig& best = *tuner.Best(); fprintf( stderr, - "\n%zu,%zu,%zu,%zu,%7.1f,%.2f,%zu,%4zu,%4zu,%5zu,%s,%zu,%.2f,%.2f\n", + "\n%4zu,%4zu,%4zu,B%zu,%7.1f,%.2f ms, MR%zu,%4zu,%4zu,%5zu,%-7s,%zu, " + "%.2fx,%.2fx\n", M, K, N, num_B, flops * 1E-9, min_elapsed * 1E3, best.MR(), best.MC(), best.KC(), best.NC(), StringFromOrder(best.Order()), best.InnerTasks(), ratio(tuner.WorstMinTicks()), @@ -906,8 +908,8 @@ class MMLoops { const auto zone = args.env.ctx.profiler_zones.Get(Zones::kMMNT); HWY_DASSERT(args.ranges_mc.NumTasks() == 1); HWY_DASSERT(args.ranges_kc.NumTasks() == 1); - const IndexRange& range_mc = args.ranges_mc.Range(0); - const IndexRange& range_kc = args.ranges_kc.Range(0); + const IndexRange& range_mc = args.ranges_mc.Range(0); // whole M + const IndexRange& range_kc = args.ranges_kc.Range(0); // whole K parallel.ForN( args.env.ctx, args.range_n, MultipleN(sizeof(TC), args.line_bytes), @@ -941,7 +943,7 @@ class MMLoops { const MMArgs& args) { const auto zone = args.env.ctx.profiler_zones.Get(Zones::kMMNT_K); HWY_DASSERT(args.ranges_mc.NumTasks() == 1); - const IndexRange& range_mc = args.ranges_mc.Range(0); + const IndexRange& range_mc = args.ranges_mc.Range(0); // whole M parallel.ForN(args.env.ctx, args.range_n, MultipleN(sizeof(TC), args.line_bytes), args.inner_tasks, @@ -977,7 +979,7 @@ class MMLoops { const MMArgs& args) { const auto zone = args.env.ctx.profiler_zones.Get(Zones::kMMNT_MT); HWY_DASSERT(args.ranges_kc.NumTasks() == 1); - const IndexRange& range_kc = args.ranges_kc.Range(0); + const IndexRange& range_kc = args.ranges_kc.Range(0); // whole K parallel.ForRangesMC_NC( args.env.ctx, args.ranges_mc, args.ranges_nc, args.options.cluster_idx, @@ -1158,8 +1160,9 @@ HWY_NOINLINE MMPerKey* TwoMatMul(const MatPtrT& A, const MatPtrT& B1, HWY_ASSERT(K <= MMEntireA::kMaxK); HWY_ASSERT(N % kNR == 0); MMImpl::EnsureAligned(A, cache.VectorBytes()); - tuner.SetCandidates( - MMCandidates(cache, M, K, N, num_B, sizeof(BF16), env.print_config)); + const size_t max_M = MMKeys::BucketM(M); + tuner.SetCandidates(MMCandidates(cache, max_M, K, N, num_B, sizeof(BF16), + env.print_config)); } const MMConfig& cfg = tuner.NextConfig(); diff --git a/ops/matmul.cc b/ops/matmul.cc index ebeff9b..5a9fb0d 100644 --- a/ops/matmul.cc +++ b/ops/matmul.cc @@ -21,6 +21,7 @@ #include #include +#include #include #include "util/allocator.h" @@ -46,7 +47,9 @@ size_t RoundDownWithFloor(size_t value, size_t multiple) { // multiple of `multiple`, or 0 if none exists. size_t PrevDivisor(const size_t begin, const size_t end, const size_t dim, const size_t multiple) { - HWY_DASSERT(end != 0 && dim != 0 && multiple != 0); + HWY_DASSERT(end != 0); + HWY_DASSERT(dim != 0); + HWY_DASSERT(multiple != 0); size_t prev = RoundDownWithFloor(end, multiple); // Avoid returning `end` if rounding down had no effect. if (prev == end) prev -= multiple; @@ -62,10 +65,10 @@ size_t PrevDivisor(const size_t begin, const size_t end, const size_t dim, // and holds most of their arguments in member variables. class GenerateCandidates { public: - GenerateCandidates(const CacheInfo& cache, size_t M, size_t K, size_t N, + GenerateCandidates(const CacheInfo& cache, size_t max_M, size_t K, size_t N, size_t num_B, size_t sizeof_TC, bool print_config) : cache_(cache), - M_(M), + max_M_(max_M), K_(K), N_(N), num_B_(num_B), @@ -89,14 +92,14 @@ class GenerateCandidates { for (size_t mc : MC(mr, kc, order)) { for (size_t nc : NC(mr, mc, kc, order)) { for (int inner_tasks : all_inner_tasks) { - const MMConfig config(K_, N_, mr, mc, kc, nc, kc_multiple_, - nc_multiple_, order, inner_tasks); - const size_t M_tasks = config.RangesOfMC(M_).NumTasks(); + const MMConfig config(max_M_, K_, N_, mr, mc, kc, nc, + kc_multiple_, nc_multiple_, order, + inner_tasks); + const size_t M_tasks = config.RangesOfMC(max_M_).NumTasks(); const size_t K_tasks = config.RangesOfKC(K_).NumTasks(); - // Blocks only make sense when there are multiple M tasks. - if (IsBlock(order) != (M_tasks > 1)) continue; - // Single KC only makes sense when there is a single K task. + // Do not use single-MC/KC order if there are multiple. + if (IsOneMC(order) != (M_tasks == 1)) continue; if (IsOneKC(order) != (K_tasks == 1)) continue; candidates.push_back(config); @@ -114,6 +117,25 @@ class GenerateCandidates { private: using SizeVec = std::vector; + // Concatenate and print once because this can be called concurrently. + void MaybePrintSizes(size_t dim, size_t max, const char* caption, + const SizeVec& sizes) const { + if (!print_config_ || sizes.empty()) return; + std::string out("num_B "); + out += std::to_string(num_B_); + out += " ("; + out += std::to_string(dim); + out += ", max "; + out += std::to_string(max); + out += ") "; + out += caption; + out += ": "; + for (size_t size : sizes) { + out += std::to_string(size) + " "; + } + fprintf(stderr, "%s\n", out.c_str()); + } + // How many rows of A per call to `MMKernel::LoopKC`. Lower values may // be better for SIMD targets with fewer registers. SizeVec MR() const { @@ -125,14 +147,14 @@ class GenerateCandidates { SizeVec all_mr; all_mr.reserve(3); // AVX2's 16 registers are not enough for four rows, but SSE4 may benefit. - if (M_ >= kMaxMR && !is_avx2) all_mr.push_back(kMaxMR); + if (max_M_ >= kMaxMR && !is_avx2) all_mr.push_back(kMaxMR); // Allow for AVX-512 but not SSE4 (for which 4 are usually better). Also // enable if not enough rows for 4. - if (M_ >= 2 && (M_ < kMaxMR || (!is_sse && !is_wasm))) { + if (max_M_ >= 2 && (max_M_ < kMaxMR || (!is_sse && !is_wasm))) { all_mr.push_back(size_t{2}); } // Even SSE4 usually prefers 2 rows; only enable for single rows. - if (M_ == 1) all_mr.push_back(size_t{1}); + if (max_M_ == 1) all_mr.push_back(size_t{1}); HWY_ASSERT(!all_mr.empty()); return all_mr; } @@ -143,18 +165,21 @@ class GenerateCandidates { for (size_t order_idx = 0;; ++order_idx) { const MMOrder order = static_cast(order_idx); if (StringFromOrder(order) == nullptr) return orders; // done - // 2D blocking is useless for a single row of M. - if (IsBlock(order) && M_ <= mr) continue; + // Multiple-MC is useless for a single row of M. + if (!IsOneMC(order) && max_M_ <= mr) continue; // Conversely, N-only parallelism is uncompetitive for large M. - if (!IsBlock(order) && M_ >= kMaxTilesM * mr) continue; + if (IsOneMC(order) && max_M_ >= 8 * mr) continue; orders.push_back(order); } } // The number of A and B columns to read between updating `C`. SizeVec KC(size_t mr, MMOrder order) const { + // Must return the actual value: although ignored by `RangesOfKC`, this will + // be used in MC() and NC(). + if (IsOneKC(order)) return SizeVec(1, K_); // `LoopKC` handles up to `mr` rows of A. - const size_t rows_a = HWY_MIN(M_, mr); + const size_t rows_a = HWY_MIN(max_M_, mr); // After looping over `kc` columns, we write `mr x 4` outputs and 16 vector // `buf`. To amortize the write cost, we want to maximize `kc`. However, it @@ -186,7 +211,7 @@ class GenerateCandidates { // If we can afford a single K task, that's usually best; only try one // more. Otherwise, blocks may require smaller kc (more options). - const size_t reps = (kc_max == K_) ? 1 : IsBlock(order) ? 3 : 2; + const size_t reps = (kc_max == K_) ? 1 : IsOneMC(order) ? 2 : 3; size_t prev = kc_max; for (size_t rep = 0; rep < reps; ++rep) { @@ -196,19 +221,16 @@ class GenerateCandidates { } } - if (print_config_ && all_kc.size() > 1) { - fprintf(stderr, "num_B %zu: KC: ", num_B_); - for (size_t kc : all_kc) { - fprintf(stderr, "%zu ", kc); - } - fprintf(stderr, "\n"); - } - + MaybePrintSizes(K_, kc_max, "KC", all_kc); return all_kc; } // The number of (L2 resident) A rows for `A2C0` to loop over. SizeVec MC(size_t mr, size_t kc, MMOrder order) const { + // Must return the actual value: although ignored by `RangesOfMC`, this will + // be used in NC(). + if (IsOneMC(order) || max_M_ <= mr) return SizeVec(1, max_M_); + // Typically 12-24K. The B rows are pinned in L1, but also occupy L2 because // it is typically inclusive. const size_t bytes_b = kNR * kc * (sizeof(SfpStream) + sizeof(BF16)); @@ -219,35 +241,45 @@ class GenerateCandidates { const size_t bytes_per_mc = kc * sizeof(BF16) + cache_.LineBytes(); size_t mc_max = hwy::DivCeil(cache_.L2Bytes() - bytes_b, bytes_per_mc); mc_max = HWY_MIN(mc_max, HWY_MIN(kMaxBatchSize, kMaxMC)); + mc_max = HWY_MIN(mc_max, max_M_); HWY_DASSERT(mc_max != 0); - mc_max = HWY_MIN(mc_max, M_); - mc_max = hwy::RoundDownTo(mc_max, mr); - SizeVec all_mc(1, mc_max); - // Larger MC is better for non-blocks, otherwise we want more small options, - // especially for two B. - const size_t reps = !IsBlock(order) ? 2 : (2 + num_B_); + SizeVec all_mc; + all_mc.reserve(6); - size_t prev = mc_max; - for (size_t rep = 0; rep < reps; ++rep) { - prev = PrevDivisor(1, prev, M_, mr); - if (prev >= mc_max || prev == 0) break; + const size_t rounded_M = HWY_MAX(mr, hwy::RoundDownTo(max_M_, mr)); + size_t prev = hwy::RoundDownTo(mc_max, mr); + + // If mc_max is large enough, allow using the whole range without rounding + // down (which may require two ranges). + if (mc_max == max_M_ && (max_M_ % mr) != 0) { + all_mc.push_back(max_M_); + // The next option should be considerably smaller than `max_M_`. + prev = HWY_MAX(mr, hwy::RoundDownTo(3 * prev / 4, mr)); + } else { all_mc.push_back(prev); } - // Blocks: largest is not useful. - if (IsBlock(order) && all_mc.size() > 1) { - all_mc.erase(all_mc.begin(), all_mc.begin() + 1); - } - - if (print_config_ && all_mc.size() > 1) { - fprintf(stderr, "num_B %zu: MC: ", num_B_); - for (size_t mc : all_mc) { - fprintf(stderr, "%zu ", mc); + // We know `order` is multiple MC, where more/smaller values of `mc` are + // helpful, especially for two B, hence add iterations. + const size_t reps = 2 + num_B_; + for (size_t rep = 0; rep < reps; ++rep) { + prev = PrevDivisor(mr, prev, rounded_M, mr); + if (prev == 0) break; // none found + if (prev == mr) { + if (all_mc.back() != prev) all_mc.push_back(prev); + break; } - fprintf(stderr, "\n"); + if (prev <= mc_max / 8) break; + all_mc.push_back(prev); } + if (all_mc.size() <= 2) { + if (max_M_ > mr) all_mc.push_back(max_M_ / 2); + if (mc_max > mr) all_mc.push_back(mc_max / 2); + } + + MaybePrintSizes(max_M_, mc_max, "MC", all_mc); return all_mc; } @@ -257,7 +289,7 @@ class GenerateCandidates { // Only if there will be reuse of B: choose the largest `nc_max` (C cols) // such that `nc x kc` of B and `mc x nc` of `C` fit in L3. Otherwise, // leave it unbounded. - if (M_ > mr) { + if (max_M_ > mr) { const size_t bytes_per_nc = (kc * sizeof(BF16) + mc * sizeof_TC_); nc_max = HWY_MIN(hwy::DivCeil(cache_.L3Bytes(), bytes_per_nc), kMaxNC); } @@ -271,8 +303,8 @@ class GenerateCandidates { nc_max = RoundDownWithFloor(N_ / 2, nc_multiple_); } - // Non-block calls ForNP, which ignores `range_nc` and uses `range_np`. - if (!IsBlock(order)) return SizeVec(1, N_); + // Single-MC calls `ForNP`, which ignores `range_nc`. + if (IsOneMC(order)) return SizeVec(1, N_); SizeVec all_nc(1, nc_max); @@ -282,7 +314,7 @@ class GenerateCandidates { // hence autotune a wider range of nc than the other dimensions. size_t reps = 9 + num_B_; // For small M, we can afford larger NC, hence allow fewer small options. - if (M_ <= 2 * mr) reps -= 1; + if (max_M_ <= 2 * mr) reps -= 1; size_t prev = nc_max; for (size_t rep = 0; rep < reps; ++rep) { @@ -302,14 +334,7 @@ class GenerateCandidates { all_nc.begin() + HWY_MIN(want_delete, max_delete)); } - if (print_config_ && all_nc.size() > 1) { - fprintf(stderr, "num_B %zu: NC: ", num_B_); - for (size_t nc : all_nc) { - fprintf(stderr, "%zu ", nc); - } - fprintf(stderr, "\n"); - } - + MaybePrintSizes(N_, nc_max, "NC", all_nc); return all_nc; } @@ -319,8 +344,8 @@ class GenerateCandidates { std::vector inner_tasks; inner_tasks.reserve(3); inner_tasks.push_back(1); - // Blocks have one task per mc/nc range and ignore this parameter. - if (!IsBlock(order)) { + // Multiple-MC have one task per mc/nc range and ignore this parameter. + if (IsOneMC(order)) { inner_tasks.push_back(2); inner_tasks.push_back(4); } @@ -328,7 +353,7 @@ class GenerateCandidates { } const CacheInfo& cache_; - const size_t M_; + const size_t max_M_; const size_t K_; const size_t N_; const size_t num_B_; @@ -343,10 +368,11 @@ class GenerateCandidates { } // namespace // Facade to avoid exposing `GenerateCandidates` in the header. -std::vector MMCandidates(const CacheInfo& cache, size_t M, size_t K, - size_t N, size_t num_B, size_t sizeof_TC, - bool print_config) { - return GenerateCandidates(cache, M, K, N, num_B, sizeof_TC, print_config)(); +std::vector MMCandidates(const CacheInfo& cache, size_t max_M, + size_t K, size_t N, size_t num_B, + size_t sizeof_TC, bool print_config) { + return GenerateCandidates(cache, max_M, K, N, num_B, sizeof_TC, + print_config)(); } MatMulEnv::MatMulEnv(ThreadingContext& ctx) diff --git a/ops/matmul.h b/ops/matmul.h index fcb3063..cc52578 100644 --- a/ops/matmul.h +++ b/ops/matmul.h @@ -331,8 +331,8 @@ void DispatchOrder(MMOrder order, const Func& func, Args&&... args) { } } -static inline bool IsBlock(MMOrder order) { - return order == MMOrder::kNT_MT_K || order == MMOrder::kNT_MT; +static inline bool IsOneMC(MMOrder order) { + return order == MMOrder::kNT || order == MMOrder::kNT_K; } static inline bool IsOneKC(MMOrder order) { @@ -381,6 +381,8 @@ static inline const char* StringFromParA(MMParA par_a) { // `mc` := A rows such that `kc` columns fit in L2, // `nc` := B rows such that `kc` columns fit in L3 alongside `mc x nc` C. // Also includes loop order and task granularity. +// +// This is shared by multiple M which return the same `BucketM`. #pragma pack(push, 1) class MMConfig { public: @@ -388,8 +390,8 @@ class MMConfig { // `mr` is the number of A rows per call to `MMKernel::LoopKC`. // `MMOrder` is how to parallelize the outer loops. // `inner_tasks` chooses the within-cluster task granularity in `ForN`. - MMConfig(size_t K, size_t N, size_t mr, size_t mc, size_t kc, size_t nc, - size_t kc_multiple, size_t nc_multiple, MMOrder order, + MMConfig(size_t M, size_t K, size_t N, size_t mr, size_t mc, size_t kc, + size_t nc, size_t kc_multiple, size_t nc_multiple, MMOrder order, int inner_tasks) : mr_(static_cast(mr)), mc_(static_cast(mc)), @@ -401,12 +403,8 @@ class MMConfig { inner_tasks_(static_cast(inner_tasks)), reserved_{} { HWY_DASSERT(mr == 1 || mr == 2 || mr == 4); - if (mc % mr != 0) { - HWY_WARN("mc %zu not a multiple of mr %zu", mc, mr); - } - // Do not warn for single-kc tasks; some models unfortunately have K which - // are not multiples of `kc_multiple`. - if (kc != K && (kc % kc_multiple) != 0) { + // Some models have K which are not multiples of `kc_multiple`. + if (!IsOneKC(order) && (kc % kc_multiple) != 0) { HWY_WARN("kc %zu not a multiple of kc_multiple %zu", kc, kc_multiple); } if (nc != N && (nc % nc_multiple) != 0) { @@ -417,11 +415,21 @@ class MMConfig { } // Splits M/N into blocks which are visited sequentially or in parallel. - // K is always sequential, see `MMOrder`. IndexRangePartition RangesOfMC(size_t M) const { - return MaxSizePartition(IndexRange(0, M), mc_, mr_); + if (IsOneMC(order_)) { + // Must have exactly one M range/tile, regardless of `mr_` and `mc_`. + return IndexRangePartition(M); + } + const size_t mc = HWY_MIN(M, MC()); + const size_t mr = HWY_MIN(M, MR()); + return MaxSizePartition(IndexRange(0, M), mc, mr); } + // K is either a single range, or a sequential loop. IndexRangePartition RangesOfKC(size_t K) const { + if (IsOneKC(order_)) { + // Must have exactly one K range/tile, regardless of `kc_`. + return IndexRangePartition(K); + } return MaxSizePartition(IndexRange(0, K), kc_, kc_multiple_); } IndexRangePartition RangesOfNC(size_t N) const { @@ -448,7 +456,7 @@ class MMConfig { uint32_t kc_multiple_; MMOrder order_; uint8_t inner_tasks_; - HWY_MAYBE_UNUSED uint8_t reserved_[6]; + HWY_MEMBER_VAR_MAYBE_UNUSED uint8_t reserved_[6]; }; static_assert(sizeof(MMConfig) == 32); // for faster indexing #pragma pack(pop) @@ -557,26 +565,27 @@ class MMAutoTune { //------------------------------------------------------------------------------ -// Minimum M, in units of tile rows of height mr={1, 2, 4}, from which -// `MMOrder::kNT[_K]` are no longer allowed. They require a single MC range, -// but choosing the same config for a larger M can result in multiple MC ranges. -// Thus M less than this must have unique keys/configs. -HWY_INLINE_VAR constexpr size_t kMaxTilesM = 8; - // Map of previously seen dimensions to index via linear search. class MMKeys { - // Group batch size into buckets to reduce #auto-tunes. - static size_t BucketM(size_t M) { - if (M < kMaxTilesM * kMaxMR) return M; // See kMaxTilesM above. - if (M <= 128) return 128; - return 512; - } - public: using Key = uint64_t; // KeyFromDims will only return this if all dims are zero, which is invalid. static constexpr Key kPadding = 0; + // Returns the maximum permissible M in the bucket, for grouping batch sizes + // into buckets to reduce #auto-tunes. + static size_t BucketM(size_t M) { + HWY_DASSERT(M != 0); + // Small M: 1..3, 4..7, 8..15, etc. share the same config. + if (M < 64) return M | (kMaxMR - 1); + // Larger M use power of two buckets: 64..127, 128..255, etc. + const size_t floor_log2_M = + 31 - hwy::Num0BitsAboveMS1Bit_Nonzero32(static_cast(M)); + const size_t min_M = size_t{1} << floor_log2_M; + HWY_DASSERT(min_M <= M && M < 2 * min_M); + return 2 * min_M - 1; + } + // Compresses the dimensions into a single Key for faster comparison. static Key KeyFromDims(size_t M, size_t K, size_t N, size_t num_B) { HWY_DASSERT(M < (Key{1} << 16)); // batch sizes are smaller diff --git a/ops/ops_test.cc b/ops/ops_test.cc index 4d94b61..e89dca0 100644 --- a/ops/ops_test.cc +++ b/ops/ops_test.cc @@ -14,7 +14,6 @@ // limitations under the License. #include "compression/types.h" -#include "util/zones.h" #ifndef HWY_DISABLED_TARGETS #define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS #endif // HWY_DISABLED_TARGETS @@ -38,7 +37,6 @@ #include "util/mat.h" // MatStorageT #include "util/test_util.h" #include "util/threading_context.h" -#include "hwy/profiler.h" #include "hwy/tests/hwy_gtest.h" // clang-format off diff --git a/util/threading.h b/util/threading.h index b18ad24..3fb0227 100644 --- a/util/threading.h +++ b/util/threading.h @@ -187,7 +187,9 @@ class NestedPools { // functions below. class IndexRangePartition { public: - IndexRangePartition() = default; // for MMPartitions + explicit IndexRangePartition(size_t single_task) + : range_(0, single_task), task_size_(single_task), num_tasks_(1) {} + IndexRangePartition(const IndexRange& range, const size_t task_size) : range_(range), task_size_(static_cast(task_size)) { const uint32_t num = static_cast(range.Num());