From a24eda8d02080211fa66c4c0410d12522ec7e925 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Tue, 30 Jul 2024 08:26:51 -0700 Subject: [PATCH] Split matmul into matvec; add large matrix benchmark Rename var names to row/col for more clarity. Better estimate error tolerance via max abs col sum. PiperOrigin-RevId: 657601791 --- BUILD.bazel | 20 +++ CMakeLists.txt | 5 +- backprop/forward-inl.h | 2 +- gemma/gemma-inl.h | 2 +- ops/matmul-inl.h | 307 -------------------------------- ops/matmul_test.cc | 392 ++++++++++++++--------------------------- ops/matvec-inl.h | 357 +++++++++++++++++++++++++++++++++++++ ops/matvec_test.cc | 199 +++++++++++++++++++++ 8 files changed, 716 insertions(+), 568 deletions(-) create mode 100644 ops/matvec-inl.h create mode 100644 ops/matvec_test.cc diff --git a/BUILD.bazel b/BUILD.bazel index 15ce06b..25df65c 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -25,6 +25,7 @@ cc_library( textual_hdrs = [ "ops/ops-inl.h", "ops/matmul-inl.h", + "ops/matvec-inl.h", ], deps = [ "//compression:compress", @@ -56,6 +57,25 @@ cc_test( ], ) +cc_test( + name = "matvec_test", + size = "small", + timeout = "long", + srcs = ["ops/matvec_test.cc"], + local_defines = ["HWY_IS_TEST"], + # for test_suite. + tags = ["hwy_ops_test"], + deps = [ + ":ops", + "@googletest//:gtest_main", # buildcleaner: keep + "//compression:compress", + "@hwy//:hwy", + "@hwy//:hwy_test_util", + "@hwy//:nanobenchmark", + "@hwy//:thread_pool", + ], +) + cc_test( name = "matmul_test", size = "small", diff --git a/CMakeLists.txt b/CMakeLists.txt index b1347e0..38dd73a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,6 +98,7 @@ set(SOURCES gemma/weights.cc gemma/weights.h ops/matmul-inl.h + ops/matvec-inl.h ops/ops-inl.h util/app.h util/args.h @@ -148,7 +149,9 @@ set(GEMMA_TEST_FILES backprop/backward_test.cc backprop/backward_scalar_test.cc backprop/optimize_test.cc - gemma/ops_test.cc + ops/ops_test.cc + ops/matmul_test.cc + ops/matvec_test.cc evals/gemma_test.cc ) diff --git a/backprop/forward-inl.h b/backprop/forward-inl.h index 11ac050..b3ffd92 100644 --- a/backprop/forward-inl.h +++ b/backprop/forward-inl.h @@ -39,7 +39,7 @@ #define THIRD_PARTY_GEMMA_CPP_FORWARD_TOGGLE #endif -#include "ops/matmul-inl.h" +#include "ops/matvec-inl.h" #include "ops/ops-inl.h" #include "hwy/highway.h" diff --git a/gemma/gemma-inl.h b/gemma/gemma-inl.h index 9992981..633d898 100644 --- a/gemma/gemma-inl.h +++ b/gemma/gemma-inl.h @@ -40,11 +40,11 @@ #include "gemma/weights.h" // Placeholder for internal test4, do not remove #include "ops/matmul-inl.h" +#include "ops/matvec-inl.h" #include "ops/ops-inl.h" #include "hwy/aligned_allocator.h" #include "hwy/base.h" #include "hwy/bit_set.h" -#include "hwy/contrib/matvec/matvec-inl.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/contrib/thread_pool/topology.h" #include "hwy/highway.h" diff --git a/ops/matmul-inl.h b/ops/matmul-inl.h index 0f45881..de7a37b 100644 --- a/ops/matmul-inl.h +++ b/ops/matmul-inl.h @@ -21,8 +21,6 @@ #include #include -#include "compression/compress.h" -#include "compression/sfp.h" #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/profiler.h" @@ -38,9 +36,7 @@ #endif #include "compression/compress-inl.h" -#include "hwy/contrib/dot/dot-inl.h" #include "hwy/contrib/math/math-inl.h" -#include "hwy/contrib/matvec/matvec-inl.h" HWY_BEFORE_NAMESPACE(); namespace gcpp { @@ -443,309 +439,6 @@ HWY_NOINLINE void MatMul_4x4(const size_t batch_size, const Mat& A, }); } 
-//------------------------------------------------------------------------------ -HWY_INLINE void ToEvenOddF32(const hwy::bfloat16_t* HWY_RESTRICT vec_aligned, - const size_t size, float* HWY_RESTRICT out) { - const hn::ScalableTag df; - const hn::Repartition dbf16; - - HWY_DASSERT(size % hn::Lanes(dbf16) == 0); - HWY_DASSERT(hn::IsAligned(df, vec_aligned)); - - for (size_t i = 0; i < size; i += hn::Lanes(dbf16)) { - const auto interleaved = hn::LoadU(dbf16, vec_aligned + i); - hn::Store(hn::PromoteEvenTo(df, interleaved), df, out + i); - hn::Store(hn::PromoteOddTo(df, interleaved), df, out + i + hn::Lanes(df)); - } -} - -HWY_INLINE void ToEvenOddF32(const float* HWY_RESTRICT vec_aligned, - const size_t size, float* HWY_RESTRICT out) { - const hn::ScalableTag df; - using VF = hn::Vec; - - HWY_DASSERT(size % (hn::Lanes(df) * 2) == 0); - HWY_DASSERT(hn::IsAligned(df, vec_aligned)); - - VF vec0, vec1; - for (size_t i = 0; i < size; i += hn::Lanes(df) * 2) { - hn::LoadInterleaved2(df, vec_aligned + i, vec0, vec1); - hn::Store(vec0, df, out + i); - hn::Store(vec1, df, out + i + hn::Lanes(df)); - } -} - -// Simple version without tiling nor threading, but two offsets/outputs and -// always with addition. -template -HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0, - const size_t mat_ofs1, - const VecT* HWY_RESTRICT vec_aligned, - const AddT* HWY_RESTRICT add0, - const AddT* HWY_RESTRICT add1, - float* HWY_RESTRICT out0, - float* HWY_RESTRICT out1) { - PROFILER_ZONE("TwoOfsMatVecAddLoop"); - constexpr bool kVecEO = false; - const hn::ScalableTag df; - - for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) { - const size_t row_ofs0 = mat_ofs0 + (idx_row)*kInner; - const size_t row_ofs1 = mat_ofs1 + (idx_row)*kInner; - out0[idx_row] = hwy::ConvertScalarTo(add0[idx_row]) + - Dot(df, mat, row_ofs0, vec_aligned, kInner); - out1[idx_row] = hwy::ConvertScalarTo(add1[idx_row]) + - Dot(df, mat, row_ofs1, vec_aligned, kInner); - } -} - -HWY_INLINE constexpr size_t MaxCols() { - // Vec + mat rows should fit into 32 KiB L1. - return 2048; -} - -template -HWY_INLINE constexpr size_t RowsPerStrip() { - // Aim for 128 work items to reduce pool overhead. Must be at least one - // vector; prefer a power of two for faster division. - constexpr size_t kLanes = hn::ScalableTag().MaxLanes(); - constexpr size_t kRowsPerStrip = - kOuter < 128 ? kLanes - : HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(kOuter / 128)); - return kRowsPerStrip; -} - -namespace detail { - -// For each i = [0, num_rows), compute partial (length `num_cols`) dot product -// of row i with `vec_aligned` and add into `out[i]`. The upper-left -// coordinate of the tile is r0, c0. -template -HWY_INLINE void AccumulatePartialDotProducts( - DF df, const ArrayT& mat, size_t mat_ofs, size_t mat_stride, size_t r0, - size_t c0, size_t num_rows, size_t num_cols, - const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) { - for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) { - const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride; - out[idx_row] += - Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols); - } -} - -// Same as AccumulatePartialDotProducts, but sets out[i] to the first partial -// dot product + init (if kInit), which avoids having to zero-initialize and -// accumulate. 
-template -HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat, - size_t mat_ofs, size_t mat_stride, - size_t r0, size_t c0, - size_t num_rows, size_t num_cols, - const VecT* HWY_RESTRICT vec_aligned, - const InitT* HWY_RESTRICT init, - float* HWY_RESTRICT out) { - for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) { - const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride; - if constexpr (kInit) { - out[idx_row] = - hwy::ConvertScalarTo(init[idx_row + r0]) + - Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols); - } else { - out[idx_row] = - Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols); - } - } -} - -// Adds together partial dot products for all tiles with the same r0 (a -// horizontal strip of the entire matrix); the result is the full dot product -// for rows r in [r0, r0 + num_rows) + optionally the add vector, which we -// store into in out[r - r0]. -template -HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat, - size_t mat_ofs, size_t mat_stride, - size_t r0, size_t num_rows, - const VecT* HWY_RESTRICT vec_aligned, - const AddT* HWY_RESTRICT add, - float* HWY_RESTRICT out) { - // Tall and skinny: set `out` to the single dot product. - if (mat_stride < MaxCols()) { - SetFirstPartialDotProducts(df, mat, mat_ofs, mat_stride, r0, - 0, num_rows, mat_stride, - vec_aligned, add, out); - return; - } - - // We have at least MaxCols, so start by setting `out` to that: - SetFirstPartialDotProducts(df, mat, mat_ofs, mat_stride, r0, 0, - num_rows, MaxCols(), vec_aligned, - add, out); - // For further multiples of MaxCols, accumulate. Remainders handled below. - size_t c0 = MaxCols(); - for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) { - AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, - num_rows, MaxCols(), vec_aligned, out); - } - - if (c0 < mat_stride) { // Final cols - AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, - num_rows, mat_stride - c0, vec_aligned, - out); - } -} - -template -HWY_INLINE void MatVecAddInner(const ArrayT& mat, const size_t mat_ofs, - const VecT* HWY_RESTRICT const vec_aligned, - const AddT* HWY_RESTRICT const add, - float* HWY_RESTRICT out, hwy::ThreadPool& pool) { - const hn::ScalableTag df; - constexpr size_t kRowsPerStrip = RowsPerStrip(); - constexpr size_t kNumStrips = kOuter / kRowsPerStrip; - - // For each entire strip. - pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR { - PROFILER_ZONE("MatVec.lambda"); - const size_t r0 = strip * kRowsPerStrip; - detail::FullDotProductsForStrip( - df, mat, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add, - out + r0); - }); - - // Remaining rows - const size_t r0 = kNumStrips * kRowsPerStrip; - if (r0 < kOuter) { - PROFILER_ZONE("MatVec remainder"); - const size_t num_rows = kOuter - r0; - detail::FullDotProductsForStrip( - df, mat, mat_ofs, kInner, r0, num_rows, vec_aligned, add, out + r0); - } -} - -} // namespace detail - -// Stores dot products of rows with `vec_aligned` + add the values from `add` -// (if kAdd), then stores them to `out`. -template -HWY_INLINE void MatVecT(const ArrayT& mat, const size_t mat_ofs, - const VecT* HWY_RESTRICT const vec_aligned, - const AddT* HWY_RESTRICT const add, - float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out, - hwy::ThreadPool& pool) { - PROFILER_ZONE("MatVecAdd"); - -#if !defined(HWY_NATIVE_DOT_BF16) || !HWY_NATIVE_DOT_BF16 - using MatT = typename ArrayT::value_type; - // Sfp -> float does not benefit enough to recoup the cost of ToEvenOddF32. 
- if constexpr (CompressTraits::kSupportsEvenOdd && - hwy::IsSameEither() && - !(hwy::IsSame() && - hwy::IsSame())) { - ToEvenOddF32(vec_aligned, kInner, even_odd); - detail::MatVecAddInner( - mat, mat_ofs, even_odd, add, out, pool); - return; - } -#else - (void)even_odd; -#endif - - detail::MatVecAddInner( - mat, mat_ofs, vec_aligned, add, out, pool); -} - -// With addition -template -HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs, - const VecT* HWY_RESTRICT const vec_aligned, - const AddT* HWY_RESTRICT const add, - float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out, - hwy::ThreadPool& pool) { - return MatVecT(mat, mat_ofs, vec_aligned, add, - even_odd, out, pool); -} - -// Without addition -template -HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs, - const VecT* HWY_RESTRICT const vec_aligned, - float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out, - hwy::ThreadPool& pool) { - MatVecT(mat, mat_ofs, vec_aligned, - /*add=*/static_cast(nullptr), - even_odd, out, pool); -} - -// Two matrices, same vector -template -HWY_NOINLINE void TwoMatVecT(const ArrayT& mat0, const ArrayT& mat1, - const size_t mat_ofs, - const VecT* HWY_RESTRICT vec_aligned, - const AddT* HWY_RESTRICT add0, - const AddT* HWY_RESTRICT add1, - float* HWY_RESTRICT out0, float* HWY_RESTRICT out1, - hwy::ThreadPool& pool) { - PROFILER_ZONE("TwoMatVecAdd"); - - const hn::ScalableTag df; - constexpr size_t kRowsPerStrip = RowsPerStrip(); - constexpr size_t kNumStrips = kOuter / kRowsPerStrip; - constexpr bool kVecIsEvenOdd = false; - - // For each entire strip. - pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR { - PROFILER_ZONE("TwoMatVec.lambda"); - const size_t r0 = strip * kRowsPerStrip; - detail::FullDotProductsForStrip( - df, mat0, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add0, - out0 + r0); - detail::FullDotProductsForStrip( - df, mat1, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add1, - out1 + r0); - }); - - // Remaining rows - const size_t r0 = kNumStrips * kRowsPerStrip; - if (r0 < kOuter) { - PROFILER_ZONE("TwoMatVec remainder"); - const size_t num_rows = kOuter - r0; - detail::FullDotProductsForStrip( - df, mat0, mat_ofs, kInner, r0, num_rows, vec_aligned, add0, out0 + r0); - detail::FullDotProductsForStrip( - df, mat1, mat_ofs, kInner, r0, num_rows, vec_aligned, add1, out1 + r0); - } -} - -// With addition -template -HWY_NOINLINE void TwoMatVecAdd( - const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs, - const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0, - const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0, - float* HWY_RESTRICT out1, hwy::ThreadPool& pool) { - return TwoMatVecT( - mat0, mat1, mat_ofs, vec_aligned, add0, add1, out0, out1, pool); -} - -// Without addition -template -HWY_NOINLINE void TwoMatVec(const ArrayT& mat0, const ArrayT& mat1, - const size_t mat_ofs, - const VecT* HWY_RESTRICT vec_aligned, - float* HWY_RESTRICT out0, float* HWY_RESTRICT out1, - hwy::ThreadPool& pool) { - TwoMatVecT( - mat0, mat1, mat_ofs, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr, - out0, out1, pool); -} - // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace gcpp diff --git a/ops/matmul_test.cc b/ops/matmul_test.cc index 61204af..ca025bc 100644 --- a/ops/matmul_test.cc +++ b/ops/matmul_test.cc @@ -21,9 +21,6 @@ #include #include -#include -#include -#include #include #include "compression/compress.h" @@ -41,7 +38,6 @@ #include "hwy/tests/test_util-inl.h" // After 
highway.h #include "ops/matmul-inl.h" -#include "ops/ops-inl.h" // MulByConst HWY_BEFORE_NAMESPACE(); namespace gcpp { @@ -49,102 +45,115 @@ namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; -template -std::unique_ptr> GenerateMatHeap( +// Generates inputs: deterministic, within max SfpStream range. +template +std::unique_ptr> GenerateMatHeap( size_t offset, hwy::ThreadPool& pool) { gcpp::CompressWorkingSet ws; hwy::AlignedFreeUniquePtr content = - hwy::AllocateAligned(kOuter * kInner); - const float scale = 1.875f / (kInner * kOuter + offset); - pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) { - for (size_t j = 0; j < kInner; j++) { - content[i * kInner + j] = - static_cast((i * kInner + j + offset) * scale); + hwy::AllocateAligned(kRows * kCols); + const float scale = 1.875f / (kCols * kRows + offset); + pool.Run(0, kRows, [&](const size_t i, size_t /*thread*/) { + for (size_t j = 0; j < kCols; j++) { + content[i * kCols + j] = + static_cast((i * kCols + j + offset) * scale); } }); - std::unique_ptr> mat = - std::make_unique>(); - Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0, + std::unique_ptr> mat = + std::make_unique>(); + Compress(content.get(), kRows * kCols, ws, kRows * kCols, mat->data(), 0, pool); mat->set_scale(0.6f); // Arbitrary value, different from 1. return mat; } -template -std::unique_ptr> -GenerateTransposeMatHeap(size_t offset, hwy::ThreadPool& pool) { +template +std::unique_ptr> GenerateTransposeMatHeap( + size_t offset, hwy::ThreadPool& pool) { gcpp::CompressWorkingSet ws; hwy::AlignedFreeUniquePtr content = - hwy::AllocateAligned(kOuter * kInner); - const float scale = 1.875f / (kInner * kOuter + offset); - pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) { - for (size_t j = 0; j < kInner; j++) { - content[j * kOuter + i] = - static_cast((i * kInner + j + offset) * scale); + hwy::AllocateAligned(kRows * kCols); + const float scale = 1.875f / (kCols * kRows + offset); + pool.Run(0, kRows, [&](const size_t i, size_t /*thread*/) { + for (size_t j = 0; j < kCols; j++) { + content[j * kRows + i] = + static_cast((i * kCols + j + offset) * scale); } }); - std::unique_ptr> mat = - std::make_unique>(); - Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0, + std::unique_ptr> mat = + std::make_unique>(); + Compress(content.get(), kRows * kCols, ws, kRows * kCols, mat->data(), 0, pool); // Arbitrary value, different from 1, must match GenerateMatHeap. mat->set_scale(0.6f); return mat; } -template -std::unique_ptr> GenerateZeroMatHeap( +template +std::unique_ptr> GenerateZeroMatHeap( hwy::ThreadPool& pool) { gcpp::CompressWorkingSet ws; hwy::AlignedFreeUniquePtr content = - hwy::AllocateAligned(kOuter * kInner); + hwy::AllocateAligned(kRows * kCols); - pool.Run(0, kOuter, [&](const size_t i, size_t thread) { - hwy::ZeroBytes(&content[i * kInner], kInner * sizeof(content[0])); + pool.Run(0, kRows, [&](const size_t i, size_t thread) { + hwy::ZeroBytes(&content[i * kCols], kCols * sizeof(content[0])); }); - std::unique_ptr> mat = - std::make_unique>(); - Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0, + std::unique_ptr> mat = + std::make_unique>(); + Compress(content.get(), kRows * kCols, ws, kRows * kCols, mat->data(), 0, pool); mat->set_scale(1.2f); // Arbitrary value, different from 1. return mat; } -// A simple matrix multiplication. No optimization / tiling. 
-template -hwy::AlignedFreeUniquePtr SimpleMatMul( - const hwy::AlignedFreeUniquePtr& a, - const hwy::AlignedFreeUniquePtr& b) { - hwy::AlignedFreeUniquePtr out = hwy::AllocateAligned(kM * kK); - hwy::ZeroBytes(out.get(), kM * kK * sizeof(float)); - - int i, j, k; - for (i = 0; i < kM; ++i) { - for (j = 0; j < kK; ++j) { - for (k = 0; k < kN; ++k) { - out[i * kK + j] += a[i * kN + k] * b[k * kK + j]; - } - } - } - - return out; +template +void Decompress(const MatT* compressed, size_t num, float* out) { + const hn::ScalableTag d; + hwy::AlignedFreeUniquePtr b = hwy::AllocateAligned(num); + CompressTraits::Decompress(d, /*in_capacity=*/0, compressed, 0, out, + num); } -template -void AssertClose(const MatT* HWY_RESTRICT expected, - const MatT* HWY_RESTRICT actual, size_t num) { - for (size_t idx = 0; idx < num; idx++) { - const double expected_value = hwy::ConvertScalarTo(expected[idx]); - const double actual_value = hwy::ConvertScalarTo(actual[idx]); +// Returns 1-norm, used for estimating tolerable numerical differences. +double MaxColAbsSum(const float* HWY_RESTRICT a, size_t rows, size_t cols) { + double max_col_abs_sum = 0.0; + for (size_t c = 0; c < cols; c++) { + double col_abs_sum = 0.0; + for (size_t r = 0; r < rows; r++) { + col_abs_sum += hwy::ScalarAbs(a[r * cols + c]); + } + max_col_abs_sum = HWY_MAX(max_col_abs_sum, col_abs_sum); + } + return max_col_abs_sum; +} - const double magnitude = std::abs(expected_value); +template +void AssertClose(size_t rows_ac, size_t cols_ab, size_t cols_c_rows_b, + const MatTA* HWY_RESTRICT a_compr, + const MatTB* HWY_RESTRICT b_trans_compr, + const float* HWY_RESTRICT expected_c, + const float* HWY_RESTRICT actual_c) { + const size_t num_a = rows_ac * cols_ab; + const size_t num_b = cols_c_rows_b * cols_ab; + const size_t num_c = rows_ac * cols_c_rows_b; + hwy::AlignedFreeUniquePtr a = hwy::AllocateAligned(num_a); + hwy::AlignedFreeUniquePtr b_trans = + hwy::AllocateAligned(num_b); + Decompress(a_compr, num_a, a.get()); + Decompress(b_trans_compr, num_b, b_trans.get()); - const double tolerance = - 256.0 * hwy::ConvertScalarTo(hwy::Epsilon()) * - HWY_MAX(magnitude, 1.0); + const double norm = MaxColAbsSum(a.get(), rows_ac, cols_ab) * + MaxColAbsSum(b_trans.get(), cols_c_rows_b, cols_ab); + const double epsilon = hwy::ConvertScalarTo(hwy::Epsilon()); + const double tolerance = 50.0 * norm * epsilon; + + for (size_t idx = 0; idx < num_c; idx++) { + const double expected_value = expected_c[idx]; + const double actual_value = actual_c[idx]; if (!(expected_value - tolerance <= actual_value && actual_value <= expected_value + tolerance)) { @@ -154,35 +163,23 @@ void AssertClose(const MatT* HWY_RESTRICT expected, } } } - -template -void AssertClose(const hwy::AlignedFreeUniquePtr& a, - const hwy::AlignedFreeUniquePtr& b) { - for (size_t idx = 0; idx < length; idx++) { - const float rel_abs_delta = std::abs(a[idx] - b[idx]) / - std::max(std::abs(a[idx]), std::abs(b[idx])); - EXPECT_LT(rel_abs_delta, 2e-6) - << "a[" << idx << "]=" << a[idx] << ", b[" << idx << "]=" << b[idx]; - } -} - // Largely unoptimized; reordered innermost loops nets ~5-10X speedup. 
-template -HWY_INLINE void MatMulSlowBatch(size_t batch_size, const MatTA* HWY_RESTRICT a, - const MatTB* HWY_RESTRICT b, const float scale, - const float* add, float* HWY_RESTRICT out) { - for (size_t i = 0; i < batch_size; ++i) { - for (size_t k = 0; k < kN; ++k) { - for (size_t j = 0; j < kK; ++j) { - const float a1 = hwy::ConvertScalarTo(a[i * kN + k]); - const float b1 = hwy::ConvertScalarTo(b[k * kK + j]); - out[i * kK + j] += scale * a1 * b1; +template +HWY_INLINE void MatMulSlow(size_t rows_ac, size_t cols_a_rows_b, size_t cols_bc, + const MatTA* HWY_RESTRICT a, + const MatTB* HWY_RESTRICT b, const float scale, + const float* add, float* HWY_RESTRICT out) { + for (size_t i = 0; i < rows_ac; ++i) { + for (size_t k = 0; k < cols_a_rows_b; ++k) { + for (size_t j = 0; j < cols_bc; ++j) { + const float a1 = hwy::ConvertScalarTo(a[i * cols_a_rows_b + k]); + const float b1 = hwy::ConvertScalarTo(b[k * cols_bc + j]); + out[i * cols_bc + j] += scale * a1 * b1; } } if (add != nullptr) { - for (size_t j = 0; j < kK; ++j) { - out[i * kK + j] += add[j]; + for (size_t j = 0; j < cols_bc; ++j) { + out[i * cols_bc + j] += add[j]; } } } @@ -190,74 +187,78 @@ HWY_INLINE void MatMulSlowBatch(size_t batch_size, const MatTA* HWY_RESTRICT a, // The above overload can handle combinations of f32 and bf16, but this one // is required for MatTB = {SFP, NUQ}. -template -HWY_INLINE void MatMulSlowBatch(size_t batch_size, const MatTA* HWY_RESTRICT a, - const MatTB* HWY_RESTRICT b_compr, - const float scale, const float* add, - float* HWY_RESTRICT out) { +template +HWY_INLINE void MatMulSlow(size_t rows_ac, size_t cols_a_rows_b, size_t cols_bc, + const MatTA* HWY_RESTRICT a, + const MatTB* HWY_RESTRICT b_compr, const float scale, + const float* add, float* HWY_RESTRICT out) { const hn::ScalableTag d; - hwy::AlignedFreeUniquePtr b = hwy::AllocateAligned(kK * kN); + hwy::AlignedFreeUniquePtr b = + hwy::AllocateAligned(cols_a_rows_b * cols_bc); CompressTraits::Decompress(d, /*in_capacity=*/0, b_compr, 0, b.get(), - kK * kN); - MatMulSlowBatch(batch_size, a, b.get(), scale, add, out); + cols_a_rows_b * cols_bc); + MatMulSlow(rows_ac, cols_a_rows_b, cols_bc, a, b.get(), scale, add, out); } -void PrintSpeed(const char* algo, size_t M, size_t N, size_t K, - double elapsed) { - // * 2 because of FMA. +void PrintSpeed(const char* algo, size_t rows_ac, size_t cols_a_rows_b, + size_t cols_bc, double elapsed) { + // 2 because of FMA. 
fprintf(stderr, "%s: %f seconds, %f GFLOPS.\n", algo, elapsed, - 2E-9 * M * N * K / elapsed); + 2E-9 * rows_ac * cols_a_rows_b * cols_bc / elapsed); } -template +template void TestMatMul(hwy::ThreadPool& pool) { using TraitsA = CompressTraits; using TraitsB = CompressTraits; - fprintf(stderr, "TestMatMul %lu, %lu, %lu, add=%d, MatTA=%s, MatTB=%s\n", kM, - kN, kK, kAdd, TraitsA::Name(), TraitsB::Name()); + const bool want_bench = kColsBC > 2000; // avoid spam for small matrices + fprintf(stderr, "TestMatMul %lu, %lu, %lu, add=%d, MatTA=%s, MatTB=%s\n", + kRowsAC, kColsARowsB, kColsBC, kAdd, TraitsA::Name(), + TraitsB::Name()); - std::unique_ptr> a = - GenerateMatHeap(0, pool); - std::unique_ptr> b_trans = - GenerateTransposeMatHeap(0, pool); - hwy::AlignedFreeUniquePtr c = hwy::AllocateAligned(kM * kK); + std::unique_ptr> a = + GenerateMatHeap(0, pool); + std::unique_ptr> b_trans = + GenerateTransposeMatHeap(0, pool); + hwy::AlignedFreeUniquePtr c = + hwy::AllocateAligned(kRowsAC * kColsBC); const float scale = a->scale() * b_trans->scale(); - std::unique_ptr> add; + std::unique_ptr> add; if (kAdd) { - add = GenerateMatHeap(0, pool); + add = GenerateMatHeap(0, pool); add->set_scale(1.0f); } - std::unique_ptr> c_slow; - const bool compare_slow = kN < 2048; - if (compare_slow) { - std::unique_ptr> b = - GenerateMatHeap(0, pool); - HWY_ASSERT_EQ(scale, a->scale() * b->scale()); - c_slow = GenerateZeroMatHeap(pool); - const double start_slow = hwy::platform::Now(); - MatMulSlowBatch(kM, a->data(), b->data(), scale, - kAdd ? add->data() : nullptr, c_slow->data()); - PrintSpeed("MatMulSlowBatch", kM, kN, kK, + std::unique_ptr> b = + GenerateMatHeap(0, pool); + HWY_ASSERT_EQ(scale, a->scale() * b->scale()); + std::unique_ptr> c_slow = + GenerateZeroMatHeap(pool); + const double start_slow = hwy::platform::Now(); + MatMulSlow(kRowsAC, kColsARowsB, kColsBC, a->data(), b->data(), scale, + kAdd ? add->data() : nullptr, c_slow->data()); + if (want_bench) { + PrintSpeed("MatMulSlow", kRowsAC, kColsARowsB, kColsBC, hwy::platform::Now() - start_slow); } double min_elapsed = hwy::HighestValue(); - for (int rep = 0; rep < (compare_slow ? 1 : 3); ++rep) { + for (int rep = 0; rep < (want_bench ? 3 : 1); ++rep) { const double start_tiled = hwy::platform::Now(); - MatMul_4x4(kM, MakeMat(a->data(), kN), MakeMat(b_trans->data(), kN), - scale, kAdd ? add->data_scale1() : nullptr, - MakeMat(c.get(), kK), pool); + MatMul_4x4(kRowsAC, MakeMat(a->data(), kColsARowsB), + MakeMat(b_trans->data(), kColsARowsB), scale, + kAdd ? add->data_scale1() : nullptr, + MakeMat(c.get(), kColsBC), pool); min_elapsed = HWY_MIN(min_elapsed, hwy::platform::Now() - start_tiled); } - PrintSpeed("MatMul_4x4", kM, kN, kK, min_elapsed); - - if (compare_slow) { - AssertClose(c_slow->data(), c.get(), kM * kK); + if (want_bench) { + PrintSpeed("MatMul_4x4", kRowsAC, kColsARowsB, kColsBC, min_elapsed); } + + AssertClose(kRowsAC, kColsARowsB, kColsBC, a->data(), b_trans->data(), + c_slow->data(), c.get()); } void TestAllMatMul() { @@ -274,6 +275,7 @@ void TestAllMatMul() { // large-scale test TestMatMul<64, 24576, 3072, /*kAdd=*/false, F32, SFP>(pool); + TestMatMul<64, 3072, 24576, /*kAdd=*/false, F32, SFP>(pool); // medium-sized square test TestMatMul<512, 512, 512, /*kAdd=*/false, F32>(pool); @@ -283,7 +285,7 @@ void TestAllMatMul() { TestMatMul<512, 512, 512, /*kAdd=*/false, F32, SFP>(pool); TestMatMul<512, 512, 512, /*kAdd=*/true, BF16, SFP>(pool); - // minimal non-square test. kK must be at least 2 vectors. + // minimal non-square test. 
kColsARowsB must be at least 2 vectors. TestMatMul<35, 128, 32, /*kAdd=*/false, F32>(pool); TestMatMul<34, 128, 32, /*kAdd=*/true, BF16>(pool); TestMatMul<33, 128, 32, /*kAdd=*/false, F32, BF16>(pool); @@ -316,129 +318,6 @@ void TestAllMatMul() { TestMatMul<1, 128, 32, /*kAdd=*/true, BF16, SFP>(pool); } -template -hwy::AlignedFreeUniquePtr SimpleMatVecAdd( - const CompressedArray& mat, - const hwy::AlignedFreeUniquePtr& vec, - const hwy::AlignedFreeUniquePtr& add) { - hwy::AlignedFreeUniquePtr uncompressed_mat = - hwy::AllocateAligned(kOuter * kInner); - hwy::AlignedFreeUniquePtr out = hwy::AllocateAligned(kOuter); - HWY_ASSERT(uncompressed_mat && out); - Decompress(mat, 0, uncompressed_mat.get(), kOuter * kInner); - MulByConst(mat.scale(), uncompressed_mat.get(), kOuter * kInner); - for (size_t idx_row = 0; idx_row < kOuter; idx_row++) { - out[idx_row] = add[idx_row]; - for (size_t idx_col = 0; idx_col < kInner; idx_col++) { - out[idx_row] += - uncompressed_mat[kInner * idx_row + idx_col] * vec[idx_col]; - } - } - return out; -} - -template -CompressedArray GenerateMat(size_t offset, - hwy::ThreadPool& pool) { - gcpp::CompressWorkingSet ws; - CompressedArray mat; - std::array content; - const float scale = 1.0f / kInner; - pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) { - for (size_t j = 0; j < kInner; j++) { - content[i * kInner + j] = - static_cast((i * kInner + j + offset) * scale); - } - }); - - Compress(content, ws, mat, pool); - mat.set_scale(1.9f); // Arbitrary value, different from 1. - return mat; -} - -template -hwy::AlignedFreeUniquePtr GenerateVec(size_t offset) { - hwy::AlignedFreeUniquePtr vec = hwy::AllocateAligned(length); - HWY_ASSERT(vec); - for (size_t idx = 0; idx < length; idx++) { - vec[idx] = static_cast(idx + offset); - } - return vec; -} - -void TestMatVecAdd() { - hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads()); - constexpr size_t kOuter = 128 * 3; - constexpr size_t kInner = 128 * 5; - CompressedArray mat = - GenerateMat(0, pool); - hwy::AlignedFreeUniquePtr vec = GenerateVec(0); - hwy::AlignedFreeUniquePtr add = GenerateVec(0); - hwy::AlignedFreeUniquePtr even_odd = - hwy::AllocateAligned(kInner * pool.NumWorkers()); - hwy::AlignedFreeUniquePtr expected_out = - SimpleMatVecAdd(mat, vec, add); - hwy::AlignedFreeUniquePtr actual_out = - hwy::AllocateAligned(kOuter); - HWY_ASSERT(vec && add && even_odd && expected_out && actual_out); - MatVecAdd(mat, 0, vec.get(), add.get(), even_odd.get(), - actual_out.get(), pool); - AssertClose(actual_out, expected_out); -} - -void TestTwoMatVecAdd() { - hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads()); - constexpr size_t kOuter = 128 * 3; - constexpr size_t kInner = 128 * 5; - CompressedArray mat0 = - GenerateMat(0, pool); - CompressedArray mat1 = - GenerateMat(1, pool); - hwy::AlignedFreeUniquePtr vec = GenerateVec(0); - hwy::AlignedFreeUniquePtr add0 = GenerateVec(0); - hwy::AlignedFreeUniquePtr add1 = GenerateVec(1); - hwy::AlignedFreeUniquePtr expected_out0 = - SimpleMatVecAdd(mat0, vec, add0); - hwy::AlignedFreeUniquePtr expected_out1 = - SimpleMatVecAdd(mat1, vec, add1); - hwy::AlignedFreeUniquePtr actual_out0 = - hwy::AllocateAligned(kOuter); - hwy::AlignedFreeUniquePtr actual_out1 = - hwy::AllocateAligned(kOuter); - HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 && - expected_out1 && actual_out1); - TwoMatVecAdd(mat0, mat1, 0, vec.get(), add0.get(), add1.get(), - actual_out0.get(), actual_out1.get(), pool); - AssertClose(actual_out0, expected_out0); - AssertClose(actual_out1, 
expected_out1); -} - -void TestTwoOfsMatVecAddLoop() { - hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads()); - constexpr size_t kOuter = 128 * 3; - constexpr size_t kInner = 128 * 5; - CompressedArray mat = - GenerateMat(0, pool); - hwy::AlignedFreeUniquePtr vec = GenerateVec(0); - hwy::AlignedFreeUniquePtr add0 = GenerateVec(0); - hwy::AlignedFreeUniquePtr add1 = GenerateVec(1); - hwy::AlignedFreeUniquePtr expected_out0 = - SimpleMatVecAdd(mat, vec, add0); - hwy::AlignedFreeUniquePtr expected_out1 = - SimpleMatVecAdd(mat, vec, add1); - hwy::AlignedFreeUniquePtr actual_out0 = - hwy::AllocateAligned(kOuter); - hwy::AlignedFreeUniquePtr actual_out1 = - hwy::AllocateAligned(kOuter); - HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 && - expected_out1 && actual_out1); - TwoOfsMatVecAddLoop(mat, 0, 0, vec.get(), add0.get(), - add1.get(), actual_out0.get(), - actual_out1.get()); - AssertClose(actual_out0, expected_out0); - AssertClose(actual_out1, expected_out1); -} - // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace gcpp @@ -449,9 +328,6 @@ HWY_AFTER_NAMESPACE(); namespace gcpp { HWY_BEFORE_TEST(MatmulTest); HWY_EXPORT_AND_TEST_P(MatmulTest, TestAllMatMul); -HWY_EXPORT_AND_TEST_P(MatmulTest, TestMatVecAdd); -HWY_EXPORT_AND_TEST_P(MatmulTest, TestTwoMatVecAdd); -HWY_EXPORT_AND_TEST_P(MatmulTest, TestTwoOfsMatVecAddLoop); HWY_AFTER_TEST(); } // namespace gcpp diff --git a/ops/matvec-inl.h b/ops/matvec-inl.h new file mode 100644 index 0000000..70d6b87 --- /dev/null +++ b/ops/matvec-inl.h @@ -0,0 +1,357 @@ +// Copyright 2024 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Include guard for non-SIMD code. +#ifndef THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_ +#define THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_ + +#include +#include +#include + +#include "compression/compress.h" +#include "compression/sfp.h" +#include "hwy/base.h" +#include "hwy/contrib/thread_pool/thread_pool.h" +#include "hwy/profiler.h" + +#endif // THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_ + +// Include guard for (potentially) SIMD code. 
+#if defined(THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE) == defined(HWY_TARGET_TOGGLE) +#ifdef THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE +#undef THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE +#else +#define THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE +#endif + +#include "compression/compress-inl.h" +#include "hwy/contrib/dot/dot-inl.h" +#include "hwy/contrib/math/math-inl.h" +#include "hwy/contrib/matvec/matvec-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace gcpp { +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; + +HWY_INLINE void ToEvenOddF32(const hwy::bfloat16_t* HWY_RESTRICT vec_aligned, + const size_t size, float* HWY_RESTRICT out) { + const hn::ScalableTag df; + const hn::Repartition dbf16; + + HWY_DASSERT(size % hn::Lanes(dbf16) == 0); + HWY_DASSERT(hn::IsAligned(df, vec_aligned)); + + for (size_t i = 0; i < size; i += hn::Lanes(dbf16)) { + const auto interleaved = hn::LoadU(dbf16, vec_aligned + i); + hn::Store(hn::PromoteEvenTo(df, interleaved), df, out + i); + hn::Store(hn::PromoteOddTo(df, interleaved), df, out + i + hn::Lanes(df)); + } +} + +HWY_INLINE void ToEvenOddF32(const float* HWY_RESTRICT vec_aligned, + const size_t size, float* HWY_RESTRICT out) { + const hn::ScalableTag df; + using VF = hn::Vec; + + HWY_DASSERT(size % (hn::Lanes(df) * 2) == 0); + HWY_DASSERT(hn::IsAligned(df, vec_aligned)); + + VF vec0, vec1; + for (size_t i = 0; i < size; i += hn::Lanes(df) * 2) { + hn::LoadInterleaved2(df, vec_aligned + i, vec0, vec1); + hn::Store(vec0, df, out + i); + hn::Store(vec1, df, out + i + hn::Lanes(df)); + } +} + +// Simple version without tiling nor threading, but two offsets/outputs and +// always with addition. +template +HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0, + const size_t mat_ofs1, + const VecT* HWY_RESTRICT vec_aligned, + const AddT* HWY_RESTRICT add0, + const AddT* HWY_RESTRICT add1, + float* HWY_RESTRICT out0, + float* HWY_RESTRICT out1) { + PROFILER_ZONE("TwoOfsMatVecAddLoop"); + constexpr bool kVecEO = false; + const hn::ScalableTag df; + + for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) { + const size_t row_ofs0 = mat_ofs0 + (idx_row)*kInner; + const size_t row_ofs1 = mat_ofs1 + (idx_row)*kInner; + out0[idx_row] = hwy::ConvertScalarTo(add0[idx_row]) + + Dot(df, mat, row_ofs0, vec_aligned, kInner); + out1[idx_row] = hwy::ConvertScalarTo(add1[idx_row]) + + Dot(df, mat, row_ofs1, vec_aligned, kInner); + } +} + +HWY_INLINE constexpr size_t MaxCols() { + // Vec + mat rows should fit into 32 KiB L1. + return 2048; +} + +template +HWY_INLINE constexpr size_t RowsPerStrip() { + // Aim for 128 work items to reduce pool overhead. Must be at least one + // vector; prefer a power of two for faster division. + constexpr size_t kLanes = hn::ScalableTag().MaxLanes(); + constexpr size_t kRowsPerStrip = + kOuter < 128 ? kLanes + : HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(kOuter / 128)); + return kRowsPerStrip; +} + +namespace detail { + +// For each i = [0, num_rows), compute partial (length `num_cols`) dot product +// of row i with `vec_aligned` and add into `out[i]`. The upper-left +// coordinate of the tile is r0, c0. 
+template +HWY_INLINE void AccumulatePartialDotProducts( + DF df, const ArrayT& mat, size_t mat_ofs, size_t mat_stride, size_t r0, + size_t c0, size_t num_rows, size_t num_cols, + const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) { + for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) { + const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride; + out[idx_row] += + Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols); + } +} + +// Same as AccumulatePartialDotProducts, but sets out[i] to the first partial +// dot product + init (if kInit), which avoids having to zero-initialize and +// accumulate. +template +HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat, + size_t mat_ofs, size_t mat_stride, + size_t r0, size_t c0, + size_t num_rows, size_t num_cols, + const VecT* HWY_RESTRICT vec_aligned, + const InitT* HWY_RESTRICT init, + float* HWY_RESTRICT out) { + for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) { + const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride; + if constexpr (kInit) { + out[idx_row] = + hwy::ConvertScalarTo(init[idx_row + r0]) + + Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols); + } else { + out[idx_row] = + Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols); + } + } +} + +// Adds together partial dot products for all tiles with the same r0 (a +// horizontal strip of the entire matrix); the result is the full dot product +// for rows r in [r0, r0 + num_rows) + optionally the add vector, which we +// store into in out[r - r0]. +template +HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat, + size_t mat_ofs, size_t mat_stride, + size_t r0, size_t num_rows, + const VecT* HWY_RESTRICT vec_aligned, + const AddT* HWY_RESTRICT add, + float* HWY_RESTRICT out) { + // Tall and skinny: set `out` to the single dot product. + if (mat_stride < MaxCols()) { + SetFirstPartialDotProducts(df, mat, mat_ofs, mat_stride, r0, + 0, num_rows, mat_stride, + vec_aligned, add, out); + return; + } + + // We have at least MaxCols, so start by setting `out` to that: + SetFirstPartialDotProducts(df, mat, mat_ofs, mat_stride, r0, 0, + num_rows, MaxCols(), vec_aligned, + add, out); + // For further multiples of MaxCols, accumulate. Remainders handled below. + size_t c0 = MaxCols(); + for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) { + AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, + num_rows, MaxCols(), vec_aligned, out); + } + + if (c0 < mat_stride) { // Final cols + AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, + num_rows, mat_stride - c0, vec_aligned, + out); + } +} + +template +HWY_INLINE void MatVecAddInner(const ArrayT& mat, const size_t mat_ofs, + const VecT* HWY_RESTRICT const vec_aligned, + const AddT* HWY_RESTRICT const add, + float* HWY_RESTRICT out, hwy::ThreadPool& pool) { + const hn::ScalableTag df; + constexpr size_t kRowsPerStrip = RowsPerStrip(); + constexpr size_t kNumStrips = kOuter / kRowsPerStrip; + + // For each entire strip. 
+ pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR { + PROFILER_ZONE("MatVec.lambda"); + const size_t r0 = strip * kRowsPerStrip; + detail::FullDotProductsForStrip( + df, mat, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add, + out + r0); + }); + + // Remaining rows + const size_t r0 = kNumStrips * kRowsPerStrip; + if (r0 < kOuter) { + PROFILER_ZONE("MatVec remainder"); + const size_t num_rows = kOuter - r0; + detail::FullDotProductsForStrip( + df, mat, mat_ofs, kInner, r0, num_rows, vec_aligned, add, out + r0); + } +} + +} // namespace detail + +// Stores dot products of rows with `vec_aligned` + add the values from `add` +// (if kAdd), then stores them to `out`. +template +HWY_INLINE void MatVecT(const ArrayT& mat, const size_t mat_ofs, + const VecT* HWY_RESTRICT const vec_aligned, + const AddT* HWY_RESTRICT const add, + float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out, + hwy::ThreadPool& pool) { + PROFILER_ZONE("MatVecAdd"); + +#if !defined(HWY_NATIVE_DOT_BF16) || !HWY_NATIVE_DOT_BF16 + using MatT = typename ArrayT::value_type; + // Sfp -> float does not benefit enough to recoup the cost of ToEvenOddF32. + if constexpr (CompressTraits::kSupportsEvenOdd && + hwy::IsSameEither() && + !(hwy::IsSame() && + hwy::IsSame())) { + ToEvenOddF32(vec_aligned, kInner, even_odd); + detail::MatVecAddInner( + mat, mat_ofs, even_odd, add, out, pool); + return; + } +#else + (void)even_odd; +#endif + + detail::MatVecAddInner( + mat, mat_ofs, vec_aligned, add, out, pool); +} + +// With addition +template +HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs, + const VecT* HWY_RESTRICT const vec_aligned, + const AddT* HWY_RESTRICT const add, + float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out, + hwy::ThreadPool& pool) { + return MatVecT(mat, mat_ofs, vec_aligned, add, + even_odd, out, pool); +} + +// Without addition +template +HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs, + const VecT* HWY_RESTRICT const vec_aligned, + float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out, + hwy::ThreadPool& pool) { + MatVecT(mat, mat_ofs, vec_aligned, + /*add=*/static_cast(nullptr), + even_odd, out, pool); +} + +// Two matrices, same vector +template +HWY_NOINLINE void TwoMatVecT(const ArrayT& mat0, const ArrayT& mat1, + const size_t mat_ofs, + const VecT* HWY_RESTRICT vec_aligned, + const AddT* HWY_RESTRICT add0, + const AddT* HWY_RESTRICT add1, + float* HWY_RESTRICT out0, float* HWY_RESTRICT out1, + hwy::ThreadPool& pool) { + PROFILER_ZONE("TwoMatVecAdd"); + + const hn::ScalableTag df; + constexpr size_t kRowsPerStrip = RowsPerStrip(); + constexpr size_t kNumStrips = kOuter / kRowsPerStrip; + constexpr bool kVecIsEvenOdd = false; + + // For each entire strip. 
+ pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR { + PROFILER_ZONE("TwoMatVec.lambda"); + const size_t r0 = strip * kRowsPerStrip; + detail::FullDotProductsForStrip( + df, mat0, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add0, + out0 + r0); + detail::FullDotProductsForStrip( + df, mat1, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add1, + out1 + r0); + }); + + // Remaining rows + const size_t r0 = kNumStrips * kRowsPerStrip; + if (r0 < kOuter) { + PROFILER_ZONE("TwoMatVec remainder"); + const size_t num_rows = kOuter - r0; + detail::FullDotProductsForStrip( + df, mat0, mat_ofs, kInner, r0, num_rows, vec_aligned, add0, out0 + r0); + detail::FullDotProductsForStrip( + df, mat1, mat_ofs, kInner, r0, num_rows, vec_aligned, add1, out1 + r0); + } +} + +// With addition +template +HWY_NOINLINE void TwoMatVecAdd( + const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs, + const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0, + const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0, + float* HWY_RESTRICT out1, hwy::ThreadPool& pool) { + return TwoMatVecT( + mat0, mat1, mat_ofs, vec_aligned, add0, add1, out0, out1, pool); +} + +// Without addition +template +HWY_NOINLINE void TwoMatVec(const ArrayT& mat0, const ArrayT& mat1, + const size_t mat_ofs, + const VecT* HWY_RESTRICT vec_aligned, + float* HWY_RESTRICT out0, float* HWY_RESTRICT out1, + hwy::ThreadPool& pool) { + TwoMatVecT( + mat0, mat1, mat_ofs, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr, + out0, out1, pool); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace gcpp +HWY_AFTER_NAMESPACE(); + +#endif // NOLINT diff --git a/ops/matvec_test.cc b/ops/matvec_test.cc new file mode 100644 index 0000000..3e915af --- /dev/null +++ b/ops/matvec_test.cc @@ -0,0 +1,199 @@ +// Copyright 2023 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HWY_DISABLED_TARGETS +// Exclude HWY_SCALAR due to 2x bf16 -> f32. 
+#define HWY_DISABLED_TARGETS HWY_SCALAR +#endif + +#include +#include + +#include +#include +#include +#include + +#include "compression/compress.h" +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/contrib/thread_pool/thread_pool.h" + +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "ops/matvec_test.cc" // NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +#include "hwy/tests/test_util-inl.h" +// After highway.h +#include "ops/matvec-inl.h" +#include "ops/ops-inl.h" // MulByConst + +HWY_BEFORE_NAMESPACE(); +namespace gcpp { +namespace HWY_NAMESPACE { + +template +hwy::AlignedFreeUniquePtr SimpleMatVecAdd( + const CompressedArray& mat, + const hwy::AlignedFreeUniquePtr& vec, + const hwy::AlignedFreeUniquePtr& add) { + hwy::AlignedFreeUniquePtr uncompressed_mat = + hwy::AllocateAligned(kOuter * kInner); + hwy::AlignedFreeUniquePtr out = hwy::AllocateAligned(kOuter); + HWY_ASSERT(uncompressed_mat && out); + Decompress(mat, 0, uncompressed_mat.get(), kOuter * kInner); + MulByConst(mat.scale(), uncompressed_mat.get(), kOuter * kInner); + for (size_t idx_row = 0; idx_row < kOuter; idx_row++) { + out[idx_row] = add[idx_row]; + for (size_t idx_col = 0; idx_col < kInner; idx_col++) { + out[idx_row] += + uncompressed_mat[kInner * idx_row + idx_col] * vec[idx_col]; + } + } + return out; +} + +template +CompressedArray GenerateMat(size_t offset, + hwy::ThreadPool& pool) { + gcpp::CompressWorkingSet ws; + CompressedArray mat; + std::array content; + const float scale = 1.0f / kInner; + pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) { + for (size_t j = 0; j < kInner; j++) { + content[i * kInner + j] = + static_cast((i * kInner + j + offset) * scale); + } + }); + + Compress(content, ws, mat, pool); + mat.set_scale(1.9f); // Arbitrary value, different from 1. 
+ return mat; +} + +template +hwy::AlignedFreeUniquePtr GenerateVec(size_t offset) { + hwy::AlignedFreeUniquePtr vec = hwy::AllocateAligned(length); + HWY_ASSERT(vec); + for (size_t idx = 0; idx < length; idx++) { + vec[idx] = static_cast(idx + offset); + } + return vec; +} + +template +void AssertClose(const hwy::AlignedFreeUniquePtr& a, + const hwy::AlignedFreeUniquePtr& b) { + for (size_t idx = 0; idx < length; idx++) { + const float rel_abs_delta = std::abs(a[idx] - b[idx]) / + std::max(std::abs(a[idx]), std::abs(b[idx])); + EXPECT_LT(rel_abs_delta, 2e-6) + << "a[" << idx << "]=" << a[idx] << ", b[" << idx << "]=" << b[idx]; + } +} + +void TestMatVecAdd() { + hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads()); + constexpr size_t kOuter = 128 * 3; + constexpr size_t kInner = 128 * 5; + CompressedArray mat = + GenerateMat(0, pool); + hwy::AlignedFreeUniquePtr vec = GenerateVec(0); + hwy::AlignedFreeUniquePtr add = GenerateVec(0); + hwy::AlignedFreeUniquePtr even_odd = + hwy::AllocateAligned(kInner * pool.NumWorkers()); + hwy::AlignedFreeUniquePtr expected_out = + SimpleMatVecAdd(mat, vec, add); + hwy::AlignedFreeUniquePtr actual_out = + hwy::AllocateAligned(kOuter); + HWY_ASSERT(vec && add && even_odd && expected_out && actual_out); + MatVecAdd(mat, 0, vec.get(), add.get(), even_odd.get(), + actual_out.get(), pool); + AssertClose(actual_out, expected_out); +} + +void TestTwoMatVecAdd() { + hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads()); + constexpr size_t kOuter = 128 * 3; + constexpr size_t kInner = 128 * 5; + CompressedArray mat0 = + GenerateMat(0, pool); + CompressedArray mat1 = + GenerateMat(1, pool); + hwy::AlignedFreeUniquePtr vec = GenerateVec(0); + hwy::AlignedFreeUniquePtr add0 = GenerateVec(0); + hwy::AlignedFreeUniquePtr add1 = GenerateVec(1); + hwy::AlignedFreeUniquePtr expected_out0 = + SimpleMatVecAdd(mat0, vec, add0); + hwy::AlignedFreeUniquePtr expected_out1 = + SimpleMatVecAdd(mat1, vec, add1); + hwy::AlignedFreeUniquePtr actual_out0 = + hwy::AllocateAligned(kOuter); + hwy::AlignedFreeUniquePtr actual_out1 = + hwy::AllocateAligned(kOuter); + HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 && + expected_out1 && actual_out1); + TwoMatVecAdd(mat0, mat1, 0, vec.get(), add0.get(), add1.get(), + actual_out0.get(), actual_out1.get(), pool); + AssertClose(actual_out0, expected_out0); + AssertClose(actual_out1, expected_out1); +} + +void TestTwoOfsMatVecAddLoop() { + hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads()); + constexpr size_t kOuter = 128 * 3; + constexpr size_t kInner = 128 * 5; + CompressedArray mat = + GenerateMat(0, pool); + hwy::AlignedFreeUniquePtr vec = GenerateVec(0); + hwy::AlignedFreeUniquePtr add0 = GenerateVec(0); + hwy::AlignedFreeUniquePtr add1 = GenerateVec(1); + hwy::AlignedFreeUniquePtr expected_out0 = + SimpleMatVecAdd(mat, vec, add0); + hwy::AlignedFreeUniquePtr expected_out1 = + SimpleMatVecAdd(mat, vec, add1); + hwy::AlignedFreeUniquePtr actual_out0 = + hwy::AllocateAligned(kOuter); + hwy::AlignedFreeUniquePtr actual_out1 = + hwy::AllocateAligned(kOuter); + HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 && + expected_out1 && actual_out1); + TwoOfsMatVecAddLoop(mat, 0, 0, vec.get(), add0.get(), + add1.get(), actual_out0.get(), + actual_out1.get()); + AssertClose(actual_out0, expected_out0); + AssertClose(actual_out1, expected_out1); +} + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace gcpp +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE + +namespace gcpp { 
+HWY_BEFORE_TEST(MatVecTest); +HWY_EXPORT_AND_TEST_P(MatVecTest, TestMatVecAdd); +HWY_EXPORT_AND_TEST_P(MatVecTest, TestTwoMatVecAdd); +HWY_EXPORT_AND_TEST_P(MatVecTest, TestTwoOfsMatVecAddLoop); +HWY_AFTER_TEST(); + +} // namespace gcpp + +#endif
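
Note on the tolerance change: the headline revision in ops/matmul_test.cc is the error-tolerance estimate. Instead of a fixed per-element relative tolerance, AssertClose now scales the allowed absolute error by the product of the maximum absolute column sums (the induced 1-norm) of A and of the transposed B, times the float epsilon and an empirical factor of 50. Below is a minimal scalar sketch of that check, independent of Highway and of the compressed-array types; the helper names and the plain-C++ setting are illustrative only, not part of the patch.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

// Maximum absolute column sum (induced 1-norm) of a row-major rows x cols matrix.
static double MaxColAbsSum(const std::vector<float>& m, size_t rows, size_t cols) {
  double max_sum = 0.0;
  for (size_t c = 0; c < cols; ++c) {
    double sum = 0.0;
    for (size_t r = 0; r < rows; ++r) {
      sum += std::abs(static_cast<double>(m[r * cols + c]));
    }
    max_sum = std::max(max_sum, sum);
  }
  return max_sum;
}

// Mirrors the patch's AssertClose: a is MxK, b_trans is NxK (B stored transposed),
// expected/actual are MxN. Tolerance = 50 * eps * norm(A) * norm(B^T).
static bool CloseEnough(const std::vector<float>& a, const std::vector<float>& b_trans,
                        const std::vector<float>& expected, const std::vector<float>& actual,
                        size_t M, size_t K, size_t N) {
  const double eps = static_cast<double>(std::numeric_limits<float>::epsilon());
  const double tolerance = 50.0 * eps * MaxColAbsSum(a, M, K) * MaxColAbsSum(b_trans, N, K);
  for (size_t i = 0; i < M * N; ++i) {
    const double diff =
        std::abs(static_cast<double>(expected[i]) - static_cast<double>(actual[i]));
    if (diff > tolerance) {
      std::fprintf(stderr, "mismatch at %zu: %f vs %f (tolerance %f)\n", i,
                   expected[i], actual[i], tolerance);
      return false;
    }
  }
  return true;
}

The product of the two column-sum norms tracks the size of the sum over k of |a(i,k)| * |b(k,j)| terms that dominate rounding error in each length-K dot product, so larger matrices automatically receive a proportionally larger absolute tolerance; the factor 50 is a safety margin chosen in the patch, not a rigorous bound.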
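
Note on the strip partitioning: the MatVec code moved into ops/matvec-inl.h keeps the strip-based work split. RowsPerStrip() targets roughly 128 work items for the thread pool, stays at least one SIMD vector of rows tall, and prefers a power of two so dividing rows into strips stays cheap. A scalar sketch of that sizing rule, with the lane count passed in explicitly rather than derived from the Highway vector tag (hypothetical helper names, not from the patch):

#include <cstddef>

// Largest power of two <= x, for x > 0; stands in for 1 << hwy::FloorLog2(x).
constexpr size_t FloorPow2(size_t x) {
  size_t p = 1;
  while (p * 2 <= x) p *= 2;
  return p;
}

// Mirrors RowsPerStrip(): small matrices use one vector of rows per strip;
// otherwise aim for ~128 strips, but never fewer rows than one vector.
constexpr size_t RowsPerStripOf(size_t num_rows, size_t lanes) {
  if (num_rows < 128) return lanes;
  const size_t target = FloorPow2(num_rows / 128);
  return target > lanes ? target : lanes;
}

For example, 3072 rows with 16 float lanes gives FloorPow2(3072 / 128) = FloorPow2(24) = 16, so strips of 16 rows and 192 pool tasks; rows left over after the last full strip (none in this case) are handled by the serial remainder path after pool.Run.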