diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7cc586b..c0c749b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,12 +22,12 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f)
+FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f EXCLUDE_FROM_ALL)
 FetchContent_MakeAvailable(highway)
 
 ## Note: absl needs to be installed by sentencepiece. This will only happen if
 ## cmake is invoked with -DSPM_ENABLE_SHARED=OFF and -DSPM_ABSL_PROVIDER=module
-FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c)
+FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c EXCLUDE_FROM_ALL)
 FetchContent_MakeAvailable(sentencepiece)
 
 set(SOURCES
@@ -91,15 +91,6 @@ set(GEMMA_TEST_FILES
   ops_test.cc
 )
 
-include(FetchContent)
-FetchContent_Declare(
-  googletest
-  URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip
-)
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-FetchContent_MakeAvailable(googletest)
-include(GoogleTest)
-
 foreach (TESTFILE IN LISTS GEMMA_TEST_FILES)
   # The TESTNAME is the name without the extension or directory.
   get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
diff --git a/ops.h b/ops.h
index 0959c02..263e091 100644
--- a/ops.h
+++ b/ops.h
@@ -88,56 +88,91 @@ template <size_t kOuter>
 HWY_INLINE constexpr size_t RowsPerStrip() {
   // Aim for 128 work items to reduce pool overhead. Must be at least one
   // vector; prefer a power of two for faster division.
+  constexpr size_t kLanes = hn::ScalableTag<float>().MaxLanes();
   constexpr size_t kRowsPerStrip =
-      HWY_MAX(hn::ScalableTag<float>().MaxLanes(),
-              1ULL << hwy::FloorLog2(kOuter / 128));
+      kOuter < 128 ? kLanes
+                   : HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(kOuter / 128));
   return kRowsPerStrip;
 }
 
 // Simple version without tiling nor threading.
-template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
-          typename VecT>
-HWY_INLINE void MatVecLoop(const CompressedArray<MatT, kCapacity>& mat,
-                           const size_t mat_ofs,
-                           const VecT* HWY_RESTRICT vec_aligned,
-                           float* HWY_RESTRICT out) {
-  PROFILER_ZONE("MatVecLoop");
+template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
+          typename VecT, typename AddT>
+HWY_INLINE void MatVecAddLoop(const ArrayT& mat, const size_t mat_ofs,
+                              const VecT* HWY_RESTRICT vec_aligned,
+                              const AddT* HWY_RESTRICT add,
+                              float* HWY_RESTRICT out) {
+  PROFILER_ZONE("MatVecAddLoop");
   const hn::ScalableTag<float> df;
 
   for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
     const size_t row_ofs = mat_ofs + idx_row * kInner;
-    out[idx_row] = Dot(df, mat, row_ofs, vec_aligned, kInner);
+    if constexpr (kAdd) {
+      out[idx_row] = hwy::ConvertScalarTo<float>(add[idx_row]) +
+                     Dot(df, mat, row_ofs, vec_aligned, kInner);
+    } else {
+      out[idx_row] = Dot(df, mat, row_ofs, vec_aligned, kInner);
+    }
   }
 }
 
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
+HWY_INLINE void MatVecLoop(const ArrayT& mat, const size_t mat_ofs,
+                           const VecT* HWY_RESTRICT vec_aligned,
+                           float* HWY_RESTRICT out) {
+  MatVecAddLoop</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+      mat, mat_ofs, vec_aligned, /*add=*/nullptr, out);
+}
+
 // Simple version without tiling nor threading, but two offsets/outputs.
-template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
-          typename VecT>
-HWY_INLINE void TwoOfsMatVecLoop(const CompressedArray<MatT, kCapacity>& mat,
-                                 const size_t mat_ofs0, const size_t mat_ofs1,
-                                 const VecT* HWY_RESTRICT vec_aligned,
-                                 float* HWY_RESTRICT out0,
-                                 float* HWY_RESTRICT out1) {
+template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
+          typename VecT, typename AddT>
+HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0,
+                                    const size_t mat_ofs1,
+                                    const VecT* HWY_RESTRICT vec_aligned,
+                                    const AddT* HWY_RESTRICT add0,
+                                    const AddT* HWY_RESTRICT add1,
+                                    float* HWY_RESTRICT out0,
+                                    float* HWY_RESTRICT out1) {
   PROFILER_ZONE("MatVecLoop");
   const hn::ScalableTag<float> df;
 
   for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
     const size_t row_ofs0 = mat_ofs0 + (idx_row)*kInner;
     const size_t row_ofs1 = mat_ofs1 + (idx_row)*kInner;
-    out0[idx_row] = Dot(df, mat, row_ofs0, vec_aligned, kInner);
-    out1[idx_row] = Dot(df, mat, row_ofs1, vec_aligned, kInner);
+    if constexpr (kAdd) {
+      out0[idx_row] = hwy::ConvertScalarTo<float>(add0[idx_row]) +
+                      Dot(df, mat, row_ofs0, vec_aligned, kInner);
+      out1[idx_row] = hwy::ConvertScalarTo<float>(add1[idx_row]) +
+                      Dot(df, mat, row_ofs1, vec_aligned, kInner);
+    } else {
+      out0[idx_row] = Dot(df, mat, row_ofs0, vec_aligned, kInner);
+      out1[idx_row] = Dot(df, mat, row_ofs1, vec_aligned, kInner);
+    }
   }
 }
 
+// Simple version without tiling nor threading, but two offsets/outputs.
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
+HWY_INLINE void TwoOfsMatVecLoop(const ArrayT& mat, const size_t mat_ofs0,
+                                 const size_t mat_ofs1,
+                                 const VecT* HWY_RESTRICT vec_aligned,
+                                 float* HWY_RESTRICT out0,
+                                 float* HWY_RESTRICT out1) {
+  TwoOfsMatVecAddLoop</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+      mat, mat_ofs0, mat_ofs1, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
+      out0, out1);
+}
+
 namespace detail {
 
 // For each i = [0, num_rows), compute partial (length `num_cols`) dot product
 // of row i with `vec_aligned` and add into `out[i]`. The upper-left coordinate
 // of the tile is r0, c0.
-template <class DF, typename MatT, size_t kCapacity, typename VecT>
+template <class DF, typename ArrayT, typename VecT>
 HWY_INLINE void AccumulatePartialDotProducts(
-    DF df, const CompressedArray<MatT, kCapacity>& mat, size_t mat_ofs,
-    size_t mat_stride, size_t r0, size_t c0, size_t num_rows, size_t num_cols,
+    DF df, const ArrayT& mat, size_t mat_ofs, size_t mat_stride, size_t r0,
+    size_t c0, size_t num_rows, size_t num_cols,
     const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
   for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
     const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
@@ -145,37 +180,49 @@ HWY_INLINE void AccumulatePartialDotProducts(
   }
 }
 
-// Same as above, but sets out[i] to the first partial dot product, which
-// avoids having to zero-initialize and accumulate.
-template <class DF, typename MatT, size_t kCapacity, typename VecT>
-HWY_INLINE void SetFirstPartialDotProducts(
-    DF df, const CompressedArray<MatT, kCapacity>& mat, size_t mat_ofs,
-    size_t mat_stride, size_t r0, size_t c0, size_t num_rows, size_t num_cols,
-    const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
+// Same as above, but sets out[i] to the first partial dot product +
+// init (if kInit), which avoids having to zero-initialize and accumulate.
+template <bool kInit, class DF, typename ArrayT, typename VecT,
+          typename InitT>
+HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat,
+                                           size_t mat_ofs, size_t mat_stride,
+                                           size_t r0, size_t c0,
+                                           size_t num_rows, size_t num_cols,
+                                           const VecT* HWY_RESTRICT vec_aligned,
+                                           const InitT* HWY_RESTRICT init,
+                                           float* HWY_RESTRICT out) {
   for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
     const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
-    out[idx_row] = Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
+    if constexpr (kInit) {
+      out[idx_row] = hwy::ConvertScalarTo<float>(init[idx_row + r0]) +
+                     Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
+    } else {
+      out[idx_row] = Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
+    }
   }
 }
 
 // Adds together partial dot products for all tiles with the same r0 (a
 // horizontal strip of the entire matrix); the result is the full dot product
-// for rows r in [r0, r0 + num_rows), which we store into in out[r - r0].
-template <class DF, typename MatT, size_t kCapacity, typename VecT>
-HWY_INLINE void FullDotProductsForStrip(
-    DF df, const CompressedArray<MatT, kCapacity>& mat, size_t mat_ofs,
-    size_t mat_stride, size_t r0, size_t num_rows,
-    const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
+// for rows r in [r0, r0 + num_rows) + optionally the add vector, which we
+// store into in out[r - r0].
+template <bool kAdd, class DF, typename ArrayT, typename VecT, typename AddT>
+HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat,
+                                        size_t mat_ofs, size_t mat_stride,
+                                        size_t r0, size_t num_rows,
+                                        const VecT* HWY_RESTRICT vec_aligned,
+                                        const AddT* HWY_RESTRICT add,
+                                        float* HWY_RESTRICT out) {
   // Tall and skinny: set `out` to the single dot product.
   if (mat_stride < MaxCols()) {
-    SetFirstPartialDotProducts(df, mat, mat_ofs, mat_stride, r0, 0, num_rows,
-                               mat_stride, vec_aligned, out);
+    SetFirstPartialDotProducts<kAdd>(df, mat, mat_ofs, mat_stride, r0, 0,
+                                     num_rows, mat_stride, vec_aligned, add,
+                                     out);
     return;
   }
 
   // We have at least MaxCols, so start by setting `out` to that:
-  SetFirstPartialDotProducts(df, mat, mat_ofs, mat_stride, r0, 0, num_rows,
-                             MaxCols(), vec_aligned, out);
+  SetFirstPartialDotProducts<kAdd>(df, mat, mat_ofs, mat_stride, r0, 0,
+                                   num_rows, MaxCols(), vec_aligned, add, out);
 
   // For further multiples of MaxCols, accumulate. Remainders handled below.
   size_t c0 = MaxCols();
   for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) {
@@ -191,15 +238,16 @@ HWY_INLINE void FullDotProductsForStrip(
   }
 }
 
 }  // namespace detail
 
-// Stores dot products of rows with `vec_aligned` to a buffer, then stores them
-// to `out`.
-template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
-          typename VecT>
-HWY_INLINE void MatVec(const CompressedArray<MatT, kCapacity>& mat,
-                       const size_t mat_ofs,
-                       const VecT* HWY_RESTRICT const vec_aligned,
-                       float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
-  PROFILER_ZONE("MatVec");
+// Stores dot products of rows with `vec_aligned` + add the values from `add`
+// (if kAdd), then stores them to `out`.
+//
+template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
+          typename VecT, typename AddT>
+HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
+                          const VecT* HWY_RESTRICT const vec_aligned,
+                          const AddT* HWY_RESTRICT const add,
+                          float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
+  PROFILER_ZONE("MatVecAdd");
   const hn::ScalableTag<float> df;
   constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
@@ -209,8 +257,9 @@ HWY_INLINE void MatVec(const CompressedArray<MatT, kCapacity>& mat,
   pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
     PROFILER_ZONE("MatVec.lambda");
     const size_t r0 = strip * kRowsPerStrip;
-    detail::FullDotProductsForStrip(df, mat, mat_ofs, kInner, r0, kRowsPerStrip,
-                                    vec_aligned, out + r0);
+    detail::FullDotProductsForStrip<kAdd>(df, mat, mat_ofs, kInner, r0,
+                                          kRowsPerStrip, vec_aligned, add,
+                                          out + r0);
   });
 
   // Remaining rows
@@ -218,11 +267,19 @@ HWY_INLINE void MatVec(const CompressedArray<MatT, kCapacity>& mat,
   if (r0 < kOuter) {
     PROFILER_ZONE("MatVec remainder");
     const size_t num_rows = kOuter - r0;
-    detail::FullDotProductsForStrip(df, mat, mat_ofs, kInner, r0, num_rows,
-                                    vec_aligned, out + r0);
+    detail::FullDotProductsForStrip<kAdd>(df, mat, mat_ofs, kInner, r0,
+                                          num_rows, vec_aligned, add, out + r0);
   }
 }
 
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
+HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
+                       const VecT* HWY_RESTRICT const vec_aligned,
+                       float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
+  MatVecAdd</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+      mat, mat_ofs, vec_aligned, /*add=*/nullptr, out, pool);
+}
+
 template <class D, HWY_IF_F32_D(D)>
 static HWY_INLINE hn::Vec<D> Gelu(D d, hn::Vec<D> v) {
   const hn::Vec<D> kMul = hn::Set(d, 0.044715f);
@@ -277,69 +334,105 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void GeluMulToBF16(
   }
 }
 
+template <class D, HWY_IF_F32_D(D)>
+static HWY_INLINE hn::Vec<D> Sigmoid(D d, hn::Vec<D> v) {
+  using VF = hn::Vec<D>;
+  // Chebyshev polynomial coefficients for rational approximation
+  const VF c0 = hn::Set(d, 0.00949107017368078f);
+  const VF c1 = hn::Set(d, 0.0654858946800232f);
+  const VF c2 = hn::Set(d, 0.231547489762306f - 0.00949107017368078f);
+  const VF c3 = hn::Set(d, 0.530778527259827f);
+  const VF c4 = hn::Set(d, 0.855334937572479f);
+  const VF c5 = hn::Set(d, 0.500000894069672f);
+
+  const VF d0 = hn::Set(d, 0.130970627069473f);
+  const VF d1 = hn::Set(d, 3.99615288415589e-07f);
+  const VF d2 = hn::Set(d, 1.06155431270599f - 0.130970627069473f);
+  const VF d3 = hn::Set(d, 1.35144250634767e-06f);
+  const VF d4 = hn::Set(d, 1);
+
+  // The approximation works in range -12..12, but the input value is clamped
+  // in -11.5..11.5 since the approximation slightly overshoots after that.
+  // The function is nearly 0 for input values below -11.5 and nearly 1 for
+  // input values above 11.5.
+  const VF invtwelve = hn::Set(d, 1.0f / 12.0f);
+  const VF lo = hn::Set(d, -11.5f);
+  const VF hi = hn::Set(d, 11.5f);
+
+  VF f = hn::Clamp(v, lo, hi);
+  f = hn::Mul(f, invtwelve);
+  VF f2 = hn::Add(f, f);
+
+  VF a1 = hn::MulAdd(f2, c0, c1);
+  VF a2 = hn::MulAdd(f2, a1, c2);
+  VF a3 = hn::Sub(hn::MulAdd(f2, a2, c3), a1);
+  VF a4 = hn::Sub(hn::MulAdd(f2, a3, c4), a2);
+  VF f0 = hn::Sub(hn::MulAdd(f, a4, c5), a3);
+
+  VF b1 = hn::MulAdd(f2, d0, d1);
+  VF b2 = hn::MulAdd(f2, b1, d2);
+  VF b3 = hn::Sub(hn::MulAdd(f2, b2, d3), b1);
+  VF f1 = hn::Sub(hn::MulAdd(f, b3, d4), b2);
+
+  return Div(f0, f1);
+}
+
+// Sigmoid using the logistic function 1 / (1 + exp(-x[i]))
+static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(float* HWY_RESTRICT x,
+                                                  size_t size) {
+  namespace hn = hwy::HWY_NAMESPACE;
+  using D = hn::ScalableTag<float>;
+  hn::Transform(D(), x, size,
+                [](D d, hn::Vec<D> v) HWY_ATTR { return Sigmoid(d, v); });
+}
+
 // Two matrices, same vector
-// TODO(janwas): apply optimizations from MatVec/replace with above overload
-template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
-          typename VecT>
-HWY_NOINLINE void TwoMatVec(const CompressedArray<MatT, kCapacity>& mat0,
-                            const CompressedArray<MatT, kCapacity>& mat1,
+template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
+          typename VecT, typename AddT>
+HWY_NOINLINE void TwoMatVecAdd(
+    const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs,
+    const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
+    const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
+    float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
+  PROFILER_ZONE("TwoMatVecAdd");
+
+  const hn::ScalableTag<float> df;
+  constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
+  constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
+
+  // For each entire strip.
+  pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
+    PROFILER_ZONE("TwoMatVec.lambda");
+    const size_t r0 = strip * kRowsPerStrip;
+    detail::FullDotProductsForStrip<kAdd>(df, mat0, mat_ofs, kInner, r0,
+                                          kRowsPerStrip, vec_aligned, add0,
+                                          out0 + r0);
+    detail::FullDotProductsForStrip<kAdd>(df, mat1, mat_ofs, kInner, r0,
+                                          kRowsPerStrip, vec_aligned, add1,
+                                          out1 + r0);
+  });
+
+  // Remaining rows
+  const size_t r0 = kNumStrips * kRowsPerStrip;
+  if (r0 < kOuter) {
+    PROFILER_ZONE("TwoMatVec remainder");
+    const size_t num_rows = kOuter - r0;
+    detail::FullDotProductsForStrip<kAdd>(
+        df, mat0, mat_ofs, kInner, r0, num_rows, vec_aligned, add0, out0 + r0);
+    detail::FullDotProductsForStrip<kAdd>(
+        df, mat1, mat_ofs, kInner, r0, num_rows, vec_aligned, add1, out1 + r0);
+  }
+}
+
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
+HWY_NOINLINE void TwoMatVec(const ArrayT& mat0, const ArrayT& mat1,
                             const size_t mat_ofs,
                             const VecT* HWY_RESTRICT vec_aligned,
                             float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
                             hwy::ThreadPool& pool) {
-  const hn::ScalableTag<float> df;
-  const size_t NF = hn::Lanes(df);
-
-  // Process multiple rows at a time so that we write multiples of a cache line
-  // to avoid false sharing (>= 64).
-  constexpr size_t kRowsPerStrip = 128 / sizeof(float);
-  const uint32_t num_strips = kOuter / kRowsPerStrip;
-
-  // No remainder handling after ThreadPool.
-  static_assert(kOuter % kRowsPerStrip == 0, "Add remainder handling");
-
-  // Required for Stream loop, otherwise we might have partial vectors.
-  HWY_DASSERT(kRowsPerStrip >= NF);
-  pool.Run(0, num_strips,
-           [&](const uint32_t strip, size_t /*thread*/) HWY_ATTR {
-             // MSVC workaround: duplicate to ensure constexpr.
-             constexpr size_t kRowsPerStrip = 128 / sizeof(float);
-             // Software write-combining to avoid cache pollution from out.
-             // Although `out` may be used later, keeping it out of the cache
-             // now and avoiding RFOs is a consistent 5% overall win.
-             HWY_ALIGN float buf0[kRowsPerStrip];
-             HWY_ALIGN float buf1[kRowsPerStrip];
-
-             // Only handle entire strips here because the Stream is not masked.
-             const size_t begin = strip * kRowsPerStrip;
-             for (size_t idx_row = 0; idx_row < kRowsPerStrip; ++idx_row) {
-               const size_t row_ofs = mat_ofs + (begin + idx_row) * kInner;
-               buf0[idx_row] = Dot(df, mat0, row_ofs, vec_aligned, kInner);
-               buf1[idx_row] = Dot(df, mat1, row_ofs, vec_aligned, kInner);
-             }
-
-             HWY_UNROLL(4)
-             for (size_t i = 0; i != kRowsPerStrip; i += NF) {
-               hn::Stream(hn::Load(df, buf0 + i), df, out0 + begin + i);
-             }
-             HWY_UNROLL(4)
-             for (size_t i = 0; i != kRowsPerStrip; i += NF) {
-               hn::Stream(hn::Load(df, buf1 + i), df, out1 + begin + i);
-             }
-           });
-  hwy::FlushStream();
-}
-
-// Baseline Naive MatMul
-template <size_t kBatchSize, size_t kOuter, size_t kInner, typename MatT,
-          size_t kCapacity, typename VecT>
-HWY_NOINLINE void MatMul(const CompressedArray<MatT, kCapacity>& mat,
-                         const size_t mat_ofs, const VecT* HWY_RESTRICT vec,
-                         float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
-  for (size_t i = 0; i < kBatchSize; ++i) {
-    MatVec<kOuter, kInner>(
-        mat, mat_ofs, vec + i * kInner, out + i * kOuter, pool);
-  }
+  TwoMatVecAdd</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+      mat0, mat1, mat_ofs, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
+      out0, out1, pool);
 }
 
 static HWY_NOINLINE HWY_MAYBE_UNUSED float Dot(const float* HWY_RESTRICT a,
diff --git a/ops_test.cc b/ops_test.cc
index f908c39..c711946 100644
--- a/ops_test.cc
+++ b/ops_test.cc
@@ -356,6 +356,155 @@ void TestAllCreateDistribution() {
   TestCreateDistribution<5000>();
 }
 
+template <typename MatT, size_t kOuter, size_t kInner>
+CompressedArray<MatT, kOuter * kInner> GenerateMat(size_t offset) {
+  hwy::ThreadPool pool(0);
+  gcpp::CompressWorkingSet ws;
+  CompressedArray<MatT, kOuter * kInner> mat;
+  std::array<MatT, kOuter * kInner> content;
+  const float scale = 1.0f / kInner;
+  for (size_t i = 0; i < kOuter; i++) {
+    for (size_t j = 0; j < kInner; j++) {
+      content[i * kInner + j] = static_cast<MatT>((i + j + offset) * scale);
+    }
+  }
+  Compress(content, ws, mat, pool);
+  return mat;
+}
+
+template <size_t length>
+hwy::AlignedFreeUniquePtr<float[]> GenerateVec(size_t offset) {
+  hwy::AlignedFreeUniquePtr<float[]> vec = hwy::AllocateAligned<float>(length);
+  for (size_t idx = 0; idx < length; idx++) {
+    vec[idx] = static_cast<float>(idx + offset);
+  }
+  return vec;
+}
+
+template <typename MatT, size_t kOuter, size_t kInner>
+hwy::AlignedFreeUniquePtr<float[]> SimpleMatVecAdd(
+    const CompressedArray<MatT, kOuter * kInner>& mat,
+    const hwy::AlignedFreeUniquePtr<float[]>& vec,
+    const hwy::AlignedFreeUniquePtr<float[]>& add) {
+  hwy::AlignedFreeUniquePtr<float[]> uncompressed_mat =
+      hwy::AllocateAligned<float>(kOuter * kInner);
+  Decompress(mat, 0, uncompressed_mat.get(), kOuter * kInner);
+  hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(kOuter);
+  for (size_t idx_row = 0; idx_row < kOuter; idx_row++) {
+    out[idx_row] = add[idx_row];
+    for (size_t idx_col = 0; idx_col < kInner; idx_col++) {
+      out[idx_row] +=
+          uncompressed_mat[kInner * idx_row + idx_col] * vec[idx_col];
+    }
+  }
+  return out;
+}
+
+template <size_t length>
+void AssertClose(const hwy::AlignedFreeUniquePtr<float[]>& a,
+                 const hwy::AlignedFreeUniquePtr<float[]>& b) {
+  for (size_t idx = 0; idx < length; idx++) {
+    const float rel_abs_delta = std::abs(a[idx] - b[idx]) /
+                                std::max(std::abs(a[idx]), std::abs(b[idx]));
+    EXPECT_LT(rel_abs_delta, 2e-6)
+        << "a[" << idx << "]=" << a[idx] << ", b[" << idx << "]=" << b[idx];
+  }
+}
+
+void TestMatVecAdd() {
+  hwy::ThreadPool pool(0);
+  constexpr size_t kOuter = 128 * 3;
+  constexpr size_t kInner = 128 * 5;
+  CompressedArray<float, kOuter * kInner> mat =
+      GenerateMat<float, kOuter, kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat, vec, add);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out =
+      hwy::AllocateAligned<float>(kOuter);
+  MatVecAdd</*kAdd=*/true, kOuter, kInner>(mat, 0, vec.get(),
+                                           add.get(), actual_out.get(), pool);
+  AssertClose<kOuter>(actual_out, expected_out);
+}
+
+void TestMatVecAddLoop() {
+  constexpr size_t kOuter = 128 * 3;
+  constexpr size_t kInner = 128 * 5;
+  CompressedArray<float, kOuter * kInner> mat =
+      GenerateMat<float, kOuter, kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat, vec, add);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out =
+      hwy::AllocateAligned<float>(kOuter);
+  MatVecAddLoop</*kAdd=*/true, kOuter, kInner>(mat, 0, vec.get(), add.get(),
+                                               actual_out.get());
+  AssertClose<kOuter>(actual_out, expected_out);
+}
+
+void TestTwoMatVecAdd() {
+  hwy::ThreadPool pool(0);
+  constexpr size_t kOuter = 128 * 3;
+  constexpr size_t kInner = 128 * 5;
+  CompressedArray<float, kOuter * kInner> mat0 =
+      GenerateMat<float, kOuter, kInner>(0);
+  CompressedArray<float, kOuter * kInner> mat1 =
+      GenerateMat<float, kOuter, kInner>(1);
+  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add0 = GenerateVec<kOuter>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add1 = GenerateVec<kOuter>(1);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out0 =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat0, vec, add0);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out1 =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat1, vec, add1);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out0 =
+      hwy::AllocateAligned<float>(kOuter);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out1 =
+      hwy::AllocateAligned<float>(kOuter);
+  TwoMatVecAdd</*kAdd=*/true, kOuter, kInner>(mat0, mat1, 0, vec.get(),
+                                              add0.get(), add1.get(),
+                                              actual_out0.get(),
+                                              actual_out1.get(), pool);
+  AssertClose<kOuter>(actual_out0, expected_out0);
+  AssertClose<kOuter>(actual_out1, expected_out1);
+}
+
+void TestTwoOfsMatVecAddLoop() {
+  constexpr size_t kOuter = 128 * 3;
+  constexpr size_t kInner = 128 * 5;
+  CompressedArray<float, kOuter * kInner> mat =
+      GenerateMat<float, kOuter, kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add0 = GenerateVec<kOuter>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add1 = GenerateVec<kOuter>(1);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out0 =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat, vec, add0);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out1 =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat, vec, add1);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out0 =
+      hwy::AllocateAligned<float>(kOuter);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out1 =
+      hwy::AllocateAligned<float>(kOuter);
+  TwoOfsMatVecAddLoop</*kAdd=*/true, kOuter, kInner>(mat, 0, 0, vec.get(),
+                                                     add0.get(), add1.get(),
+                                                     actual_out0.get(),
+                                                     actual_out1.get());
+  AssertClose<kOuter>(actual_out0, expected_out0);
+  AssertClose<kOuter>(actual_out1, expected_out1);
+}
+
+void TestSigmoid() {
+  std::vector<float> values;
+  for (int i = -150; i <= 150; ++i) {
+    values.push_back(.1f * i);
+  }
+  std::vector<float> result = values;
+  Sigmoid(result.data(), result.size());
+
+  for (size_t i = 0; i < values.size(); i++) {
+    const float max_error = 0.00007;
+    float value = values[i];
+    float approx = result[i];
+    float expected = (1 / (1 + std::exp(-values[i])));
+    EXPECT_NEAR(approx, expected, max_error) << "Input: " << value;
+  }
+}
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace gcpp
@@ -371,9 +520,15 @@ HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConst);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstAndAdd);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSoftmax);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllCreateDistribution);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestMatVecAdd);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestMatVecAddLoop);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestTwoMatVecAdd);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestTwoOfsMatVecAddLoop);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestSigmoid);
 
 #ifdef HWY_AFTER_TEST
 HWY_AFTER_TEST();
 #endif
+
 }  // namespace gcpp
 #endif
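Reviewer note, not part of the diff: the new MatVecAdd/MatVecAddLoop paths compute a fused matrix-vector product plus a per-row bias, out[r] = add[r] + dot(row r of mat, vec), which is exactly what SimpleMatVecAdd in ops_test.cc checks against. Below is a minimal scalar sketch in plain C++ for readers who want the semantics without the Highway/CompressedArray machinery; ReferenceMatVecAdd and its row-major float layout are illustrative assumptions, not part of the gemma.cpp API.

// Scalar reference for the fused op: out[r] = add[r] + dot(mat row r, vec).
#include <cstddef>
#include <vector>

// mat is outer x inner, row-major; vec has inner entries; add has outer.
std::vector<float> ReferenceMatVecAdd(const std::vector<float>& mat,
                                      const std::vector<float>& vec,
                                      const std::vector<float>& add,
                                      size_t outer, size_t inner) {
  std::vector<float> out(outer);
  for (size_t r = 0; r < outer; ++r) {
    float sum = add[r];  // start from the bias instead of zero-initializing
    for (size_t c = 0; c < inner; ++c) {
      sum += mat[r * inner + c] * vec[c];
    }
    out[r] = sum;
  }
  return out;
}

Starting the accumulator from the bias rather than zero is the same idea the kInit/kAdd template flags enable inside SetFirstPartialDotProducts, so no separate pass over `out` is needed.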