// Copyright 2023 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #ifndef HWY_DISABLED_TARGETS #define HWY_DISABLED_TARGETS HWY_SCALAR #endif #include #include #include #include #include #include #include #include "compression/compress.h" #include "hwy/aligned_allocator.h" #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" // clang-format off #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "gemma/ops_test.cc" //NOLINT // clang-format on #include "hwy/foreach_target.h" // IWYU pragma: keep #include "hwy/highway.h" #include "hwy/tests/test_util-inl.h" // After highway.h #include "gemma/ops.h" HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; template struct ForeachCountAndMisalign { template HWY_NOINLINE void operator()(T /*unused*/, D d) const { hwy::RandomState rng; const size_t N = Lanes(d); const size_t misalignments[3] = {0, N / 4, 3 * N / 5}; for (size_t count = 0; count < 2 * N; ++count) { for (size_t ma : misalignments) { for (size_t mb : misalignments) { Test()(d, count, ma, mb, rng); } } } } }; template T Random(hwy::RandomState& rng) { const int32_t bits = static_cast(Random32(&rng)) & 1023; const double val = (bits - 512) / 64.0; // Clamp negative to zero for unsigned types. return hwy::ConvertScalarTo( HWY_MAX(hwy::ConvertScalarTo(hwy::LowestValue()), val)); } HWY_NOINLINE void SourceAddFrom(const float* HWY_RESTRICT other, float* HWY_RESTRICT x, size_t size) { for (size_t i = 0; i < size; ++i) { x[i] += other[i]; } } HWY_NOINLINE void SourceMulBy(const float* HWY_RESTRICT other, float* HWY_RESTRICT x, size_t size, size_t max_pos) { HWY_DASSERT(max_pos <= size); for (size_t i = 0; i < max_pos; ++i) { x[i] *= other[i]; } } HWY_NOINLINE void SourceMulByConst(float c, float* HWY_RESTRICT x, size_t size, size_t max_pos) { for (size_t i = 0; i < max_pos; ++i) { x[i] *= c; } } HWY_NOINLINE void SourceMulByConstAndAdd(float c, const float* HWY_RESTRICT x, float* HWY_RESTRICT out, size_t size, size_t max_pos) { for (size_t i = 0; i < max_pos; ++i) { out[i] += x[i] * c; } } HWY_NOINLINE void SourceSoftmax(float* HWY_RESTRICT x, size_t size, size_t mask_pos) { HWY_DASSERT(size != 0); HWY_DASSERT(mask_pos <= size); float sum = 0.0; const float maxval = *std::max_element(x, x + mask_pos); for (size_t i = 0; i < mask_pos; ++i) { x[i] = std::exp(x[i] - maxval); sum += x[i]; } const float scale = 1.0f / sum; for (size_t i = 0; i < mask_pos; ++i) { x[i] *= scale; } } template HWY_NOINLINE std::discrete_distribution SourceCreateDistribution( std::array& top_k, float temperature) { // re-normalize distribution for (size_t i = 0; i < k; ++i) { top_k[i] = exp(log(top_k[i]) / temperature); } float denominator = 0.0f; for (size_t i = 0; i < k; ++i) { denominator += top_k[i]; } denominator = 1.0f / denominator; MulByConst(denominator, top_k.data(), k); return std::discrete_distribution(std::begin(top_k), std::end(top_k)); } struct TestAddFrom { template void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, hwy::RandomState& rng) { using T = hn::TFromD; hwy::AlignedFreeUniquePtr px = hwy::AllocateAligned(HWY_MAX(1, misalign_a + count)); hwy::AlignedFreeUniquePtr pe = hwy::AllocateAligned(HWY_MAX(1, misalign_a + count)); hwy::AlignedFreeUniquePtr po = hwy::AllocateAligned(HWY_MAX(1, misalign_b + count)); HWY_ASSERT(px && pe && po); T* x = px.get() + misalign_a; T* e = pe.get() + misalign_a; T* o = po.get() + misalign_b; for (size_t i = 0; i < count; ++i) { x[i] = Random(rng); e[i] = x[i]; o[i] = Random(rng); } SourceAddFrom(o, e, count); AddFrom(o, x, count); hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__, __LINE__); } }; struct TestMulBy { template void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, hwy::RandomState& rng) { using T = hn::TFromD; hwy::AlignedFreeUniquePtr px = hwy::AllocateAligned(HWY_MAX(1, misalign_a + count)); hwy::AlignedFreeUniquePtr pe = hwy::AllocateAligned(HWY_MAX(1, misalign_a + count)); hwy::AlignedFreeUniquePtr po = hwy::AllocateAligned(HWY_MAX(1, misalign_b + count)); HWY_ASSERT(px && pe && po); T* x = px.get() + misalign_a; T* e = pe.get() + misalign_a; T* o = po.get() + misalign_b; for (size_t i = 0; i < count; ++i) { x[i] = Random(rng); e[i] = x[i]; o[i] = Random(rng); } SourceMulBy(o, e, count, count); MulBy(o, x, count, count); hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__, __LINE__); } }; struct TestMulByConstAndAdd { template void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, hwy::RandomState& rng) { using T = hn::TFromD; hwy::AlignedFreeUniquePtr px = hwy::AllocateAligned(HWY_MAX(1, misalign_a + count)); hwy::AlignedFreeUniquePtr pe = hwy::AllocateAligned(HWY_MAX(1, misalign_a + count)); hwy::AlignedFreeUniquePtr po = hwy::AllocateAligned(HWY_MAX(1, misalign_b + count)); HWY_ASSERT(px && pe && po); T* x = px.get() + misalign_a; T* e = pe.get() + misalign_a; T* o = po.get() + misalign_b; for (size_t i = 0; i < count; ++i) { x[i] = Random(rng); e[i] = x[i]; o[i] = Random(rng); } T constant = Random(rng); SourceMulByConstAndAdd(constant, o, e, count, count); MulByConstAndAdd(constant, o, x, count, count); hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__, __LINE__); } }; struct TestMulByConst { template void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, hwy::RandomState& rng) { if (misalign_b == 0) return; using T = hn::TFromD; hwy::AlignedFreeUniquePtr px = hwy::AllocateAligned(HWY_MAX(1, misalign_a + count)); hwy::AlignedFreeUniquePtr pe = hwy::AllocateAligned(HWY_MAX(1, misalign_a + count)); HWY_ASSERT(px && pe); T* x = px.get() + misalign_a; T* e = pe.get() + misalign_a; for (size_t i = 0; i < count; ++i) { x[i] = Random(rng); e[i] = x[i]; } T constant = Random(rng); SourceMulByConst(constant, e, count, count); MulByConst(constant, x, count, count); hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__, __LINE__); } }; struct TestSoftmax { template void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b, hwy::RandomState& rng) { if (count == 0) return; // *Softmax would assert if (misalign_b == 0) return; using T = hn::TFromD; hwy::AlignedFreeUniquePtr px = hwy::AllocateAligned(HWY_MAX(1, misalign_a + count)); hwy::AlignedFreeUniquePtr pe = hwy::AllocateAligned(HWY_MAX(1, misalign_a + count)); HWY_ASSERT(px && pe); T* x = px.get() + misalign_a; T* e = pe.get() + misalign_a; for (size_t i = 0; i < count; ++i) { x[i] = Random(rng); e[i] = x[i]; } SourceSoftmax(e, count, count); Softmax(x, count, count); T sum = 0.0f; for (size_t i = 0; i < count; ++i) { sum += x[i]; double rel = std::abs(x[i] - e[i]) / e[i]; ASSERT_LT(rel, 1e-6) << "Mismatch on coordinate " << i << " out of " << count; } ASSERT_NEAR(sum, 1.0, 1e-6); } }; template struct TestCreateDistribution { void operator()(hwy::RandomState& rng) { std::array x; std::array e; for (size_t i = 0; i < k; ++i) { x[i] = Random(rng); e[i] = x[i]; } const float constant = Random(rng); auto expected = SourceCreateDistribution(e, constant); auto output = create_distribution(x, constant); AssertEqual(expected, output, hwy::TargetName(HWY_TARGET), __FILE__, __LINE__); } }; void TestAllAddFrom() { hn::ForPartialVectors>()(float()); } void TestAllMulBy() { hn::ForPartialVectors>()(float()); } void TestAllMulByConst() { hn::ForPartialVectors>()(float()); } void TestAllMulByConstAndAdd() { hn::ForPartialVectors>()( float()); } void TestAllSoftmax() { hn::ForPartialVectors>()(float()); } void TestAllCreateDistribution() { TestCreateDistribution<2048>(); TestCreateDistribution<5000>(); } template CompressedArray GenerateMat(size_t offset, hwy::ThreadPool& pool) { gcpp::CompressWorkingSet ws; CompressedArray mat; std::array content; const float scale = 1.0f / kInner; pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) { for (size_t j = 0; j < kInner; j++) { content[i * kInner + j] = static_cast((i * kInner + j + offset) * scale); } }); Compress(content, ws, mat, pool); mat.set_scale(1.0f); return mat; } template CompressedArray GenerateZeroMat(hwy::ThreadPool& pool) { gcpp::CompressWorkingSet ws; CompressedArray mat; std::array content; pool.Run(0, kOuter, [&](const size_t i, size_t thread) { hwy::ZeroBytes(&content[i * kInner], kInner * sizeof(content[0])); }); Compress(content, ws, mat, pool); mat.set_scale(1.0f); return mat; } template std::unique_ptr> GenerateMatHeap( size_t offset, hwy::ThreadPool& pool) { gcpp::CompressWorkingSet ws; std::unique_ptr> mat = std::unique_ptr>( new CompressedArray); hwy::AlignedFreeUniquePtr content = hwy::AllocateAligned(kOuter * kInner); const float scale = 1.875f / (kInner * kOuter + offset); pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) { for (size_t j = 0; j < kInner; j++) { content[i * kInner + j] = static_cast((i * kInner + j + offset) * scale); } }); Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0, pool); mat->set_scale(1.0f); return mat; } template std::unique_ptr> GenerateTransposeMatHeap(size_t offset, hwy::ThreadPool& pool) { gcpp::CompressWorkingSet ws; std::unique_ptr> mat = std::unique_ptr>( new CompressedArray); hwy::AlignedFreeUniquePtr content = hwy::AllocateAligned(kOuter * kInner); const float scale = 1.875f / (kInner * kOuter + offset); pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) { for (size_t j = 0; j < kInner; j++) { content[j * kOuter + i] = static_cast((i * kInner + j + offset) * scale); } }); Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0, pool); mat->set_scale(1.0f); return mat; } template std::unique_ptr> GenerateZeroMatHeap( hwy::ThreadPool& pool) { gcpp::CompressWorkingSet ws; std::unique_ptr> mat = std::unique_ptr>( new CompressedArray); hwy::AlignedFreeUniquePtr content = hwy::AllocateAligned(kOuter * kInner); pool.Run(0, kOuter, [&](const size_t i, size_t thread) { hwy::ZeroBytes(&content[i * kInner], kInner * sizeof(content[0])); }); Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0, pool); mat->set_scale(1.0f); return mat; } template hwy::AlignedFreeUniquePtr GenerateVec(size_t offset) { hwy::AlignedFreeUniquePtr vec = hwy::AllocateAligned(length); HWY_ASSERT(vec); for (size_t idx = 0; idx < length; idx++) { vec[idx] = static_cast(idx + offset); } return vec; } // A simple matrix multiplication. No optimization / tiling. template hwy::AlignedFreeUniquePtr SimpleMatMul( const hwy::AlignedFreeUniquePtr& a, const hwy::AlignedFreeUniquePtr& b) { hwy::AlignedFreeUniquePtr out = hwy::AllocateAligned(kM * kK); hwy::ZeroBytes(out.get(), kM * kK * sizeof(float)); int i, j, k; for (i = 0; i < kM; ++i) { for (j = 0; j < kK; ++j) { for (k = 0; k < kN; ++k) { out[i * kK + j] += a[i * kN + k] * b[k * kK + j]; } } } return out; } template hwy::AlignedFreeUniquePtr SimpleMatVecAdd( const CompressedArray& mat, const hwy::AlignedFreeUniquePtr& vec, const hwy::AlignedFreeUniquePtr& add) { hwy::AlignedFreeUniquePtr uncompressed_mat = hwy::AllocateAligned(kOuter * kInner); hwy::AlignedFreeUniquePtr out = hwy::AllocateAligned(kOuter); HWY_ASSERT(uncompressed_mat && out); Decompress(mat, 0, uncompressed_mat.get(), kOuter * kInner); for (size_t idx_row = 0; idx_row < kOuter; idx_row++) { out[idx_row] = add[idx_row]; for (size_t idx_col = 0; idx_col < kInner; idx_col++) { out[idx_row] += uncompressed_mat[kInner * idx_row + idx_col] * vec[idx_col]; } } return out; } template void AssertClose(const hwy::AlignedFreeUniquePtr& a, const hwy::AlignedFreeUniquePtr& b) { for (size_t idx = 0; idx < length; idx++) { const float rel_abs_delta = std::abs(a[idx] - b[idx]) / std::max(std::abs(a[idx]), std::abs(b[idx])); EXPECT_LT(rel_abs_delta, 2e-6) << "a[" << idx << "]=" << a[idx] << ", b[" << idx << "]=" << b[idx]; } } template void TestTiledBatchMatMul() { fprintf(stderr, "TestTiledBatchMatMul %lu, %lu, %lu, add=%d ", kM, kN, kK, kAdd); hwy::ThreadPool pool(3); std::unique_ptr> a = GenerateMatHeap(0, pool); std::unique_ptr> b = GenerateMatHeap(0, pool); std::unique_ptr> add = GenerateMatHeap(0, pool); std::unique_ptr> c_slow = GenerateZeroMatHeap(pool); MatMulSlowBatch(kM, a->data(), b->data(), kAdd ? add->data() : nullptr, c_slow->data()); hwy::AlignedFreeUniquePtr c = hwy::AllocateAligned(kM * kK); std::unique_ptr> b_trans = GenerateTransposeMatHeap(0, pool); if (kAdd) { MatMul_4x4_Batch_Add(kM, a->data(), b_trans->data(), c.get(), add->data(), pool); } else { MatMul_4x4_Batch(kM, a->data(), b_trans->data(), c.get(), pool); } AssertClose(c_slow->data(), c.get(), kM * kK); } void TestAllTiledBatchMatMul() { using BF16 = hwy::bfloat16_t; using F32 = float; using SFP = SfpStream; // medium-sized square test TestTiledBatchMatMul<512, 512, 512, /*kAdd=*/false, F32>(); TestTiledBatchMatMul<512, 512, 512, /*kAdd=*/true, BF16>(); TestTiledBatchMatMul<512, 512, 512, /*kAdd=*/false, F32, BF16>(); TestTiledBatchMatMul<512, 512, 512, /*kAdd=*/true, BF16, F32>(); TestTiledBatchMatMul<512, 512, 512, /*kAdd=*/false, F32, SFP>(); TestTiledBatchMatMul<512, 512, 512, /*kAdd=*/true, BF16, SFP>(); // minimal non-square test. kK must be at least 2 vectors. TestTiledBatchMatMul<35, 128, 32, /*kAdd=*/false, F32>(); TestTiledBatchMatMul<34, 128, 32, /*kAdd=*/true, BF16>(); TestTiledBatchMatMul<33, 128, 32, /*kAdd=*/false, F32, BF16>(); TestTiledBatchMatMul<33, 128, 32, /*kAdd=*/true, BF16, F32>(); TestTiledBatchMatMul<31, 128, 32, /*kAdd=*/false, F32, SFP>(); TestTiledBatchMatMul<29, 128, 32, /*kAdd=*/true, BF16, SFP>(); TestTiledBatchMatMul<4, 128, 32, /*kAdd=*/true, F32>(); TestTiledBatchMatMul<4, 128, 32, /*kAdd=*/false, BF16>(); TestTiledBatchMatMul<4, 128, 32, /*kAdd=*/true, F32, BF16>(); TestTiledBatchMatMul<4, 128, 32, /*kAdd=*/false, BF16, F32>(); TestTiledBatchMatMul<4, 128, 32, /*kAdd=*/true, F32, SFP>(); TestTiledBatchMatMul<4, 128, 32, /*kAdd=*/false, BF16, SFP>(); TestTiledBatchMatMul<3, 128, 32, /*kAdd=*/false, F32>(); TestTiledBatchMatMul<3, 128, 32, /*kAdd=*/true, BF16>(); TestTiledBatchMatMul<3, 128, 32, /*kAdd=*/false, F32, BF16>(); TestTiledBatchMatMul<3, 128, 32, /*kAdd=*/true, BF16, F32>(); TestTiledBatchMatMul<3, 128, 32, /*kAdd=*/false, F32, SFP>(); TestTiledBatchMatMul<3, 128, 32, /*kAdd=*/true, BF16, SFP>(); TestTiledBatchMatMul<2, 128, 64, /*kAdd=*/true, F32>(); TestTiledBatchMatMul<2, 128, 64, /*kAdd=*/false, BF16>(); TestTiledBatchMatMul<2, 128, 64, /*kAdd=*/true, F32, BF16>(); TestTiledBatchMatMul<2, 128, 64, /*kAdd=*/false, BF16, F32>(); TestTiledBatchMatMul<2, 128, 64, /*kAdd=*/true, F32, SFP>(); TestTiledBatchMatMul<2, 128, 64, /*kAdd=*/false, BF16, SFP>(); TestTiledBatchMatMul<1, 128, 32, /*kAdd=*/false, F32>(); TestTiledBatchMatMul<1, 128, 32, /*kAdd=*/true, BF16>(); TestTiledBatchMatMul<1, 128, 32, /*kAdd=*/false, F32, BF16>(); TestTiledBatchMatMul<1, 128, 32, /*kAdd=*/true, BF16, F32>(); TestTiledBatchMatMul<1, 128, 32, /*kAdd=*/false, F32, SFP>(); TestTiledBatchMatMul<1, 128, 32, /*kAdd=*/true, BF16, SFP>(); // large-scale test // TODO(philculliton): investigate rounding issues with large matrices. // Causes test timeout. // TestTiledBatchMatMul<512, 24576, 3072, float>(); } void TestMatVecAdd() { hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads()); constexpr size_t kOuter = 128 * 3; constexpr size_t kInner = 128 * 5; CompressedArray mat = GenerateMat(0, pool); hwy::AlignedFreeUniquePtr vec = GenerateVec(0); hwy::AlignedFreeUniquePtr add = GenerateVec(0); hwy::AlignedFreeUniquePtr even_odd = hwy::AllocateAligned(kInner * pool.NumWorkers()); hwy::AlignedFreeUniquePtr expected_out = SimpleMatVecAdd(mat, vec, add); hwy::AlignedFreeUniquePtr actual_out = hwy::AllocateAligned(kOuter); HWY_ASSERT(vec && add && even_odd && expected_out && actual_out); MatVecAdd(mat, 0, vec.get(), add.get(), even_odd.get(), actual_out.get(), pool); AssertClose(actual_out, expected_out); } void TestTwoMatVecAdd() { hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads()); constexpr size_t kOuter = 128 * 3; constexpr size_t kInner = 128 * 5; CompressedArray mat0 = GenerateMat(0, pool); CompressedArray mat1 = GenerateMat(1, pool); hwy::AlignedFreeUniquePtr vec = GenerateVec(0); hwy::AlignedFreeUniquePtr add0 = GenerateVec(0); hwy::AlignedFreeUniquePtr add1 = GenerateVec(1); hwy::AlignedFreeUniquePtr expected_out0 = SimpleMatVecAdd(mat0, vec, add0); hwy::AlignedFreeUniquePtr expected_out1 = SimpleMatVecAdd(mat1, vec, add1); hwy::AlignedFreeUniquePtr actual_out0 = hwy::AllocateAligned(kOuter); hwy::AlignedFreeUniquePtr actual_out1 = hwy::AllocateAligned(kOuter); HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 && expected_out1 && actual_out1); TwoMatVecAdd(mat0, mat1, 0, vec.get(), add0.get(), add1.get(), actual_out0.get(), actual_out1.get(), pool); AssertClose(actual_out0, expected_out0); AssertClose(actual_out1, expected_out1); } void TestTwoOfsMatVecAddLoop() { hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads()); constexpr size_t kOuter = 128 * 3; constexpr size_t kInner = 128 * 5; CompressedArray mat = GenerateMat(0, pool); hwy::AlignedFreeUniquePtr vec = GenerateVec(0); hwy::AlignedFreeUniquePtr add0 = GenerateVec(0); hwy::AlignedFreeUniquePtr add1 = GenerateVec(1); hwy::AlignedFreeUniquePtr expected_out0 = SimpleMatVecAdd(mat, vec, add0); hwy::AlignedFreeUniquePtr expected_out1 = SimpleMatVecAdd(mat, vec, add1); hwy::AlignedFreeUniquePtr actual_out0 = hwy::AllocateAligned(kOuter); hwy::AlignedFreeUniquePtr actual_out1 = hwy::AllocateAligned(kOuter); HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 && expected_out1 && actual_out1); TwoOfsMatVecAddLoop(mat, 0, 0, vec.get(), add0.get(), add1.get(), actual_out0.get(), actual_out1.get()); AssertClose(actual_out0, expected_out0); AssertClose(actual_out1, expected_out1); } void TestSigmoid() { std::vector values; for (int i = -150; i <= 150; ++i) { values.push_back(.1f * i); } std::vector result = values; Sigmoid(result.data(), result.size()); for (size_t i = 0; i < values.size(); i++) { const float max_error = 0.00007; float value = values[i]; float approx = result[i]; float expected = (1 / (1 + std::exp(-values[i]))); EXPECT_NEAR(approx, expected, max_error) << "Input: " << value; } } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace gcpp HWY_AFTER_NAMESPACE(); #if HWY_ONCE namespace gcpp { HWY_BEFORE_TEST(OpsTest); HWY_EXPORT_AND_TEST_P(OpsTest, TestAllAddFrom); HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulBy); HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConst); HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstAndAdd); HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSoftmax); HWY_EXPORT_AND_TEST_P(OpsTest, TestAllCreateDistribution); HWY_EXPORT_AND_TEST_P(OpsTest, TestAllTiledBatchMatMul); HWY_EXPORT_AND_TEST_P(OpsTest, TestMatVecAdd); HWY_EXPORT_AND_TEST_P(OpsTest, TestTwoMatVecAdd); HWY_EXPORT_AND_TEST_P(OpsTest, TestTwoOfsMatVecAddLoop); HWY_EXPORT_AND_TEST_P(OpsTest, TestSigmoid); #ifdef HWY_AFTER_TEST HWY_AFTER_TEST(); #endif } // namespace gcpp #endif