diff --git a/BUILD.bazel b/BUILD.bazel
index 1f9e210..e141e95 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -376,6 +376,7 @@ cc_test(
         ":test_util",
         ":threading_context",
         "@googletest//:gtest_main",  # buildcleaner: keep
+        "//compression:test_util",
         "//compression:types",
         "@highway//:hwy",
         "@highway//:hwy_test_util",
diff --git a/compression/compress-inl.h b/compression/compress-inl.h
index 512f8fa..e2b3bcc 100644
--- a/compression/compress-inl.h
+++ b/compression/compress-inl.h
@@ -709,6 +709,242 @@ HWY_INLINE float DecompressAndCall(D, const PackedSpan<const VT> v,
                                    comp3);
 }
 
+// Similar to `hn::Transform*`, but for compressed `T`. Used by ops-inl.h.
+// `DF` is the decompressed type, typically `float`.
+template <class DF, typename T, class Func>
+HWY_INLINE void DecompressAndCompressInplace(DF df, T* HWY_RESTRICT inout,
+                                             size_t num, Func&& func) {
+  const auto packed_inout = MakeSpan(inout, num);
+
+  using VF = hn::Vec<DF>;
+  HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
+  size_t i = 0;
+  if (num >= 2 * NF) {
+    for (; i <= num - 2 * NF; i += 2 * NF) {
+      VF v0, v1;
+      Decompress2(df, packed_inout, i, v0, v1);
+      const VF out0 = func(df, v0);
+      const VF out1 = func(df, v1);
+      Compress2(df, out0, out1, packed_inout, i);
+    }
+  }
+
+  const size_t remaining = num - i;
+  HWY_DASSERT(remaining < 2 * NF);
+  if (HWY_UNLIKELY(remaining != 0)) {
+    HWY_ALIGN float buf_inout[2 * hn::MaxLanes(df)];
+    // Ensure the second vector is zeroed even if remaining <= NF.
+    hn::Store(hn::Zero(df), df, buf_inout + NF);
+    DecompressAndZeroPad(df, packed_inout, i, buf_inout, remaining);
+    const VF v0 = hn::Load(df, buf_inout);
+    const VF v1 = hn::Load(df, buf_inout + NF);
+    const VF out0 = func(df, v0);
+    const VF out1 = func(df, v1);
+    Compress2(df, out0, out1, MakeSpan(buf_inout, 2 * NF), 0);
+    // Clang generates incorrect code for CopyBytes if num = 2.
+    for (size_t j = 0; j < remaining; ++j) {
+      inout[i + j] = hwy::ConvertScalarTo<T>(buf_inout[j]);
+    }
+  }
+}
+
+// One extra argument. `DF` is the decompressed type, typically `float`.
+template <class DF, typename T, typename T1, class Func>
+HWY_INLINE void Decompress1AndCompressInplace(DF df, T* HWY_RESTRICT inout,
+                                              size_t num,
+                                              const T1* HWY_RESTRICT p1,
+                                              Func&& func) {
+  const auto packed_inout = MakeSpan(inout, num);
+  const auto packed1 = MakeSpan(p1, num);
+
+  using VF = hn::Vec<DF>;
+  HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
+  size_t i = 0;
+  if (num >= 2 * NF) {
+    for (; i <= num - 2 * NF; i += 2 * NF) {
+      VF v0, v1;
+      Decompress2(df, packed_inout, i, v0, v1);
+      VF v10, v11;
+      Decompress2(df, packed1, i, v10, v11);
+      const VF out0 = func(df, v0, v10);
+      const VF out1 = func(df, v1, v11);
+      Compress2(df, out0, out1, packed_inout, i);
+    }
+  }
+
+  const size_t remaining = num - i;
+  HWY_DASSERT(remaining < 2 * NF);
+  if (HWY_UNLIKELY(remaining != 0)) {
+    HWY_ALIGN float buf_inout[2 * hn::MaxLanes(df)];
+    HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
+    // Ensure the second vector is zeroed even if remaining <= NF.
+    hn::Store(hn::Zero(df), df, buf_inout + NF);
+    hn::Store(hn::Zero(df), df, buf1 + NF);
+    DecompressAndZeroPad(df, packed_inout, i, buf_inout, remaining);
+    DecompressAndZeroPad(df, packed1, i, buf1, remaining);
+    const VF v0 = hn::Load(df, buf_inout);
+    const VF v1 = hn::Load(df, buf_inout + NF);
+    const VF v10 = hn::Load(df, buf1);
+    const VF v11 = hn::Load(df, buf1 + NF);
+    const VF out0 = func(df, v0, v10);
+    const VF out1 = func(df, v1, v11);
+    Compress2(df, out0, out1, MakeSpan(buf_inout, 2 * NF), 0);
+    // Clang generates incorrect code for CopyBytes if num = 2.
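+    // The decompressed f32 values in buf_inout are re-narrowed to T
+    // (e.g. BF16) one scalar at a time.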
+    for (size_t j = 0; j < remaining; ++j) {
+      inout[i + j] = hwy::ConvertScalarTo<T>(buf_inout[j]);
+    }
+  }
+}
+
+// Single input, separate output. `DF` is the decompressed type, typically
+// `float`.
+template <class DF, typename T, typename T1, class Func>
+HWY_INLINE void Decompress1AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
+                                         const T1* HWY_RESTRICT p1,
+                                         Func&& func) {
+  const auto packed_out = MakeSpan(out, num);
+  const auto packed1 = MakeSpan(p1, num);
+
+  using VF = hn::Vec<DF>;
+  HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
+  size_t i = 0;
+  if (num >= 2 * NF) {
+    for (; i <= num - 2 * NF; i += 2 * NF) {
+      VF v10, v11;
+      Decompress2(df, packed1, i, v10, v11);
+      const VF out0 = func(df, v10);
+      const VF out1 = func(df, v11);
+      Compress2(df, out0, out1, packed_out, i);
+    }
+  }
+
+  const size_t remaining = num - i;
+  HWY_DASSERT(remaining < 2 * NF);
+  if (HWY_UNLIKELY(remaining != 0)) {
+    HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
+    HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
+    // Ensure the second vector is zeroed even if remaining <= NF.
+    hn::Store(hn::Zero(df), df, buf1 + NF);
+    DecompressAndZeroPad(df, packed1, i, buf1, remaining);
+    const VF v10 = hn::Load(df, buf1);
+    const VF v11 = hn::Load(df, buf1 + NF);
+    const VF out0 = func(df, v10);
+    const VF out1 = func(df, v11);
+    Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
+    // Clang generates incorrect code for CopyBytes if num = 2.
+    for (size_t j = 0; j < remaining; ++j) {
+      out[i + j] = hwy::ConvertScalarTo<T>(buf_out[j]);
+    }
+  }
+}
+
+// Two inputs. `DF` is the decompressed type, typically `float`.
+template <class DF, typename T, typename T1, typename T2, class Func>
+HWY_INLINE void Decompress2AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
+                                         const T1* HWY_RESTRICT p1,
+                                         const T2* HWY_RESTRICT p2,
+                                         Func&& func) {
+  const auto packed_out = MakeSpan(out, num);
+  const auto packed1 = MakeSpan(p1, num);
+  const auto packed2 = MakeSpan(p2, num);
+
+  using VF = hn::Vec<DF>;
+  HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
+  size_t i = 0;
+  if (num >= 2 * NF) {
+    for (; i <= num - 2 * NF; i += 2 * NF) {
+      VF v10, v11, v20, v21;
+      Decompress2(df, packed1, i, v10, v11);
+      Decompress2(df, packed2, i, v20, v21);
+      const VF out0 = func(df, v10, v20);
+      const VF out1 = func(df, v11, v21);
+      Compress2(df, out0, out1, packed_out, i);
+    }
+  }
+
+  const size_t remaining = num - i;
+  HWY_DASSERT(remaining < 2 * NF);
+  if (HWY_UNLIKELY(remaining != 0)) {
+    HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
+    HWY_ALIGN float buf2[2 * hn::MaxLanes(df)];
+    HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
+    // Ensure the second vector is zeroed even if remaining <= NF.
+    hn::Store(hn::Zero(df), df, buf1 + NF);
+    hn::Store(hn::Zero(df), df, buf2 + NF);
+    DecompressAndZeroPad(df, packed1, i, buf1, remaining);
+    DecompressAndZeroPad(df, packed2, i, buf2, remaining);
+    const VF v10 = hn::Load(df, buf1);
+    const VF v11 = hn::Load(df, buf1 + NF);
+    const VF v20 = hn::Load(df, buf2);
+    const VF v21 = hn::Load(df, buf2 + NF);
+    const VF out0 = func(df, v10, v20);
+    const VF out1 = func(df, v11, v21);
+    Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
+    // Clang generates incorrect code for CopyBytes if num = 2.
+    for (size_t j = 0; j < remaining; ++j) {
+      out[i + j] = hwy::ConvertScalarTo<T>(buf_out[j]);
+    }
+  }
+}
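+
+// Usage sketch (illustrative only; `buf` and the lambda are hypothetical).
+// These helpers accept f32 or BF16 pointers and handle any `num`, including
+// remainders that are not a multiple of the vector length:
+//   const hn::ScalableTag<float> df;
+//   using VF = hn::Vec<decltype(df)>;
+//   BF16 buf[192];
+//   DecompressAndCompressInplace(df, buf, 192, [](auto /*df*/, VF v)
+//                                    HWY_ATTR -> VF { return hn::Abs(v); });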
+
+// Three inputs. `DF` is the decompressed type, typically `float`.
+template <class DF, typename T, typename T1, typename T2, typename T3,
+          class Func>
+HWY_INLINE void Decompress3AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
+                                         const T1* HWY_RESTRICT p1,
+                                         const T2* HWY_RESTRICT p2,
+                                         const T3* HWY_RESTRICT p3,
+                                         Func&& func) {
+  const auto packed_out = MakeSpan(out, num);
+  const auto packed1 = MakeSpan(p1, num);
+  const auto packed2 = MakeSpan(p2, num);
+  const auto packed3 = MakeSpan(p3, num);
+
+  using VF = hn::Vec<DF>;
+  HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
+  size_t i = 0;
+  if (num >= 2 * NF) {
+    for (; i <= num - 2 * NF; i += 2 * NF) {
+      VF v10, v11, v20, v21, v30, v31;
+      Decompress2(df, packed1, i, v10, v11);
+      Decompress2(df, packed2, i, v20, v21);
+      Decompress2(df, packed3, i, v30, v31);
+      const VF out0 = func(df, v10, v20, v30);
+      const VF out1 = func(df, v11, v21, v31);
+      Compress2(df, out0, out1, packed_out, i);
+    }
+  }
+
+  const size_t remaining = num - i;
+  HWY_DASSERT(remaining < 2 * NF);
+  if (HWY_UNLIKELY(remaining != 0)) {
+    HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
+    HWY_ALIGN float buf2[2 * hn::MaxLanes(df)];
+    HWY_ALIGN float buf3[2 * hn::MaxLanes(df)];
+    HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
+    // Ensure the second vector is zeroed even if remaining <= NF.
+    hn::Store(hn::Zero(df), df, buf1 + NF);
+    hn::Store(hn::Zero(df), df, buf2 + NF);
+    hn::Store(hn::Zero(df), df, buf3 + NF);
+    DecompressAndZeroPad(df, packed1, i, buf1, remaining);
+    DecompressAndZeroPad(df, packed2, i, buf2, remaining);
+    DecompressAndZeroPad(df, packed3, i, buf3, remaining);
+    const VF v10 = hn::Load(df, buf1);
+    const VF v11 = hn::Load(df, buf1 + NF);
+    const VF v20 = hn::Load(df, buf2);
+    const VF v21 = hn::Load(df, buf2 + NF);
+    const VF v30 = hn::Load(df, buf3);
+    const VF v31 = hn::Load(df, buf3 + NF);
+    const VF out0 = func(df, v10, v20, v30);
+    const VF out1 = func(df, v11, v21, v31);
+    Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
+    // Clang generates incorrect code for CopyBytes if num = 2.
+    for (size_t j = 0; j < remaining; ++j) {
+      out[i + j] = hwy::ConvertScalarTo<T>(buf_out[j]);
+    }
+  }
+}
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace gcpp
diff --git a/compression/compress_test.cc b/compression/compress_test.cc
index 2270689..5455b1d 100644
--- a/compression/compress_test.cc
+++ b/compression/compress_test.cc
@@ -18,11 +18,10 @@
 #define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
 #endif  // HWY_DISABLED_TARGETS
 
-#include "compression/compress.h"
-
 #include <stddef.h>
 #include <stdint.h>
 
+#include "compression/compress.h"
 #include "compression/distortion.h"
 #include "util/test_util.h"
 #include "hwy/aligned_allocator.h"
@@ -45,7 +44,7 @@ namespace hn = hwy::HWY_NAMESPACE;
 
 // Calls Compress and Decompress2 and verifies the distortion/error.
 template <typename Packed>
-struct TestDecompress2T {
+struct TestDecompress2 {
   template <typename T, class D>
   HWY_INLINE void operator()(T /*unused*/, D d) {
     const size_t N = hn::Lanes(d);
@@ -120,12 +119,12 @@ struct TestDecompress2T {
   }
 };
 
-void TestAllDecompress2() { ForeachPackedAndRawType<TestDecompress2T>(); }
+void TestAllDecompress2() { ForeachPackedAndRawType<TestDecompress2>(); }
 
 // Calls Compress and DecompressAndZeroPad for all short lengths and verifies
 // the distortion/error.
 template <typename Packed>
-struct TestShortLengthsT {
+struct TestShortLengths {
   template <typename T, class D>
   HWY_INLINE void operator()(T /*unused*/, D d) {
     const size_t N = hn::Lanes(d);
@@ -196,7 +195,82 @@ struct TestShortLengthsT {
   }
 };
 
-void TestAllShortLengths() { ForeachPackedAndRawType<TestShortLengthsT>(); }
+void TestAllShortLengths() { ForeachPackedAndRawType<TestShortLengths>(); }
+
+// Verifies the arguments and remainder handling of `DecompressAndCompress*`.
+class TestDecompressAndCompress {
+ public:
+  template <typename T, class D>
+  HWY_INLINE void operator()(T /*unused*/, D d) {
+    ForeachActivationType3<Test>(d);
+  }
+
+ private:
+  struct Test {
+    template <typename T1, typename T2, typename T3, class D>
+    void operator()(T1, T2, T3, D d) {
+      hwy::RandomState rng;
+      using DF = hn::Repartition<float, D>;
+      using VF = hn::Vec<DF>;
+      const DF df;
+
+      for (size_t num = 1; num < 7 * hn::Lanes(d); ++num) {
+        auto p = hwy::AllocateAligned<T1>(num);
+        auto p1 = hwy::AllocateAligned<T2>(num);
+        auto p2 = hwy::AllocateAligned<T3>(num);
+        auto out = hwy::AllocateAligned<T1>(num);
+        auto expected1 = hwy::AllocateAligned<T1>(num);
+        auto expected2 = hwy::AllocateAligned<T1>(num);
+        auto expected3 = hwy::AllocateAligned<T1>(num);
+        HWY_ASSERT(p && p1 && p2 && out && expected1 && expected2 && expected3);
+        // Two bits each, totalling 6 bits which fit in the BF16 mantissa.
+        for (size_t i = 0; i < num; ++i) {
+          const size_t mod = i & 3;
+          p[i] = hwy::ConvertScalarTo<T1>(mod);
+          p1[i] = hwy::ConvertScalarTo<T2>(mod << 2);
+          p2[i] = hwy::ConvertScalarTo<T3>(mod << 4);
+          // For `Decompress1AndCompressInplace` to not overwrite `p`.
+          out[i] = p[i];
+          expected1[i] = hwy::ConvertScalarTo<T1>(mod);
+          expected2[i] = hwy::ConvertScalarTo<T1>((mod << 2) | mod);
+          expected3[i] =
+              hwy::ConvertScalarTo<T1>((mod << 4) | (mod << 2) | mod);
+        }
+
+        DecompressAndCompressInplace(df, p.get(), num,
+                                     [](DF, VF v) HWY_ATTR -> VF { return v; });
+        HWY_ASSERT_ARRAY_EQ(expected1.get(), p.get(), num);
+
+        // Uses `out` so as not to overwrite `p`.
+        Decompress1AndCompressInplace(
+            df, out.get(), num, p1.get(),
+            [](DF, VF v, VF v1) HWY_ATTR -> VF { return hn::Add(v, v1); });
+        HWY_ASSERT_ARRAY_EQ(expected2.get(), out.get(), num);
+
+        Decompress1AndCompressTo(df, out.get(), num, p.get(),
+                                 [](DF, VF v) HWY_ATTR -> VF { return v; });
+        HWY_ASSERT_ARRAY_EQ(expected1.get(), out.get(), num);
+
+        Decompress2AndCompressTo(df, out.get(), num, p.get(), p1.get(),
+                                 [](DF, VF v, VF v1)
+                                     HWY_ATTR -> VF { return hn::Add(v, v1); });
+        HWY_ASSERT_ARRAY_EQ(expected2.get(), out.get(), num);
+
+        Decompress3AndCompressTo(
+            df, out.get(), num, p.get(), p1.get(), p2.get(),
+            [](DF, VF v, VF v1, VF v2)
+                HWY_ATTR -> VF { return hn::Add(hn::Add(v, v1), v2); });
+        HWY_ASSERT_ARRAY_EQ(expected3.get(), out.get(), num);
+      }
+    }
+  };
+};
+
+void TestAllDecompressAndCompress() {
+  // The Highway Test interface (`ForGE128Vectors`) only supports a single type.
+  // We hard-code one here, and use `ForeachActivationType` internally.
+  hn::ForGE128Vectors<TestDecompressAndCompress>()(float());
+}
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
@@ -208,6 +282,7 @@ namespace gcpp {
 HWY_BEFORE_TEST(CompressTest);
 HWY_EXPORT_AND_TEST_P(CompressTest, TestAllDecompress2);
 HWY_EXPORT_AND_TEST_P(CompressTest, TestAllShortLengths);
+HWY_EXPORT_AND_TEST_P(CompressTest, TestAllDecompressAndCompress);
 HWY_AFTER_TEST();
 }  // namespace gcpp
 #endif  // HWY_ONCE
diff --git a/compression/test_util-inl.h b/compression/test_util-inl.h
index e5b1fe0..1c72b32 100644
--- a/compression/test_util-inl.h
+++ b/compression/test_util-inl.h
@@ -67,6 +67,35 @@ void ForeachPackedAndRawType() {
   }
 }
 
+template <class Test, class D>
+void ForeachActivationType1(D d) {
+  Test test;
+  test(float(), d);
+  test(BF16(), d);
+}
+
+template <class Test, class D>
+void ForeachActivationType2(D d) {
+  Test test;
+  test(float(), float(), d);
+  test(float(), BF16(), d);
+  test(BF16(), float(), d);
+  test(BF16(), BF16(), d);
+}
+
+template <class Test, class D>
+void ForeachActivationType3(D d) {
+  Test test;
+  test(float(), float(), float(), d);
+  test(float(), float(), BF16(), d);
+  test(float(), BF16(), float(), d);
+  test(float(), BF16(), BF16(), d);
+  test(BF16(), float(), float(), d);
+  test(BF16(), float(), BF16(), d);
+  test(BF16(), BF16(), float(), d);
+  test(BF16(), BF16(), BF16(), d);
+}
+
 // Generates inputs: deterministic, within max SfpStream range.
 template <typename MatT>
 MatStorageT<MatT> GenerateMat(const Extents2D& extents,
diff --git a/compression/types.h b/compression/types.h
index dc10676..667265a 100644
--- a/compression/types.h
+++ b/compression/types.h
@@ -186,6 +186,11 @@ constexpr bool IsNuqStream() {
   return hwy::IsSame<hwy::RemoveCvRef<Packed>, NuqStream>();
 }
 
+template <typename Packed>
+constexpr bool SupportsPointerArithmetic() {
+  return !IsNuqStream<Packed>();
+}
+
 // Tensor types for loading weights.
 enum class Type { kUnknown, kF32, kBF16, kSFP, kNUQ, kF64 };
 // These are used in `ModelConfig.Specifier`, hence the strings will not
diff --git a/gemma/activations.h b/gemma/activations.h
index 175ddc9..14994d3 100644
--- a/gemma/activations.h
+++ b/gemma/activations.h
@@ -206,9 +206,8 @@ struct Activations {
 
   // Gated FFW
   MatStorageT<float> pre_ffw_rms_out;
-  // Norm may be large, so prefer to keep as f32.
-  MatStorageT<float> C1;
-  MatStorageT<float> C2;
+  MatStorageT<BF16> C1;
+  MatStorageT<BF16> C2;
   MatStorageT<float> ffw_out;
 
   AttentionActivations attention;
diff --git a/gemma/attention.cc b/gemma/attention.cc
index c73abcb..13681d0 100644
--- a/gemma/attention.cc
+++ b/gemma/attention.cc
@@ -144,8 +144,8 @@ void SingleDotSoftmaxWeightedSum(
   // Apply rope and scaling to Q.
   if (layer.query_norm_scale.HasPtr()) {
     CallUpcasted(&layer.query_norm_scale, [&](const auto* weights_t) {
-      RMSNormInplace(weights_t->PackedScale1(), 0, q,
-                     layer.layer_config.qkv_dim, p, worker);
+      RMSNormInplace(weights_t->PackedScale1(), q, layer.layer_config.qkv_dim,
+                     p, worker);
     });
   }
 
@@ -307,7 +307,7 @@ static HWY_INLINE void ComputeQKV(size_t num_tokens, const size_t layer_idx,
   // Apply further processing to K.
   if (layer.key_norm_scale.HasPtr()) {
     CallUpcasted(&layer.key_norm_scale, [&](const auto* weights_t) {
-      RMSNormInplace(weights_t->PackedScale1(), 0, kv_f32, qkv_dim,
+      RMSNormInplace(weights_t->PackedScale1(), kv_f32, qkv_dim,
                      env.ctx.profiler, thread);
     });
   }
diff --git a/gemma/gemma-inl.h b/gemma/gemma-inl.h
index c1ff722..ed0750a 100644
--- a/gemma/gemma-inl.h
+++ b/gemma/gemma-inl.h
@@ -43,14 +43,14 @@ HWY_BEFORE_NAMESPACE();
 namespace gcpp {
 namespace HWY_NAMESPACE {
 
-template <typename T>
-void Activation(ActivationType activation, T* HWY_RESTRICT c1,
-                const T* HWY_RESTRICT c2, const size_t count, hwy::Profiler& p,
+template <typename T1, typename T2>
+void Activation(ActivationType activation, T1* HWY_RESTRICT c1,
+                const T2* HWY_RESTRICT c2, const size_t count, hwy::Profiler& p,
                 const size_t worker) {
   static const auto zone = p.AddZone("Gen.Activation");
   PROFILER_ZONE3(p, worker, zone);
   namespace hn = hwy::HWY_NAMESPACE;
-  using DF = hn::ScalableTag<T>;
+  using DF = hn::ScalableTag<float>;
   using VF = hn::Vec<DF>;
   // ActivationType::Gelu
   if (c2 == nullptr) {  // No multiplier, just Gelu.
@@ -58,9 +58,10 @@ void Activation(ActivationType activation, T* HWY_RESTRICT c1,
     return;
   };
   // Has multiplier, Gelu(c1) * c2.
-  hn::Transform1(DF(), c1, count, c2, [](DF df, VF v, VF mul) HWY_ATTR {
-    return hn::Mul(mul, Gelu(df, v));
-  });
+  Decompress1AndCompressInplace(DF(), c1, count, c2,
+                                [](DF df, VF v1, VF v2) HWY_ATTR -> VF {
+                                  return hn::Mul(v2, Gelu(df, v1));
+                                });
 }
 
 // No C2 multiplier.
@@ -75,10 +76,9 @@ void ActivationBatched(ActivationType activation, Mat& c1,
   });
 }
 
-template <class Mat>
-HWY_NOINLINE void ActivationBatched(ActivationType activation, Mat& c1,
-                                    const Mat* c2, ThreadingContext& ctx) {
-  using T = typename Mat::T;
+template <class Mat1, class Mat2>
+HWY_NOINLINE void ActivationBatched(ActivationType activation, Mat1& c1,
+                                    const Mat2* c2, ThreadingContext& ctx) {
   HWY_DASSERT(c1.SameShape(*c2));
   if (c2 && c2->HasPtr()) {
     SmallParallelFor(c1.Rows(), ctx.pools, [&](uint64_t task, size_t worker) {
@@ -87,8 +87,9 @@ HWY_NOINLINE void ActivationBatched(ActivationType activation, Mat& c1,
     });
   } else {  // No multiplier
     SmallParallelFor(c1.Rows(), ctx.pools, [&](uint64_t task, size_t worker) {
-      Activation(activation, c1.Row(task), static_cast<const T*>(nullptr),
-                 c1.Cols(), ctx.profiler, worker);
+      Activation(activation, c1.Row(task),
+                 static_cast<const typename Mat2::T*>(nullptr), c1.Cols(),
+                 ctx.profiler, worker);
     });
   }
 }
diff --git a/gemma/vit.cc b/gemma/vit.cc
index a694187..96d6d7f 100644
--- a/gemma/vit.cc
+++ b/gemma/vit.cc
@@ -335,7 +335,7 @@ void PrefillVit(const ModelConfig& model_config, const WeightsPtrs& weights,
 
   // Apply soft embedding norm before input projection.
   CallUpcasted(&weights.mm_embed_norm, [&](const auto* weights_t) {
-    RMSNormInplace(weights_t->PackedScale1(), 0, activations.x.Row(0),
+    RMSNormInplace(weights_t->PackedScale1(), activations.x.Row(0),
                    vit_model_dim, env.ctx.profiler, hwy::Profiler::Thread());
   });
 }
diff --git a/ops/dot_test.cc b/ops/dot_test.cc
index a461614..2c0ae3a 100644
--- a/ops/dot_test.cc
+++ b/ops/dot_test.cc
@@ -750,7 +750,7 @@ class DotStats {
   void CheckMuls() const {
     // Comp2 is between Compensated and Kahan.
     ASSERT_INSIDE(kComp2, 1.001, s_muls[kComp2].Mean(), 1.4);
-    ASSERT_INSIDE(kComp2, 1.001f, s_muls[kComp2].Max(), 2.4f);
+    ASSERT_INSIDE(kComp2, 1.001f, s_muls[kComp2].Max(), 6.8f);
     ASSERT_INSIDE(kComp2, 1.0, s_muls[kComp2].GeometricMean(), 1.2);
 
     // Compensated and Double are very accurate.
diff --git a/ops/matmul_test.cc b/ops/matmul_test.cc
index 14913a1..3a8528a 100644
--- a/ops/matmul_test.cc
+++ b/ops/matmul_test.cc
@@ -119,7 +119,7 @@ void AssertClose(const MatPtrT<TA>& A, const MatPtrT<TB>& B,
   const double eps_bf16 = hwy::ConvertScalarTo<double>(hwy::Epsilon<BF16>());
   const double eps_f32 = hwy::ConvertScalarTo<double>(hwy::Epsilon<float>());
   // Dot() uses double-precision summation.
-  double tolerance = 12 * norm * eps_f32;
+  double tolerance = 20 * norm * eps_f32;
   // If B is F32, Dot() promotes F32 or even F64, but MatMul demotes the F32 to
   // BF16, so add extra tolerance.
   if (IsF32<TB>()) {
diff --git a/ops/ops-inl.h b/ops/ops-inl.h
index 8a7224b..caf8041 100644
--- a/ops/ops-inl.h
+++ b/ops/ops-inl.h
@@ -127,12 +127,13 @@ HWY_INLINE hn::Vec<D> Gelu(D d, hn::Vec<D> v) {
 }
 
 // Activation already has a profiler zone.
-static HWY_NOINLINE HWY_MAYBE_UNUSED void Gelu(float* HWY_RESTRICT x,
-                                               size_t size) {
+template <typename T>
+static HWY_NOINLINE HWY_MAYBE_UNUSED void Gelu(T* HWY_RESTRICT x, size_t size) {
   namespace hn = hwy::HWY_NAMESPACE;
-  using D = hn::ScalableTag<float>;
-  hn::Transform(D(), x, size,
-                [](D d, hn::Vec<D> v) HWY_ATTR { return Gelu(d, v); });
+  using DF = hn::ScalableTag<float>;
+  using VF = hn::Vec<DF>;
+  DecompressAndCompressInplace(
+      DF(), x, size, [](DF d, VF v) HWY_ATTR -> VF { return Gelu(d, v); });
 }
 
 template <class D>
@@ -179,13 +180,15 @@ HWY_INLINE hn::Vec<D> Sigmoid(D d, hn::Vec<D> v) {
 }
 
 // Sigmoid using the logistic function 1 / (1 + exp(-x[i]))
-static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(float* HWY_RESTRICT x,
+template <typename T>
+static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(T* HWY_RESTRICT x,
                                                   size_t size) {
   PROFILER_ZONE("ops.Sigmoid");
   namespace hn = hwy::HWY_NAMESPACE;
-  using D = hn::ScalableTag<float>;
-  hn::Transform(D(), x, size,
-                [](D d, hn::Vec<D> v) HWY_ATTR { return Sigmoid(d, v); });
+  using DF = hn::ScalableTag<float>;
+  using VF = hn::Vec<DF>;
+  DecompressAndCompressInplace(
+      DF(), x, size, [](DF d, VF v) HWY_ATTR -> VF { return Sigmoid(d, v); });
 }
 
 namespace detail {
@@ -205,71 +208,53 @@ float RMSNormMul(const VT* HWY_RESTRICT x, const size_t size, hwy::Profiler& p,
 
 }  // namespace detail
 
-// `x_ofs` is the offset within `x`, required for NuqStream.
 template <typename XT, typename WT, typename OT>
 HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm(const XT* HWY_RESTRICT x,
                                            const WT* HWY_RESTRICT weight,
-                                           size_t w_ofs, OT* HWY_RESTRICT out,
+                                           OT* HWY_RESTRICT out,
                                            const size_t size, hwy::Profiler& p,
                                            const size_t worker) {
   static const auto zone = p.AddZone("Ops.RMSNorm");
   PROFILER_ZONE3(p, worker, zone);
   namespace hn = hwy::HWY_NAMESPACE;
 
-  const hn::ScalableTag<float> df;
-  using VF = hn::Vec<decltype(df)>;
-  const size_t NF = hn::Lanes(df);
+  using DF = hn::ScalableTag<float>;
+  using VF = hn::Vec<DF>;
 
-  const VF mul = hn::Set(df, detail::RMSNormMul(x, size, p, worker));
+  const VF mul = hn::Set(DF(), detail::RMSNormMul(x, size, p, worker));
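+  // Captured by pointer because sizeless vector types (SVE/RVV) cannot be
+  // captured by value in lambdas.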
+  const VF* HWY_RESTRICT pmul = &mul;
 
-  const auto packed_x = MakeSpan(x, size);
-  const auto packed_w = MakeSpan(weight, w_ofs + size);
-  const auto packed_out = MakeSpan(out, size);
-
-  HWY_DASSERT(size % (2 * NF) == 0);
-  for (size_t i = 0; i < size; i += 2 * NF) {
-    VF x0, x1, w0, w1;
-    Decompress2(df, packed_x, i, x0, x1);
-    Decompress2(df, packed_w, w_ofs + i, w0, w1);
-    const VF m0 = hn::Mul(mul, x0);
-    const VF m1 = hn::Mul(mul, x1);
-    // (1+weight) * m = m + weight*m = one FMA.
-    const VF out0 = hn::MulAdd(m0, w0, m0);
-    const VF out1 = hn::MulAdd(m1, w1, m1);
-    Compress2(df, out0, out1, packed_out, i);
-  }
+  Decompress2AndCompressTo(DF(), out, size, x, weight,
+                           [pmul](DF /*df*/, VF vx, VF vw) HWY_ATTR -> VF {
+                             const VF m = hn::Mul(*pmul, vx);
+                             // (1+weight) * m = m + weight*m = one FMA.
+                             return hn::MulAdd(m, vw, m);
+                           });
 }
 
 // Same as RMSNorm, but its HWY_RESTRICT forbids passing the same pointer.
 template <typename WT, typename XT>
-HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace(
-    const WT* HWY_RESTRICT weight, size_t w_ofs, XT* HWY_RESTRICT inout,
-    const size_t size, hwy::Profiler& p, const size_t worker) {
+HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace(const WT* HWY_RESTRICT weight,
+                                                  XT* HWY_RESTRICT inout,
+                                                  const size_t size,
+                                                  hwy::Profiler& p,
+                                                  const size_t worker) {
   static const auto zone = p.AddZone("Ops.RMSNormInplace");
   PROFILER_ZONE3(p, worker, zone);
   namespace hn = hwy::HWY_NAMESPACE;
 
-  const hn::ScalableTag<float> df;
-  using VF = hn::Vec<decltype(df)>;
-  const size_t NF = hn::Lanes(df);
+  using DF = hn::ScalableTag<float>;
+  using VF = hn::Vec<DF>;
 
-  const VF mul = hn::Set(df, detail::RMSNormMul(inout, size, p, worker));
+  const VF mul = hn::Set(DF(), detail::RMSNormMul(inout, size, p, worker));
+  const VF* HWY_RESTRICT pmul = &mul;
 
-  const auto packed_w = MakeSpan(weight, w_ofs + size);
-  const auto packed_x = MakeSpan(inout, size);
-
-  HWY_DASSERT(size % (2 * NF) == 0);
-  for (size_t i = 0; i < size; i += 2 * NF) {
-    VF x0, x1, w0, w1;
-    Decompress2(df, packed_x, i, x0, x1);
-    Decompress2(df, packed_w, w_ofs + i, w0, w1);
-    const VF m0 = hn::Mul(mul, x0);
-    const VF m1 = hn::Mul(mul, x1);
-    // (1+weight) * m = m + weight*m = one FMA.
-    const VF out0 = hn::MulAdd(m0, w0, m0);
-    const VF out1 = hn::MulAdd(m1, w1, m1);
-    Compress2(df, out0, out1, packed_x, i);
-  }
+  Decompress1AndCompressInplace(DF(), inout, size, weight,
+                                [pmul](DF /*df*/, VF vx, VF vw) HWY_ATTR -> VF {
+                                  const VF m = hn::Mul(*pmul, vx);
+                                  // (1+weight) * m = m + weight*m = one FMA.
+                                  return hn::MulAdd(m, vw, m);
+                                });
 }
 
 // Computes mean mu and mean of squares mu2 of a vector. Used in LayerNorm.
@@ -301,9 +286,9 @@ HWY_NOINLINE void LayerNorm(const XT* x, const WT* HWY_RESTRICT scale,
   PROFILER_ZONE("ops.LayerNorm");
   namespace hn = hwy::HWY_NAMESPACE;
 
-  const hn::ScalableTag<float> df;
-  using VF = hn::Vec<decltype(df)>;
-  const size_t NF = hn::Lanes(df);
+  using DF = hn::ScalableTag<float>;
+  const DF df;
+  using VF = hn::Vec<DF>;
 
   double mu, mu2;
   ComputeMoments(x, size, mu, mu2);
@@ -315,56 +300,13 @@ HWY_NOINLINE void LayerNorm(const XT* x, const WT* HWY_RESTRICT scale,
   const VF* HWY_RESTRICT pmu = &vmu;
   const VF* HWY_RESTRICT pvar = &vvar;
 
-  const auto packed_x = MakeSpan(x, size);
-  const auto packed_scale = MakeSpan(scale, size);
-  const auto packed_bias = MakeSpan(bias, size);
-  const auto packed_out = MakeSpan(out, size);
-
-  // Loop body for one vector, called from main loop and remainder loop.
-  const auto norm = [pmu, pvar](VF x, VF s, VF add) HWY_ATTR -> VF {
-    const VF centered = hn::Sub(x, *pmu);
-    const VF mul = hn::Mul(s, *pvar);
-    return hn::MulAdd(centered, mul, add);
-  };
-
-  size_t i = 0;
-  if (size >= 2 * NF) {
-    for (; i <= size - 2 * NF; i += 2 * NF) {
-      VF x0, x1, s0, s1, add0, add1;
-      Decompress2(df, packed_x, i, x0, x1);
-      Decompress2(df, packed_scale, i, s0, s1);
-      Decompress2(df, packed_bias, i, add0, add1);
-      const VF n0 = norm(x0, s0, add0);
-      const VF n1 = norm(x1, s1, add1);
-      Compress2(df, n0, n1, packed_out, i);
-    }
-  }
-
-  const size_t remaining = size - i;
-  HWY_DASSERT(remaining < 2 * NF);
-  if (HWY_UNLIKELY(remaining != 0)) {
-    HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
-    HWY_ALIGN float buf_scale[2 * hn::MaxLanes(df)];
-    HWY_ALIGN float buf_bias[2 * hn::MaxLanes(df)];
-    // Ensure the second vectors are zeroed even if remaining <= NF.
-    hn::Store(hn::Zero(df), df, buf_x + NF);
-    hn::Store(hn::Zero(df), df, buf_scale + NF);
-    hn::Store(hn::Zero(df), df, buf_bias + NF);
-    HWY_ALIGN OT buf_out[2 * hn::MaxLanes(df)];
-    DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
-    DecompressAndZeroPad(df, packed_scale, i, buf_scale, remaining);
-    DecompressAndZeroPad(df, packed_bias, i, buf_bias, remaining);
-    const VF x0 = hn::Load(df, buf_x);
-    const VF x1 = hn::Load(df, buf_x + NF);
-    const VF s0 = hn::Load(df, buf_scale);
-    const VF s1 = hn::Load(df, buf_scale + NF);
-    const VF add0 = hn::Load(df, buf_bias);
-    const VF add1 = hn::Load(df, buf_bias + NF);
-    const VF n0 = norm(x0, s0, add0);
-    const VF n1 = norm(x1, s1, add1);
-    Compress2(df, n0, n1, MakeSpan(buf_out, 2 * NF), 0);
-    hwy::CopyBytes(buf_out, out + i, remaining * sizeof(OT));
-  }
+  Decompress3AndCompressTo(DF(), out, size, x, scale, bias,
+                           [pmu, pvar](DF /*df*/, VF x, VF s, VF add)
+                               HWY_ATTR -> VF {
+                                 const VF centered = hn::Sub(x, *pmu);
+                                 const VF mul = hn::Mul(s, *pvar);
+                                 return hn::MulAdd(centered, mul, add);
+                               });
 }
 
 static HWY_NOINLINE HWY_MAYBE_UNUSED void AddAbsolutePositionalEmbeddings(
@@ -541,40 +483,11 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void AddFrom(const XT* HWY_RESTRICT x,
   PROFILER_ZONE3(p, worker, zone);
   namespace hn = hwy::HWY_NAMESPACE;
 
-  const hn::ScalableTag<float> df;
-  const size_t NF = hn::Lanes(df);
-  using VF = hn::Vec<decltype(df)>;
-
-  const auto packed_x = MakeSpan(x, size);
-
-  size_t i = 0;
-  if (size >= 2 * NF) {
-    for (; i <= size - 2 * NF; i += 2 * NF) {
-      VF x0, x1;
-      Decompress2(df, packed_x, i, x0, x1);
-      VF out0 = hn::Load(df, out + i);
-      VF out1 = hn::Load(df, out + i + NF);
-      hn::Store(hn::Add(x0, out0), df, out + i);
-      hn::Store(hn::Add(x1, out1), df, out + i + NF);
-    }
-  }
-
-  const size_t remaining = size - i;
-  const size_t remaining1 = remaining - HWY_MIN(remaining, NF);
-  HWY_DASSERT(remaining < 2 * NF);
-  HWY_DASSERT(remaining1 < NF);
-  if (HWY_UNLIKELY(remaining != 0)) {
-    HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
-    // Ensure the second vector is zeroed even if remaining <= NF.
-    hn::Store(hn::Zero(df), df, buf_x + NF);
-    DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
-    const VF x0 = hn::Load(df, buf_x);
-    const VF x1 = hn::Load(df, buf_x + NF);
-    const VF out0 = hn::LoadN(df, out + i, remaining);
-    const VF out1 = hn::LoadN(df, out + i + NF, remaining1);
-    hn::StoreN(hn::Add(x0, out0), df, out + i, remaining);
-    hn::StoreN(hn::Add(x1, out1), df, out + i + NF, remaining1);
-  }
+  using DF = hn::ScalableTag<float>;
+  using VF = hn::Vec<DF>;
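+  // Decompresses both x and out to f32 lanes, adds, and re-compresses the
+  // sum into out (a no-op when out is f32).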
+  Decompress1AndCompressInplace(DF(), out, size, x,
+                                [&](DF /*df*/, VF out, VF x)
+                                    HWY_ATTR -> VF { return hn::Add(x, out); });
 }
 
 // Simple loops unless/until batch sizes are large enough to parallelize.
@@ -588,7 +501,7 @@ void RMSNormBatched(const MatPtrT<XT>& activations, const MatPtr& weights,
   CallUpcasted(&weights, [&](const auto* weights_t) {
     SmallParallelFor(
         activations.Rows(), ctx.pools, [&](uint64_t token_idx, size_t worker) {
-          RMSNorm(activations.Row(token_idx), weights_t->PackedScale1(), 0,
+          RMSNorm(activations.Row(token_idx), weights_t->PackedScale1(),
                   out.Row(token_idx), activations.Cols(), ctx.profiler, worker);
         });
   });
@@ -603,7 +516,7 @@ void RMSNormInplaceBatched(const MatPtr& weights, MatPtrT<XT>& inout,
   CallUpcasted(&weights, [&](const auto* weights_t) {
     SmallParallelFor(
         inout.Rows(), ctx.pools, [&](uint64_t token_idx, size_t worker) {
-          RMSNormInplace(weights_t->PackedScale1(), 0, inout.Row(token_idx),
+          RMSNormInplace(weights_t->PackedScale1(), inout.Row(token_idx),
                          inout.Cols(), ctx.profiler, worker);
         });
   });
@@ -645,38 +558,15 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConst(const float c, XT* HWY_RESTRICT x,
   static const auto zone = p.AddZone("Ops.MulByConst");
   PROFILER_ZONE3(p, worker, zone);
   namespace hn = hwy::HWY_NAMESPACE;
-  const hn::ScalableTag<float> df;
-  const size_t NF = hn::Lanes(df);
-  using VF = hn::Vec<decltype(df)>;
+  using DF = hn::ScalableTag<float>;
+  using VF = hn::Vec<DF>;
 
-  const VF v_c = hn::Set(df, c);
-  const auto packed_x = MakeSpan(x, size);
+  const VF vc = hn::Set(DF(), c);
+  const VF* HWY_RESTRICT pc = &vc;
 
-  size_t i = 0;
-  if (size >= 2 * NF) {
-    for (; i <= size - 2 * NF; i += 2 * NF) {
-      VF x0, x1;
-      Decompress2(df, packed_x, i, x0, x1);
-      x0 = hn::Mul(x0, v_c);
-      x1 = hn::Mul(x1, v_c);
-      Compress2(df, x0, x1, packed_x, i);
-    }
-  }
-
-  const size_t remaining = size - i;
-  HWY_DASSERT(remaining < 2 * NF);
-  if (HWY_UNLIKELY(remaining != 0)) {
-    HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
-    // Ensure the second vector is zeroed even if remaining <= NF.
-    hn::Store(hn::Zero(df), df, buf_x + NF);
-    DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
-    VF x0 = hn::Load(df, buf_x);
-    VF x1 = hn::Load(df, buf_x + NF);
-    x0 = hn::Mul(x0, v_c);
-    x1 = hn::Mul(x1, v_c);
-    Compress2(df, x0, x1, MakeSpan(buf_x, 2 * NF), 0);
-    hwy::CopyBytes(buf_x, x + i, remaining * sizeof(XT));
-  }
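+  // In-place x[i] *= c; x may be f32 or a compressed type such as BF16.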
+  DecompressAndCompressInplace(DF(), x, size,
+                               [pc](DF /*df*/, VF x)
+                                   HWY_ATTR -> VF { return hn::Mul(x, *pc); });
 }
 
 // Same as above, but with a separate output. Same as below without the add.
@@ -687,42 +577,18 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstTo(
   static const auto zone = p.AddZone("Ops.MulByConstTo");
   PROFILER_ZONE3(p, worker, zone);
   namespace hn = hwy::HWY_NAMESPACE;
-  const hn::ScalableTag<float> df;
-  const size_t NF = hn::Lanes(df);
-  using VF = hn::Vec<decltype(df)>;
+  using DF = hn::ScalableTag<float>;
+  using VF = hn::Vec<DF>;
 
-  const VF v_c = hn::Set(df, c);
-  const auto packed_x = MakeSpan(x, size);
-  const auto packed_out = MakeSpan(out, size);
+  const VF vc = hn::Set(DF(), c);
+  const VF* HWY_RESTRICT pc = &vc;
 
-  size_t i = 0;
-  if (size >= 2 * NF) {
-    for (; i <= size - 2 * NF; i += 2 * NF) {
-      VF x0, x1;
-      Decompress2(df, packed_x, i, x0, x1);
-      const VF out0 = hn::Mul(x0, v_c);
-      const VF out1 = hn::Mul(x1, v_c);
-      Compress2(df, out0, out1, packed_out, i);
-    }
-  }
-
-  const size_t remaining = size - i;
-  HWY_DASSERT(remaining < 2 * NF);
-  if (HWY_UNLIKELY(remaining != 0)) {
-    HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
-    HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
-    // Ensure the second vector is zeroed even if remaining <= NF.
-    hn::Store(hn::Zero(df), df, buf_x + NF);
-    DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
-    const VF x0 = hn::Load(df, buf_x);
-    const VF x1 = hn::Load(df, buf_x + NF);
-    const VF out0 = hn::Mul(x0, v_c);
-    const VF out1 = hn::Mul(x1, v_c);
-    Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
-    hwy::CopyBytes(buf_out, out + i, remaining * sizeof(OT));
-  }
+  Decompress1AndCompressTo(DF(), out, size, x,
+                           [pc](DF /*df*/, VF x)
+                               HWY_ATTR -> VF { return hn::Mul(x, *pc); });
 }
 
+// out[i] += x[i] * c.
 template <typename XT, typename OT>
 HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAdd(
     const float c, const XT* HWY_RESTRICT x, OT* HWY_RESTRICT out,
@@ -730,45 +596,16 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAdd(
   static const auto zone = p.AddZone("Ops.MulByConstAndAdd");
   PROFILER_ZONE3(p, worker, zone);
   namespace hn = hwy::HWY_NAMESPACE;
-  const hn::ScalableTag<float> df;
-  const size_t NF = hn::Lanes(df);
-  using VF = hn::Vec<decltype(df)>;
+  using DF = hn::ScalableTag<float>;
+  using VF = hn::Vec<DF>;
 
-  const VF v_c = hn::Set(df, c);
-  const auto packed_x = MakeSpan(x, size);
-  const auto packed_out = MakeSpan(out, size);
+  const VF vc = hn::Set(DF(), c);
+  const VF* HWY_RESTRICT pc = &vc;
 
-  size_t i = 0;
-  if (size >= 2 * NF) {
-    for (; i <= size - 2 * NF; i += 2 * NF) {
-      VF x0, x1, out0, out1;
-      Decompress2(df, packed_x, i, x0, x1);
-      Decompress2(df, packed_out, i, out0, out1);
-      out0 = hn::MulAdd(x0, v_c, out0);
-      out1 = hn::MulAdd(x1, v_c, out1);
-      Compress2(df, out0, out1, packed_out, i);
-    }
-  }
-
-  const size_t remaining = size - i;
-  HWY_DASSERT(remaining < 2 * NF);
-  if (HWY_UNLIKELY(remaining != 0)) {
-    HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
-    HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
-    // Ensure the second vectors are zeroed even if remaining <= NF.
-    hn::Store(hn::Zero(df), df, buf_x + NF);
-    hn::Store(hn::Zero(df), df, buf_out + NF);
-    DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
-    DecompressAndZeroPad(df, packed_out, i, buf_out, remaining);
-    const VF x0 = hn::Load(df, buf_x);
-    const VF x1 = hn::Load(df, buf_x + NF);
-    VF out0 = hn::Load(df, buf_out);
-    VF out1 = hn::Load(df, buf_out + NF);
-    out0 = hn::MulAdd(x0, v_c, out0);
-    out1 = hn::MulAdd(x1, v_c, out1);
-    Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
-    hwy::CopyBytes(buf_out, out + i, remaining * sizeof(OT));
-  }
+  Decompress1AndCompressInplace(DF(), out, size, x,
+                                [&](DF /*df*/, VF out, VF x) HWY_ATTR -> VF {
+                                  return hn::MulAdd(x, *pc, out);
+                                });
 }
 
 // See below for a specialized version for top-1 sampling.
@@ -913,15 +750,18 @@ static HWY_NOINLINE void LogitsSoftCap(const float cap, float* HWY_RESTRICT x,
   PROFILER_ZONE3(p, worker, zone);
   namespace hn = hwy::HWY_NAMESPACE;
 
-  using D = hn::ScalableTag<float>;
-  using V = hn::Vec<D>;
+  using DF = hn::ScalableTag<float>;
+  using VF = hn::Vec<DF>;
 
-  const float inv_cap = 1.0f / cap;
+  const VF vcap = hn::Set(DF(), cap);
+  const VF vinv_cap = hn::Set(DF(), 1.0f / cap);
+  const VF* HWY_RESTRICT pcap = &vcap;
+  const VF* HWY_RESTRICT pinv_cap = &vinv_cap;
 
-  hn::Transform(D(), x, size, [cap, inv_cap](D d, V v) HWY_ATTR {
-    return hn::Mul(hn::Set(d, cap),
-                   hn::Tanh(d, hn::Mul(v, hn::Set(d, inv_cap))));
-  });
+  DecompressAndCompressInplace(
+      DF(), x, size, [pcap, pinv_cap](DF d, VF v) HWY_ATTR -> VF {
+        return hn::Mul(*pcap, hn::Tanh(d, hn::Mul(v, *pinv_cap)));
+      });
 }
 
 // Calls LogitsSoftCap if cap != 0.0f.
diff --git a/ops/ops_test.cc b/ops/ops_test.cc
index e935ccf..7e63482 100644
--- a/ops/ops_test.cc
+++ b/ops/ops_test.cc
@@ -47,6 +47,7 @@
 #include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 // After highway.h
+#include "compression/test_util-inl.h"
 #include "ops/ops-inl.h"
 #include "hwy/tests/test_util-inl.h"
 
@@ -83,48 +84,6 @@ T Random(hwy::RandomState& rng) {
       HWY_MAX(hwy::ConvertScalarTo<T>(hwy::LowestValue<T>()), val));
 }
 
-HWY_NOINLINE void SimpleAddFrom(const float* HWY_RESTRICT other,
-                                float* HWY_RESTRICT x, size_t size) {
-  for (size_t i = 0; i < size; ++i) {
-    x[i] += other[i];
-  }
-}
-
-HWY_NOINLINE void SimpleMulBy(const float* HWY_RESTRICT other,
-                              float* HWY_RESTRICT x, size_t size) {
-  for (size_t i = 0; i < size; ++i) {
-    x[i] *= other[i];
-  }
-}
-
-HWY_NOINLINE void SimpleMulByConst(float c, float* HWY_RESTRICT x,
-                                   size_t size) {
-  for (size_t i = 0; i < size; ++i) {
-    x[i] *= c;
-  }
-}
-
-HWY_NOINLINE void SimpleMulByConstAndAdd(float c, const float* HWY_RESTRICT x,
-                                         float* HWY_RESTRICT out, size_t size) {
-  for (size_t i = 0; i < size; ++i) {
-    out[i] += x[i] * c;
-  }
-}
-
-HWY_NOINLINE void SimpleSoftmax(float* HWY_RESTRICT x, size_t size) {
-  HWY_DASSERT(size != 0);
-  float sum = 0.0;
-  const float maxval = *std::max_element(x, x + size);
-  for (size_t i = 0; i < size; ++i) {
-    x[i] = std::exp(x[i] - maxval);
-    sum += x[i];
-  }
-  const float scale = 1.0f / sum;
-  for (size_t i = 0; i < size; ++i) {
-    x[i] *= scale;
-  }
-}
-
 template <size_t k>
 HWY_NOINLINE std::discrete_distribution<int> SourceCreateDistribution(
     std::array<float, k>& top_k, float temperature) {
@@ -141,7 +100,8 @@ HWY_NOINLINE std::discrete_distribution<int> SourceCreateDistribution(
   return std::discrete_distribution<int>(std::begin(top_k), std::end(top_k));
}
 
-struct TestAddFrom {
+class TestAddFrom {
+ public:
   template <class D>
   void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                   hwy::RandomState& rng) {
@@ -171,9 +131,24 @@ struct TestAddFrom {
     hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET),
                             __FILE__, __LINE__);
   }
+
+ private:
+  template <typename T1, typename T2>
+  static HWY_NOINLINE void SimpleAddFrom(const T1* HWY_RESTRICT other,
+                                         T2* HWY_RESTRICT x, size_t size) {
+    for (size_t i = 0; i < size; ++i) {
+      x[i] = hwy::ConvertScalarTo<T2>(hwy::ConvertScalarTo<float>(x[i]) +
+                                      hwy::ConvertScalarTo<float>(other[i]));
+    }
+  }
 };
 
-struct TestMulByConstAndAdd {
+void TestAllAddFrom() {
+  hn::ForPartialVectors<ForeachCountAndMisalign<TestAddFrom>>()(float());
+}
+
+class TestMulByConstAndAdd {
+ public:
   template <class D>
   void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                   hwy::RandomState& rng) {
@@ -204,9 +179,27 @@ struct TestMulByConstAndAdd {
     hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET),
                             __FILE__, __LINE__);
   }
+
+ private:
+  template <typename T1, typename T2>
+  static HWY_NOINLINE void SimpleMulByConstAndAdd(float c,
+                                                  const T1* HWY_RESTRICT x,
+                                                  T2* HWY_RESTRICT out,
+                                                  size_t size) {
+    for (size_t i = 0; i < size; ++i) {
+      out[i] = hwy::ConvertScalarTo<T2>(hwy::ConvertScalarTo<float>(out[i]) +
+                                        hwy::ConvertScalarTo<float>(x[i]) * c);
+    }
+  }
 };
 
-struct TestMulByConst {
+void TestAllMulByConstAndAdd() {
+  hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConstAndAdd>>()(
+      float());
+}
+
+class TestMulByConst {
+ public:
   template <class D>
   void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                   hwy::RandomState& rng) {
@@ -234,9 +227,61 @@ struct TestMulByConst {
     hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET),
                             __FILE__, __LINE__);
   }
+
+ private:
+  template <typename T1>
+  static HWY_NOINLINE void SimpleMulByConst(float c, T1* HWY_RESTRICT x,
+                                            size_t size) {
+    for (size_t i = 0; i < size; ++i) {
+      x[i] = hwy::ConvertScalarTo<T1>(hwy::ConvertScalarTo<float>(x[i]) * c);
+    }
+  }
 };
 
-struct TestSoftmax {
+void TestAllMulByConst() {
+  hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConst>>()(float());
+}
+
+struct TestMulByConstTo {
+  template <class D>
+  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
+                  hwy::RandomState& rng) {
+    if (misalign_b == 0) return;
+    using T = hn::TFromD<D>;
+
+    hwy::AlignedFreeUniquePtr<T[]> px =
+        hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    hwy::AlignedFreeUniquePtr<T[]> pe =
+        hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    hwy::AlignedFreeUniquePtr<T[]> pactual =
+        hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
+    HWY_ASSERT(px && pe && pactual);
+
+    T* x = px.get() + misalign_a;
+    T* e = pe.get() + misalign_a;
+    T* actual = pactual.get() + misalign_a;
+
+    T constant = Random<T>(rng);
+    for (size_t i = 0; i < count; ++i) {
+      x[i] = Random<T>(rng);
+      e[i] = hwy::ConvertScalarTo<T>(hwy::ConvertScalarTo<float>(x[i]) *
+                                     hwy::ConvertScalarTo<float>(constant));
+    }
+
+    MulByConstTo(constant, x, actual, count, hwy::Profiler::Get(),
+                 /*worker=*/0);
+
+    hwy::AssertArraySimilar(e, actual, count, hwy::TargetName(HWY_TARGET),
+                            __FILE__, __LINE__);
+  }
+};
+
+void TestAllMulByConstTo() {
+  hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConstTo>>()(float());
+}
+
+class TestSoftmax {
+ public:
   template <class D>
   void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                   hwy::RandomState& rng) {
@@ -270,8 +315,27 @@ struct TestSoftmax {
     }
     ASSERT_NEAR(sum, 1.0, 1e-6);
   }
+
+ private:
+  static HWY_NOINLINE void SimpleSoftmax(float* HWY_RESTRICT x, size_t size) {
+    HWY_DASSERT(size != 0);
+    float sum = 0.0;
+    const float maxval = *std::max_element(x, x + size);
+    for (size_t i = 0; i < size; ++i) {
+      x[i] = std::exp(x[i] - maxval);
+      sum += x[i];
+    }
+    const float scale = 1.0f / sum;
+    for (size_t i = 0; i < size; ++i) {
+      x[i] *= scale;
+    }
+  }
 };
 
+void TestAllSoftmax() {
+  hn::ForPartialVectors<ForeachCountAndMisalign<TestSoftmax>>()(float());
+}
+
 template <size_t k>
 struct TestCreateDistribution {
   void operator()(hwy::RandomState& rng) {
@@ -291,43 +355,60 @@ struct TestCreateDistribution {
   }
 };
 
-void TestAllAddFrom() {
-  hn::ForPartialVectors<ForeachCountAndMisalign<TestAddFrom>>()(float());
-}
-
-void TestAllMulByConst() {
-  hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConst>>()(float());
-}
-
-void TestAllMulByConstAndAdd() {
-  hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConstAndAdd>>()(
-      float());
-}
-
-void TestAllSoftmax() {
-  hn::ForPartialVectors<ForeachCountAndMisalign<TestSoftmax>>()(float());
-}
-
 void TestAllCreateDistribution() {
   TestCreateDistribution<2048>();
   TestCreateDistribution<5000>();
 }
 
-void TestSigmoid() {
-  std::vector<float> values;
-  for (int i = -150; i <= 150; ++i) {
-    values.push_back(.1f * i);
-  }
-  std::vector<float> result = values;
-  Sigmoid(result.data(), result.size());
+struct TestSigmoid {
+  template <typename T, class D>
+  void operator()(T, D) const {
+    std::vector<T> values;
+    for (int i = -150; i <= 150; ++i) {
+      values.push_back(hwy::ConvertScalarTo<T>(.1f * i));
+    }
+    std::vector<T> result = values;
+    Sigmoid(result.data(), result.size());
 
-  for (size_t i = 0; i < values.size(); i++) {
-    const float max_error = 0.00007;
-    float value = values[i];
-    float approx = result[i];
-    float expected = (1 / (1 + std::exp(-values[i])));
-    EXPECT_NEAR(approx, expected, max_error) << "Input: " << value;
+    for (size_t i = 0; i < values.size(); i++) {
+      const float max_error = IsBF16<T>() ? 0.2f : 0.00007f;
+      const float value = hwy::ConvertScalarTo<float>(values[i]);
+      const float actual = hwy::ConvertScalarTo<float>(result[i]);
+      const float expected = (1 / (1 + std::exp(-value)));
+      EXPECT_NEAR(expected, actual, max_error)
+          << (IsBF16<T>() ? "bf16" : "float");
+    }
   }
+};
+
+static HWY_NOINLINE void TestAllSigmoid() {
+  ForeachActivationType1<TestSigmoid>(hn::ScalableTag<float>());
+}
+
+struct TestGelu {
+  template <typename T, class D>
+  void operator()(T, D) const {
+    std::vector<T> values;
+    for (int i = -150; i <= 150; ++i) {
+      values.push_back(hwy::ConvertScalarTo<T>(.1f * i));
+    }
+    std::vector<T> result = values;
+    Gelu(result.data(), result.size());
+
+    for (size_t i = 0; i < values.size(); i++) {
+      const float max_error = IsBF16<T>() ? 0.2f : 0.00007f;
+      const float x = hwy::ConvertScalarTo<float>(values[i]);
+      const float actual = hwy::ConvertScalarTo<float>(result[i]);
+      const float expected =
+          x * (0.5f + 0.5f * tanh(x * (0.79788f + 0.035677f * x * x)));
+      EXPECT_NEAR(expected, actual, max_error)
+          << (IsBF16<T>() ? "bf16" : "float");
+    }
+  }
+};
+
+static HWY_NOINLINE void TestAllGelu() {
+  ForeachActivationType1<TestGelu>(hn::ScalableTag<float>());
 }
 
 static HWY_NOINLINE HWY_MAYBE_UNUSED void ScalarRopeAndMulBy(
@@ -421,7 +502,8 @@ void TestRopeAndMulBy() {
 }
 
 template <typename T>
-HWY_NOINLINE float ScalarSquaredL2(const T* HWY_RESTRICT a, size_t size) {
+static HWY_NOINLINE float ScalarSquaredL2(const T* HWY_RESTRICT a,
+                                          size_t size) {
   double sum = 0.0;
   for (size_t i = 0; i < size; ++i) {
     const float f = hwy::ConvertScalarTo<float>(a[i]);
@@ -431,9 +513,11 @@ HWY_NOINLINE float ScalarSquaredL2(const T* HWY_RESTRICT a, size_t size) {
 }
 
 // Supports bf16 and f32 inputs/outputs, which can be in-place.
+// Shared between TestRMSNorm and TestRMSNormInplace.
 template <typename XT, typename WT, typename OT>
-HWY_NOINLINE void ScalarRMSNorm(const XT* x, const WT* HWY_RESTRICT weight,
-                                OT* out, size_t size) {
+static HWY_NOINLINE void ScalarRMSNorm(const XT* x,
+                                       const WT* HWY_RESTRICT weight, OT* out,
+                                       size_t size) {
   constexpr float kEps = 1e-6f;
   float ss = ScalarSquaredL2(x, size);
   ss = 1.0f / sqrtf(ss / StaticCast<float>(size) + kEps);
@@ -445,42 +529,73 @@ HWY_NOINLINE void ScalarRMSNorm(const XT* x, const WT* HWY_RESTRICT weight,
   }
 }
 
-template <typename XT, typename WT, typename OT>
-void TestRMSNorm(hwy::RandomState& rng) {
-  constexpr size_t kSize = 128;
-  HWY_ALIGN XT vec[kSize];
-  HWY_ALIGN WT weight[kSize];
-  HWY_ALIGN OT expected[kSize];
-  HWY_ALIGN OT actual[kSize];
+struct TestRMSNorm {
+  template <typename XT, typename WT, typename OT, class D>
+  void operator()(XT, WT, OT, D) const {
+    hwy::RandomState rng;
 
-  for (size_t i = 0; i < kSize; ++i) {
-    vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
-    weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
-  }
+    constexpr size_t kSize = 128;
+    HWY_ALIGN XT vec[kSize];
+    HWY_ALIGN WT weight[kSize];
+    HWY_ALIGN OT expected[kSize];
+    HWY_ALIGN OT actual[kSize];
 
-  ScalarRMSNorm(vec, weight, expected, kSize);
-  RMSNorm(vec, weight, 0, actual, kSize, hwy::Profiler::Get(), /*worker=*/0);
+    for (size_t i = 0; i < kSize; ++i) {
+      vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
+      weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
+    }
 
-  for (size_t i = 0; i < kSize; i++) {
-    const float e = hwy::ConvertScalarTo<float>(expected[i]);
-    const float a = hwy::ConvertScalarTo<float>(actual[i]);
-    if (!IsNear(e, a, 1e-5f)) {
-      HWY_ABORT("RMSNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
-                TypeName<WT>(), TypeName<OT>(), i, e, a);
+    ScalarRMSNorm(vec, weight, expected, kSize);
+    RMSNorm(vec, weight, actual, kSize, hwy::Profiler::Get(), /*worker=*/0);
+
+    for (size_t i = 0; i < kSize; i++) {
+      const float e = hwy::ConvertScalarTo<float>(expected[i]);
+      const float a = hwy::ConvertScalarTo<float>(actual[i]);
+      if (!IsNear(e, a, 1e-5f)) {
+        HWY_ABORT("RMSNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
+                  TypeName<WT>(), TypeName<OT>(), i, e, a);
+      }
     }
   }
-}
+};
 
 void TestAllRMSNorm() {
-  hwy::RandomState rng;
-  TestRMSNorm<float, float, float>(rng);
-  TestRMSNorm<float, float, BF16>(rng);
-  TestRMSNorm<float, BF16, float>(rng);
-  TestRMSNorm<float, BF16, BF16>(rng);
-  TestRMSNorm<BF16, float, float>(rng);
-  TestRMSNorm<BF16, float, BF16>(rng);
-  TestRMSNorm<BF16, BF16, float>(rng);
-  TestRMSNorm<BF16, BF16, BF16>(rng);
+  ForeachActivationType3<TestRMSNorm>(hn::ScalableTag<float>());
+}
+
+struct TestRMSNormInplace {
+  template <typename XT, typename WT, class D>
+  void operator()(XT, WT, D) const {
+    hwy::RandomState rng;
+
+    constexpr size_t kSize = 128;
+    HWY_ALIGN XT expected[kSize];
+    HWY_ALIGN XT actual[kSize];
+    HWY_ALIGN WT weight[kSize];
+
+    for (size_t i = 0; i < kSize; ++i) {
+      expected[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
+      actual[i] = expected[i];
+      weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
+    }
+
+    ScalarRMSNorm(expected, weight, expected, kSize);
+    RMSNormInplace(weight, actual, kSize, hwy::Profiler::Get(),
+                   /*worker=*/0);
+
+    for (size_t i = 0; i < kSize; i++) {
+      const float e = hwy::ConvertScalarTo<float>(expected[i]);
+      const float a = hwy::ConvertScalarTo<float>(actual[i]);
+      if (!IsNear(e, a, 1e-5f)) {
+        HWY_ABORT("RMSNormInplace %s %s mismatch at %zu: %E %E\n",
                  TypeName<WT>(), TypeName<XT>(), i, e, a);
+      }
+    }
+  }
+};
+
+void TestAllRMSNormInplace() {
+  ForeachActivationType2<TestRMSNormInplace>(hn::ScalableTag<float>());
 }
 
 void TestLayerNormSimple() {
@@ -497,91 +612,92 @@ void TestLayerNormSimple() {
   for (size_t i = 0; i < kSize; i++) {
     const float max_error = 1e-6f;
-    float value = values[i];
     float res = result[i];
     // out = (x - 0.0) * 1.2 * 0.9999995 + 0.1 = 1.2999994 / -1.0999994;
     float expected = (i % 2 == 0) ? 1.2999994f : -1.0999994f;
-    EXPECT_NEAR(res, expected, max_error) << "Input: " << value;
+    EXPECT_NEAR(res, expected, max_error);
   }
 }
 
-// Computes mean mu and mean of squares mu2 of a vector. Used in
-// ScalarLayerNorm.
-template <typename T>
-HWY_NOINLINE void ScalarMus(const T* HWY_RESTRICT a, size_t size, double& mu,
-                            double& mu2) {
-  HWY_ASSERT(size > 0);
-  double sum = 0.0;
-  double sum2 = 0.0;
-  for (size_t i = 0; i < size; ++i) {
-    const float f = hwy::ConvertScalarTo<float>(a[i]);
-    sum += f;
-    sum2 += f * f;
-  }
-  mu = sum / size;
-  mu2 = sum2 / size;
-}
+class TestLayerNorm {
+ public:
+  template <typename XT, typename WT, typename OT, class D>
+  void operator()(XT, WT, OT, D) const {
+    hwy::RandomState rng;
+    constexpr size_t kSize = 128;
+    XT vec[kSize];
+    WT weight[kSize];
+    WT bias[kSize];
+    OT expected[kSize];
+    OT actual[kSize];
 
-// Compare py/flax/linen/normalization.py.
-// out = (x - mean) * scale * rsqrt(var + epsilon) + bias
-template <typename XT, typename WT, typename OT>
-HWY_NOINLINE void ScalarLayerNorm(const XT* x, const WT* HWY_RESTRICT scale,
-                                  const WT* HWY_RESTRICT bias, OT* out,
-                                  size_t size) {
-  constexpr double kEps = 1e-6;
-  double mu, mu2;
-  ScalarMus(x, size, mu, mu2);
-  double var = mu2 - mu * mu;
-  constexpr double kZero = 0.0;
-  var = HWY_MAX(var, kZero);
-  var = 1.0 / sqrt(var + kEps);
-  for (size_t j = 0; j < size; j++) {
-    const float v = hwy::ConvertScalarTo<float>(x[j]);
-    const float s = hwy::ConvertScalarTo<float>(scale[j]);
-    const float b = hwy::ConvertScalarTo<float>(bias[j]);
-    out[j] = hwy::ConvertScalarTo<OT>((v - mu) * s * var + b);
-  }
-}
+    for (size_t i = 0; i < kSize; ++i) {
+      vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
+      weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
+      bias[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
+    }
 
-template <typename XT, typename WT, typename OT>
-void TestLayerNorm(hwy::RandomState& rng) {
-  constexpr size_t kSize = 128;
-  XT vec[kSize];
-  WT weight[kSize];
-  WT bias[kSize];
-  OT expected[kSize];
-  OT actual[kSize];
+    double expected_mu, expected_mu2;
+    ScalarMus(vec, kSize, expected_mu, expected_mu2);
+    double actual_mu, actual_mu2;
+    ComputeMoments(vec, kSize, actual_mu, actual_mu2);
 
-  for (size_t i = 0; i < kSize; ++i) {
-    vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
-    weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
-    bias[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
-  }
+    ScalarLayerNorm(vec, weight, bias, expected, kSize);
+    LayerNorm(vec, weight, bias, actual, kSize);
 
-  double expected_mu, expected_mu2;
-  ScalarMus(vec, kSize, expected_mu, expected_mu2);
-  double actual_mu, actual_mu2;
-  ComputeMoments(vec, kSize, actual_mu, actual_mu2);
-
-  ScalarLayerNorm(vec, weight, bias, expected, kSize);
-  LayerNorm(vec, weight, bias, actual, kSize);
-
-  for (size_t i = 0; i < kSize; i++) {
-    const float e = hwy::ConvertScalarTo<float>(expected[i]);
-    const float a = hwy::ConvertScalarTo<float>(actual[i]);
-    if (!IsNear(e, a, 1e-5f)) {
-      HWY_ABORT("LayerNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
-                TypeName<WT>(), TypeName<OT>(), i, e, a);
+    for (size_t i = 0; i < kSize; i++) {
+      const float e = hwy::ConvertScalarTo<float>(expected[i]);
+      const float a = hwy::ConvertScalarTo<float>(actual[i]);
+      if (!IsNear(e, a, 1e-5f)) {
+        HWY_ABORT("LayerNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
+                  TypeName<WT>(), TypeName<OT>(), i, e, a);
+      }
     }
   }
-}
+
+ private:
+  // Computes mean mu and mean of squares mu2 of a vector. Used in
+  // ScalarLayerNorm.
+  template <typename T>
+  static HWY_NOINLINE void ScalarMus(const T* HWY_RESTRICT a, size_t size,
+                                     double& mu, double& mu2) {
+    HWY_ASSERT(size > 0);
+    double sum = 0.0;
+    double sum2 = 0.0;
+    for (size_t i = 0; i < size; ++i) {
+      const float f = hwy::ConvertScalarTo<float>(a[i]);
+      sum += f;
+      sum2 += f * f;
+    }
+    mu = sum / size;
+    mu2 = sum2 / size;
+  }
+
+  // Compare py/flax/linen/normalization.py.
+  // out = (x - mean) * scale * rsqrt(var + epsilon) + bias
+  template <typename XT, typename WT, typename OT>
+  static HWY_NOINLINE void ScalarLayerNorm(const XT* x,
+                                           const WT* HWY_RESTRICT scale,
+                                           const WT* HWY_RESTRICT bias, OT* out,
+                                           size_t size) {
+    constexpr double kEps = 1e-6;
+    double mu, mu2;
+    ScalarMus(x, size, mu, mu2);
+    double var = mu2 - mu * mu;
+    constexpr double kZero = 0.0;
+    var = HWY_MAX(var, kZero);
+    var = 1.0 / sqrt(var + kEps);
+    for (size_t j = 0; j < size; j++) {
+      const float v = hwy::ConvertScalarTo<float>(x[j]);
+      const float s = hwy::ConvertScalarTo<float>(scale[j]);
+      const float b = hwy::ConvertScalarTo<float>(bias[j]);
+      out[j] = hwy::ConvertScalarTo<OT>((v - mu) * s * var + b);
+    }
+  }
+};
 
 void TestAllLayerNorm() {
-  hwy::RandomState rng;
-  TestLayerNorm<float, float, float>(rng);
-  TestLayerNorm<float, float, BF16>(rng);
-  TestLayerNorm<BF16, float, float>(rng);
-  TestLayerNorm<BF16, float, BF16>(rng);
+  ForeachActivationType3<TestLayerNorm>(hn::ScalableTag<float>());
 }
 
 void TestSampleTopK() {
@@ -646,12 +762,15 @@ namespace gcpp {
 HWY_BEFORE_TEST(OpsTest);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllAddFrom);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConst);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstTo);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstAndAdd);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSoftmax);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllCreateDistribution);
-HWY_EXPORT_AND_TEST_P(OpsTest, TestSigmoid);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSigmoid);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestAllGelu);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestRopeAndMulBy);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllRMSNorm);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestAllRMSNormInplace);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllLayerNorm);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestLayerNormSimple);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestSampleTopK);