mirror of https://github.com/google/gemma.cpp.git
Implement scalar version of LayerNorm
PiperOrigin-RevId: 675085495
This commit is contained in:
parent 1c8ddcdffe
commit 892f3bbcbe
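In short, the commit adds a scalar LayerNorm (out = (x - mean) * scale * rsqrt(var + epsilon) + bias), a batched wrapper, and tests. A minimal call on one row of float activations might look like the sketch below; buffer names and sizes are illustrative, only the LayerNorm signature is taken from the diff, and the call assumes the gemma.cpp ops header is included and made inside its HWY_NAMESPACE.

// Illustrative usage sketch, not part of this commit.
std::vector<float> x(2048);            // one token's activations, filled elsewhere
std::vector<float> scale(2048, 1.0f);  // learned per-channel scale
std::vector<float> bias(2048, 0.0f);   // learned per-channel bias
std::vector<float> out(2048);
LayerNorm(x.data(), scale.data(), bias.data(), out.data(), x.size());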
@@ -214,6 +214,57 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace(
  }
}

// Computes mean mu and mean of squares mu2 of a vector. Used in LayerNorm.
template <typename T>
HWY_NOINLINE void ScalarMus(const T* HWY_RESTRICT a, size_t size, T& mu,
                            T& mu2) {
  HWY_ASSERT(size > 0);
  double sum = 0.0;
  double sum2 = 0.0;
  for (size_t i = 0; i < size; ++i) {
    const float f = hwy::ConvertScalarTo<float>(a[i]);
    sum += f;
    sum2 += f * f;
  }
  mu = sum / size;
  mu2 = sum2 / size;
}

// Compare py/flax/linen/normalization.py.
// out = (x - mean) * scale * rsqrt(var + epsilon) + bias
template <typename VecT, typename WeightT, typename OutT>
HWY_NOINLINE void ScalarLayerNorm(const VecT* x,
                                  const WeightT* HWY_RESTRICT scale,
                                  const WeightT* HWY_RESTRICT bias,
                                  OutT* out,
                                  size_t size) {
  constexpr float kEps = 1e-6f;
  VecT mu, mu2;
  ScalarMus(x, size, mu, mu2);
  VecT var = mu2 - mu * mu;
  VecT zero = 0.0f;
  var = HWY_MAX(var, zero);
  var = 1.0f / sqrtf(var + kEps);
  for (size_t j = 0; j < size; j++) {
    const float v = hwy::ConvertScalarTo<float>(x[j]);
    const float s = hwy::ConvertScalarTo<float>(scale[j]);
    const float b = hwy::ConvertScalarTo<float>(bias[j]);
    out[j] = hwy::ConvertScalarTo<OutT>((v - mu) * s * var + b);
  }
}

template <typename VecT, typename WeightT, typename OutT>
HWY_NOINLINE HWY_MAYBE_UNUSED void LayerNorm(const VecT* x,
                                             const WeightT* HWY_RESTRICT weight,
                                             const WeightT* HWY_RESTRICT bias,
                                             OutT* out,
                                             const size_t size) {
  PROFILER_FUNC;
  // For now we only delegate to the scalar version.
  // TODO: implement vectorized version.
  ScalarLayerNorm(x, weight, bias, out, size);
}

static HWY_NOINLINE HWY_MAYBE_UNUSED void AddAbsolutePositionalEmbeddings(
    float* HWY_RESTRICT x, size_t dim_model, size_t pos) {
  PROFILER_ZONE("ops.AddAbsolutePositionalEmbeddings");
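Aside (not part of the diff): ScalarLayerNorm obtains the variance from the two means via Var[x] = E[x^2] - (E[x])^2, and because that single-pass form can come out slightly negative in floating point, the code clamps var at zero before the rsqrt. A small standalone check of the identity, with made-up values:

// Illustrative only: verifies Var[x] = E[x^2] - (E[x])^2, the identity behind
// `var = mu2 - mu * mu` above. Values are arbitrary.
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<double> x = {1.0, -2.0, 0.5, 3.0};
  double mu = 0.0, mu2 = 0.0;
  for (double v : x) {
    mu += v;
    mu2 += v * v;
  }
  mu /= x.size();   // E[x]
  mu2 /= x.size();  // E[x^2]

  double var_direct = 0.0;  // two-pass variance for comparison
  for (double v : x) var_direct += (v - mu) * (v - mu);
  var_direct /= x.size();

  const double var_identity = mu2 - mu * mu;
  assert(std::fabs(var_direct - var_identity) < 1e-12);
  std::printf("variance = %f either way\n", var_direct);
  return 0;
}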
@@ -377,6 +428,16 @@ void RMSNormInplaceBatched(size_t num_tokens, const WeightT* weights,
  }
}

template <typename VecT, typename WeightT, typename OutT>
void LayerNormBatched(size_t num_tokens, const VecT* x,
                      const WeightT* HWY_RESTRICT weight,
                      const WeightT* HWY_RESTRICT bias, OutT* out,
                      const size_t size) {
  for (size_t token_idx = 0; token_idx < num_tokens; ++token_idx) {
    LayerNorm(x + token_idx * size, weight, bias, out + token_idx * size, size);
  }
}

static HWY_INLINE void AddFromBatched(size_t num_tokens, const float* other,
                                      float* x, const size_t model_dim) {
  for (size_t token_idx = 0; token_idx < num_tokens; ++token_idx) {
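LayerNormBatched treats x and out as row-major [num_tokens, size] buffers that share a single weight/bias vector, normalizing each row independently. A usage sketch (dimensions and buffer names are made up, only the signature comes from the diff):

// Illustrative only, not part of this commit.
constexpr size_t kNumTokens = 4;
constexpr size_t kModelDim = 2048;
std::vector<float> activations(kNumTokens * kModelDim);  // filled elsewhere
std::vector<float> weight(kModelDim, 1.0f);
std::vector<float> bias(kModelDim, 0.0f);
std::vector<float> normed(kNumTokens * kModelDim);
LayerNormBatched(kNumTokens, activations.data(), weight.data(), bias.data(),
                 normed.data(), kModelDim);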
@@ -498,6 +498,67 @@ void TestAllRMSNorm() {
  TestRMSNorm<BF16, BF16, BF16>(rng);
}

void TestLayerNormSimple() {
  const size_t kSize = 52;
  std::vector<float> values(kSize);
  // Alternating 1.0/-1.0, so mean=0.0, var=1.0, rsqrt(var+epsilon)=0.9999995
  for (int i = 0; i < kSize; ++i) {
    values[i] = (i % 2 == 0) ? 1.0f : -1.0f;
  }
  std::vector<float> scale(kSize, 1.2f);
  std::vector<float> bias(kSize, 0.1f);
  std::vector<float> result(kSize);
  LayerNorm(values.data(), scale.data(), bias.data(), result.data(), kSize);

  for (size_t i = 0; i < kSize; i++) {
    const float max_error = 1e-6f;
    float value = values[i];
    float res = result[i];
    // out = (x - 0.0) * 1.2 * 0.9999995 + 0.1 = 1.2999994 / -1.0999994;
    float expected = (i % 2 == 0) ? 1.2999994f : -1.0999994f;
    EXPECT_NEAR(res, expected, max_error) << "Input: " << value;
  }
}

// Note: there is no vectorized implementation of LayerNorm yet. So this test
// currently only checks that the scalar version can be called for the below
// combinations of float/BF16 inputs and outputs.
template <typename VecT, typename WeightT, typename OutT>
void TestLayerNorm(hwy::RandomState& rng) {
  constexpr size_t kSize = 128;
  VecT vec[kSize];
  WeightT weight[kSize];
  WeightT bias[kSize];
  OutT expected[kSize];
  OutT actual[kSize];

  for (size_t i = 0; i < kSize; ++i) {
    vec[i] = hwy::ConvertScalarTo<VecT>(RandomGaussian(rng));
    weight[i] = hwy::ConvertScalarTo<WeightT>(RandomGaussian(rng));
    bias[i] = hwy::ConvertScalarTo<WeightT>(RandomGaussian(rng));
  }

  ScalarLayerNorm(vec, weight, bias, expected, kSize);
  LayerNorm(vec, weight, bias, actual, kSize);

  for (size_t i = 0; i < kSize; i++) {
    const float e = hwy::ConvertScalarTo<float>(expected[i]);
    const float a = hwy::ConvertScalarTo<float>(actual[i]);
    if (!IsNear(e, a, 1e-5f)) {
      HWY_ABORT("LayerNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<VecT>(),
                TypeName<WeightT>(), TypeName<OutT>(), i, e, a);
    }
  }
}

void TestAllLayerNorm() {
  hwy::RandomState rng;
  TestLayerNorm<float, float, float>(rng);
  TestLayerNorm<float, float, BF16>(rng);
  TestLayerNorm<float, BF16, float>(rng);
  TestLayerNorm<float, BF16, BF16>(rng);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace gcpp
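For reference, the constants asserted in TestLayerNormSimple follow directly from the LayerNorm formula: the alternating +1/-1 input has mean 0 and variance 1, so the normalizer is 1 / sqrt(1 + 1e-6) ≈ 0.9999995, giving 1.0 * 1.2 * 0.9999995 + 0.1 = 1.2999994 for the even entries and -1.0 * 1.2 * 0.9999995 + 0.1 = -1.0999994 for the odd ones.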
@@ -516,6 +577,8 @@ HWY_EXPORT_AND_TEST_P(OpsTest, TestAllCreateDistribution);
HWY_EXPORT_AND_TEST_P(OpsTest, TestSigmoid);
HWY_EXPORT_AND_TEST_P(OpsTest, TestRopeAndMulBy);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllRMSNorm);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllLayerNorm);
HWY_EXPORT_AND_TEST_P(OpsTest, TestLayerNormSimple);
HWY_AFTER_TEST();

}  // namespace gcpp