1.29x speedup: bf16 C1/C2. Extend most ops to any type, expand test coverage.

Also widen the dot_test.cc assertion range for Zen4, and increase the matmul_test tolerance (it was failing in some configs)

PiperOrigin-RevId: 801789922
Jan Wassenberg 2025-09-01 06:32:24 -07:00 committed by Copybara-Service
parent bc0c0bac8b
commit 229bd078a1
13 changed files with 761 additions and 456 deletions
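For orientation, a scalar sketch (an illustration, not part of this commit) of the decompress-apply-compress pattern that the new `Decompress*AndCompress*` helpers below vectorize: packed elements are widened to f32, a caller-supplied functor runs in f32, and results are narrowed back to the packed type. `BF16Bits`, `ToF32`, `ToBF16`, and `TransformInplace` are hypothetical stand-ins; rounding, SIMD, and remainder handling are omitted.

#include <cstddef>
#include <cstdint>
#include <cstring>

using BF16Bits = uint16_t;  // stand-in: the upper 16 bits of an IEEE-754 f32

static float ToF32(BF16Bits b) {
  const uint32_t u = static_cast<uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &u, sizeof(f));
  return f;
}

static BF16Bits ToBF16(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return static_cast<BF16Bits>(u >> 16);  // truncates; the real code rounds
}

// Decompress to f32, apply `func`, compress back; one element at a time.
template <class Func>
void TransformInplace(BF16Bits* inout, size_t num, Func&& func) {
  for (size_t i = 0; i < num; ++i) {
    inout[i] = ToBF16(func(ToF32(inout[i])));
  }
}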

View File

@@ -376,6 +376,7 @@ cc_test(
":test_util",
":threading_context",
"@googletest//:gtest_main", # buildcleaner: keep
"//compression:test_util",
"//compression:types",
"@highway//:hwy",
"@highway//:hwy_test_util",

View File

@@ -709,6 +709,242 @@ HWY_INLINE float DecompressAndCall(D, const PackedSpan<const VT> v,
comp3);
}
// Similar to `hn::Transform*`, but for compressed `T`. Used by ops-inl.h.
// `DF` is the decompressed type, typically `float`.
template <class DF, typename T, class Func>
HWY_INLINE void DecompressAndCompressInplace(DF df, T* HWY_RESTRICT inout,
size_t num, Func&& func) {
const auto packed_inout = MakeSpan(inout, num);
using VF = hn::Vec<decltype(df)>;
HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
size_t i = 0;
if (num >= 2 * NF) {
for (; i <= num - 2 * NF; i += 2 * NF) {
VF v0, v1;
Decompress2(df, packed_inout, i, v0, v1);
const VF out0 = func(df, v0);
const VF out1 = func(df, v1);
Compress2(df, out0, out1, packed_inout, i);
}
}
const size_t remaining = num - i;
HWY_DASSERT(remaining < 2 * NF);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN float buf_inout[2 * hn::MaxLanes(df)];
// Ensure the second vector is zeroed even if remaining <= NF.
hn::Store(hn::Zero(df), df, buf_inout + NF);
DecompressAndZeroPad(df, packed_inout, i, buf_inout, remaining);
const VF v0 = hn::Load(df, buf_inout);
const VF v1 = hn::Load(df, buf_inout + NF);
const VF out0 = func(df, v0);
const VF out1 = func(df, v1);
Compress2(df, out0, out1, MakeSpan(buf_inout, 2 * NF), 0);
// Clang generates incorrect code for CopyBytes if num = 2.
for (size_t j = 0; j < remaining; ++j) {
inout[i + j] = hwy::ConvertScalarTo<T>(buf_inout[j]);
}
}
}
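A usage sketch (illustrative, not part of the diff; assumes this header and Highway are in scope, and `DoubleInplace` is a hypothetical caller): double each element of an f32 or bf16 buffer in place.

template <typename T>  // T is float or BF16
void DoubleInplace(T* HWY_RESTRICT x, const size_t num) {
  const hn::ScalableTag<float> df;
  using VF = hn::Vec<decltype(df)>;
  DecompressAndCompressInplace(df, x, num,
                               [](decltype(df), VF v) HWY_ATTR -> VF {
                                 return hn::Add(v, v);  // 2 * v, exact in f32
                               });
}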
// One extra argument. `DF` is the decompressed type, typically `float`.
template <class DF, typename T, typename T1, class Func>
HWY_INLINE void Decompress1AndCompressInplace(DF df, T* HWY_RESTRICT inout,
size_t num,
const T1* HWY_RESTRICT p1,
Func&& func) {
const auto packed_inout = MakeSpan(inout, num);
const auto packed1 = MakeSpan(p1, num);
using VF = hn::Vec<decltype(df)>;
HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
size_t i = 0;
if (num >= 2 * NF) {
for (; i <= num - 2 * NF; i += 2 * NF) {
VF v0, v1;
Decompress2(df, packed_inout, i, v0, v1);
VF v10, v11;
Decompress2(df, packed1, i, v10, v11);
const VF out0 = func(df, v0, v10);
const VF out1 = func(df, v1, v11);
Compress2(df, out0, out1, packed_inout, i);
}
}
const size_t remaining = num - i;
HWY_DASSERT(remaining < 2 * NF);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN float buf_inout[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
// Ensure the second vector is zeroed even if remaining <= NF.
hn::Store(hn::Zero(df), df, buf_inout + NF);
hn::Store(hn::Zero(df), df, buf1 + NF);
DecompressAndZeroPad(df, packed_inout, i, buf_inout, remaining);
DecompressAndZeroPad(df, packed1, i, buf1, remaining);
const VF v0 = hn::Load(df, buf_inout);
const VF v1 = hn::Load(df, buf_inout + NF);
const VF v10 = hn::Load(df, buf1);
const VF v11 = hn::Load(df, buf1 + NF);
const VF out0 = func(df, v0, v10);
const VF out1 = func(df, v1, v11);
Compress2(df, out0, out1, MakeSpan(buf_inout, 2 * NF), 0);
// Clang generates incorrect code for CopyBytes if num = 2.
for (size_t j = 0; j < remaining; ++j) {
inout[i + j] = hwy::ConvertScalarTo<T>(buf_inout[j]);
}
}
}
// Single input, separate output. `DF` is the decompressed type, typically
// `float`.
template <class DF, typename T, typename T1, class Func>
HWY_INLINE void Decompress1AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
const T1* HWY_RESTRICT p1,
Func&& func) {
const auto packed_out = MakeSpan(out, num);
const auto packed1 = MakeSpan(p1, num);
using VF = hn::Vec<decltype(df)>;
HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
size_t i = 0;
if (num >= 2 * NF) {
for (; i <= num - 2 * NF; i += 2 * NF) {
VF v10, v11;
Decompress2(df, packed1, i, v10, v11);
const VF out0 = func(df, v10);
const VF out1 = func(df, v11);
Compress2(df, out0, out1, packed_out, i);
}
}
const size_t remaining = num - i;
HWY_DASSERT(remaining < 2 * NF);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
// Ensure the second vector is zeroed even if remaining <= NF.
hn::Store(hn::Zero(df), df, buf1 + NF);
DecompressAndZeroPad(df, packed1, i, buf1, remaining);
const VF v10 = hn::Load(df, buf1);
const VF v11 = hn::Load(df, buf1 + NF);
const VF out0 = func(df, v10);
const VF out1 = func(df, v11);
Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
// Clang generates incorrect code for CopyBytes if num = 2.
for (size_t j = 0; j < remaining; ++j) {
out[i + j] = hwy::ConvertScalarTo<T>(buf_out[j]);
}
}
}
// Two inputs. `DF` is the decompressed type, typically `float`.
template <class DF, typename T, typename T1, typename T2, class Func>
HWY_INLINE void Decompress2AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
const T1* HWY_RESTRICT p1,
const T2* HWY_RESTRICT p2,
Func&& func) {
const auto packed_out = MakeSpan(out, num);
const auto packed1 = MakeSpan(p1, num);
const auto packed2 = MakeSpan(p2, num);
using VF = hn::Vec<decltype(df)>;
HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
size_t i = 0;
if (num >= 2 * NF) {
for (; i <= num - 2 * NF; i += 2 * NF) {
VF v10, v11, v20, v21;
Decompress2(df, packed1, i, v10, v11);
Decompress2(df, packed2, i, v20, v21);
const VF out0 = func(df, v10, v20);
const VF out1 = func(df, v11, v21);
Compress2(df, out0, out1, packed_out, i);
}
}
const size_t remaining = num - i;
HWY_DASSERT(remaining < 2 * NF);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf2[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
// Ensure the second vector is zeroed even if remaining <= NF.
hn::Store(hn::Zero(df), df, buf1 + NF);
hn::Store(hn::Zero(df), df, buf2 + NF);
DecompressAndZeroPad(df, packed1, i, buf1, remaining);
DecompressAndZeroPad(df, packed2, i, buf2, remaining);
const VF v10 = hn::Load(df, buf1);
const VF v11 = hn::Load(df, buf1 + NF);
const VF v20 = hn::Load(df, buf2);
const VF v21 = hn::Load(df, buf2 + NF);
const VF out0 = func(df, v10, v20);
const VF out1 = func(df, v11, v21);
Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
// Clang generates incorrect code for CopyBytes if num = 2.
for (size_t j = 0; j < remaining; ++j) {
out[i + j] = hwy::ConvertScalarTo<T>(buf_out[j]);
}
}
}
// Three inputs. `DF` is the decompressed type, typically `float`.
template <class DF, typename T, typename T1, typename T2, typename T3,
class Func>
HWY_INLINE void Decompress3AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
const T1* HWY_RESTRICT p1,
const T2* HWY_RESTRICT p2,
const T3* HWY_RESTRICT p3,
Func&& func) {
const auto packed_out = MakeSpan(out, num);
const auto packed1 = MakeSpan(p1, num);
const auto packed2 = MakeSpan(p2, num);
const auto packed3 = MakeSpan(p3, num);
using VF = hn::Vec<decltype(df)>;
HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
size_t i = 0;
if (num >= 2 * NF) {
for (; i <= num - 2 * NF; i += 2 * NF) {
VF v10, v11, v20, v21, v30, v31;
Decompress2(df, packed1, i, v10, v11);
Decompress2(df, packed2, i, v20, v21);
Decompress2(df, packed3, i, v30, v31);
const VF out0 = func(df, v10, v20, v30);
const VF out1 = func(df, v11, v21, v31);
Compress2(df, out0, out1, packed_out, i);
}
}
const size_t remaining = num - i;
HWY_DASSERT(remaining < 2 * NF);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf2[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf3[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
// Ensure the second vector is zeroed even if remaining <= NF.
hn::Store(hn::Zero(df), df, buf1 + NF);
hn::Store(hn::Zero(df), df, buf2 + NF);
hn::Store(hn::Zero(df), df, buf3 + NF);
DecompressAndZeroPad(df, packed1, i, buf1, remaining);
DecompressAndZeroPad(df, packed2, i, buf2, remaining);
DecompressAndZeroPad(df, packed3, i, buf3, remaining);
const VF v10 = hn::Load(df, buf1);
const VF v11 = hn::Load(df, buf1 + NF);
const VF v20 = hn::Load(df, buf2);
const VF v21 = hn::Load(df, buf2 + NF);
const VF v30 = hn::Load(df, buf3);
const VF v31 = hn::Load(df, buf3 + NF);
const VF out0 = func(df, v10, v20, v30);
const VF out1 = func(df, v11, v21, v31);
Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
// Clang generates incorrect code for CopyBytes if num = 2.
for (size_t j = 0; j < remaining; ++j) {
out[i + j] = hwy::ConvertScalarTo<T>(buf_out[j]);
}
}
}
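The `*To` variants likewise accept mixed input and output types; a sketch (illustrative, not part of the diff; `AddBF16F32` is a hypothetical caller) of an elementwise add that reads bf16 and f32 and writes bf16:

void AddBF16F32(BF16* HWY_RESTRICT out, const BF16* HWY_RESTRICT a,
                const float* HWY_RESTRICT b, const size_t num) {
  const hn::ScalableTag<float> df;
  using VF = hn::Vec<decltype(df)>;
  Decompress2AndCompressTo(df, out, num, a, b,
                           [](decltype(df), VF va, VF vb) HWY_ATTR -> VF {
                             return hn::Add(va, vb);
                           });
}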
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace gcpp

View File

@@ -18,11 +18,10 @@
#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
#endif // HWY_DISABLED_TARGETS
#include "compression/compress.h"
#include <stddef.h>
#include <stdio.h>
#include "compression/compress.h"
#include "compression/distortion.h"
#include "util/test_util.h"
#include "hwy/aligned_allocator.h"
@@ -45,7 +44,7 @@ namespace hn = hwy::HWY_NAMESPACE;
// Calls Compress and Decompress2 and verifies the distortion/error.
template <typename Packed>
struct TestDecompress2T {
struct TestDecompress2 {
template <typename T, class D>
HWY_INLINE void operator()(T /*unused*/, D d) {
const size_t N = hn::Lanes(d);
@@ -120,12 +119,12 @@ struct TestDecompress2T {
}
};
void TestAllDecompress2() { ForeachPackedAndRawType<TestDecompress2T>(); }
void TestAllDecompress2() { ForeachPackedAndRawType<TestDecompress2>(); }
// Calls Compress and DecompressAndZeroPad for all short lengths and verifies
// the distortion/error.
template <typename Packed>
struct TestShortLengthsT {
struct TestShortLengths {
template <typename T, class D>
HWY_INLINE void operator()(T /*unused*/, D d) {
const size_t N = hn::Lanes(d);
@@ -196,7 +195,82 @@ struct TestShortLengthsT {
}
};
void TestAllShortLengths() { ForeachPackedAndRawType<TestShortLengthsT>(); }
void TestAllShortLengths() { ForeachPackedAndRawType<TestShortLengths>(); }
// Verifies the arguments and remainder handling of `DecompressAndCompress*`.
class TestDecompressAndCompress {
public:
template <typename T, class D>
HWY_INLINE void operator()(T /*unused*/, D d) {
ForeachActivationType3<Test>(d);
}
private:
struct Test {
template <typename T1, typename T2, typename T3, /*Deduced:*/ class D>
void operator()(T1, T2, T3, D d) {
hwy::RandomState rng;
using DF = hn::Repartition<float, D>;
using VF = hn::Vec<DF>;
const DF df;
for (size_t num = 1; num < 7 * hn::Lanes(d); ++num) {
auto p = hwy::AllocateAligned<T1>(num);
auto p1 = hwy::AllocateAligned<T2>(num);
auto p2 = hwy::AllocateAligned<T3>(num);
auto out = hwy::AllocateAligned<T1>(num);
auto expected1 = hwy::AllocateAligned<T1>(num);
auto expected2 = hwy::AllocateAligned<T1>(num);
auto expected3 = hwy::AllocateAligned<T1>(num);
HWY_ASSERT(p && p1 && p2 && out && expected1 && expected2 && expected3);
// Two bits each, totaling 6 bits, which fit in the BF16 mantissa, so the
// expected sums below are exactly representable.
for (size_t i = 0; i < num; ++i) {
const size_t mod = i & 3;
p[i] = hwy::ConvertScalarTo<T1>(mod);
p1[i] = hwy::ConvertScalarTo<T2>(mod << 2);
p2[i] = hwy::ConvertScalarTo<T3>(mod << 4);
// Copy so that `Decompress1AndCompressInplace` below does not overwrite `p`.
out[i] = p[i];
expected1[i] = hwy::ConvertScalarTo<T1>(mod);
expected2[i] = hwy::ConvertScalarTo<T1>((mod << 2) | mod);
expected3[i] =
hwy::ConvertScalarTo<T1>((mod << 4) | (mod << 2) | mod);
}
DecompressAndCompressInplace(df, p.get(), num,
[](DF, VF v) HWY_ATTR -> VF { return v; });
HWY_ASSERT_ARRAY_EQ(expected1.get(), p.get(), num);
// Uses `out` so as not to overwrite `p`.
Decompress1AndCompressInplace(
df, out.get(), num, p1.get(),
[](DF, VF v, VF v1) HWY_ATTR -> VF { return hn::Add(v, v1); });
HWY_ASSERT_ARRAY_EQ(expected2.get(), out.get(), num);
Decompress1AndCompressTo(df, out.get(), num, p.get(),
[](DF, VF v) HWY_ATTR -> VF { return v; });
HWY_ASSERT_ARRAY_EQ(expected1.get(), out.get(), num);
Decompress2AndCompressTo(df, out.get(), num, p.get(), p1.get(),
[](DF, VF v, VF v1)
HWY_ATTR -> VF { return hn::Add(v, v1); });
HWY_ASSERT_ARRAY_EQ(expected2.get(), out.get(), num);
Decompress3AndCompressTo(
df, out.get(), num, p.get(), p1.get(), p2.get(),
[](DF, VF v, VF v1, VF v2)
HWY_ATTR -> VF { return hn::Add(hn::Add(v, v1), v2); });
HWY_ASSERT_ARRAY_EQ(expected3.get(), out.get(), num);
}
}
};
};
void TestAllDecompressAndCompress() {
// The Highway Test interface (`ForGE128Vectors`) only supports a single type.
// We hard-code one here, and use `ForeachActivationType` internally.
hn::ForGE128Vectors<TestDecompressAndCompress>()(float());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@@ -208,6 +282,7 @@ namespace gcpp {
HWY_BEFORE_TEST(CompressTest);
HWY_EXPORT_AND_TEST_P(CompressTest, TestAllDecompress2);
HWY_EXPORT_AND_TEST_P(CompressTest, TestAllShortLengths);
HWY_EXPORT_AND_TEST_P(CompressTest, TestAllDecompressAndCompress);
HWY_AFTER_TEST();
} // namespace gcpp
#endif // HWY_ONCE

View File

@@ -67,6 +67,35 @@ void ForeachPackedAndRawType() {
}
}
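// Invoke `Test` with every combination of {f32, bf16} for one, two or three
// type arguments; used by the ops and compress tests to cover all mixes of
// activation types.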
template <class Test, class D>
void ForeachActivationType1(D d) {
Test test;
test(float(), d);
test(BF16(), d);
}
template <class Test, class D>
void ForeachActivationType2(D d) {
Test test;
test(float(), float(), d);
test(float(), BF16(), d);
test(BF16(), float(), d);
test(BF16(), BF16(), d);
}
template <class Test, class D>
void ForeachActivationType3(D d) {
Test test;
test(float(), float(), float(), d);
test(float(), float(), BF16(), d);
test(float(), BF16(), float(), d);
test(float(), BF16(), BF16(), d);
test(BF16(), float(), float(), d);
test(BF16(), float(), BF16(), d);
test(BF16(), BF16(), float(), d);
test(BF16(), BF16(), BF16(), d);
}
// Generates inputs: deterministic, within max SfpStream range.
template <typename MatT>
MatStorageT<MatT> GenerateMat(const Extents2D& extents,

View File

@@ -186,6 +186,11 @@ constexpr bool IsNuqStream() {
return hwy::IsSame<hwy::RemoveCvRef<Packed>, NuqStream>();
}
template <typename Packed>
constexpr bool SupportsPointerArithmetic() {
return !IsNuqStream<Packed>();
}
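Illustrative compile-time checks (an assumption about intended use, not part of the diff): the predicate can guard element-indexed code paths.

static_assert(SupportsPointerArithmetic<float>(), "");
static_assert(SupportsPointerArithmetic<BF16>(), "");
static_assert(!SupportsPointerArithmetic<NuqStream>(), "stream format");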
// Tensor types for loading weights.
enum class Type { kUnknown, kF32, kBF16, kSFP, kNUQ, kF64 };
// These are used in `ModelConfig.Specifier`, hence the strings will not

View File

@@ -206,9 +206,8 @@ struct Activations {
// Gated FFW
MatStorageT<BF16> pre_ffw_rms_out;
// Norm may be large, so prefer to keep as f32.
MatStorageT<float> C1;
MatStorageT<float> C2;
MatStorageT<BF16> C1;
MatStorageT<BF16> C2;
MatStorageT<float> ffw_out;
AttentionActivations attention;

View File

@@ -144,8 +144,8 @@ void SingleDotSoftmaxWeightedSum(
// Apply rope and scaling to Q.
if (layer.query_norm_scale.HasPtr()) {
CallUpcasted(&layer.query_norm_scale, [&](const auto* weights_t) {
RMSNormInplace(weights_t->PackedScale1(), 0, q,
layer.layer_config.qkv_dim, p, worker);
RMSNormInplace(weights_t->PackedScale1(), q, layer.layer_config.qkv_dim,
p, worker);
});
}
@@ -307,7 +307,7 @@ static HWY_INLINE void ComputeQKV(size_t num_tokens, const size_t layer_idx,
// Apply further processing to K.
if (layer.key_norm_scale.HasPtr()) {
CallUpcasted(&layer.key_norm_scale, [&](const auto* weights_t) {
RMSNormInplace(weights_t->PackedScale1(), 0, kv_f32, qkv_dim,
RMSNormInplace(weights_t->PackedScale1(), kv_f32, qkv_dim,
env.ctx.profiler, thread);
});
}

View File

@@ -43,14 +43,14 @@ HWY_BEFORE_NAMESPACE();
namespace gcpp {
namespace HWY_NAMESPACE {
template <typename T>
void Activation(ActivationType activation, T* HWY_RESTRICT c1,
const T* HWY_RESTRICT c2, const size_t count, hwy::Profiler& p,
template <typename T1, typename T2>
void Activation(ActivationType activation, T1* HWY_RESTRICT c1,
const T2* HWY_RESTRICT c2, const size_t count, hwy::Profiler& p,
const size_t worker) {
static const auto zone = p.AddZone("Gen.Activation");
PROFILER_ZONE3(p, worker, zone);
namespace hn = hwy::HWY_NAMESPACE;
using DF = hn::ScalableTag<T>;
using DF = hn::ScalableTag<float>;
using VF = hn::Vec<DF>;
// ActivationType::Gelu
if (c2 == nullptr) { // No multiplier, just Gelu.
@@ -58,9 +58,10 @@ void Activation(ActivationType activation, T* HWY_RESTRICT c1,
return;
};
// Has multiplier, Gelu(c1) * c2.
hn::Transform1(DF(), c1, count, c2, [](DF df, VF v, VF mul) HWY_ATTR {
return hn::Mul(mul, Gelu(df, v));
});
Decompress1AndCompressInplace(DF(), c1, count, c2,
[](DF df, VF v1, VF v2) HWY_ATTR -> VF {
return hn::Mul(v2, Gelu(df, v1));
});
}
// No C2 multiplier.
@@ -75,10 +76,9 @@ void ActivationBatched(ActivationType activation, Mat& c1,
});
}
template <class Mat>
HWY_NOINLINE void ActivationBatched(ActivationType activation, Mat& c1,
const Mat* c2, ThreadingContext& ctx) {
using T = typename Mat::T;
template <class Mat1, class Mat2>
HWY_NOINLINE void ActivationBatched(ActivationType activation, Mat1& c1,
const Mat2* c2, ThreadingContext& ctx) {
if (c2) HWY_DASSERT(c1.SameShape(*c2));
if (c2 && c2->HasPtr()) {
SmallParallelFor(c1.Rows(), ctx.pools, [&](uint64_t task, size_t worker) {
@@ -87,8 +87,9 @@ HWY_NOINLINE void ActivationBatched(ActivationType activation, Mat& c1,
});
} else { // No multiplier
SmallParallelFor(c1.Rows(), ctx.pools, [&](uint64_t task, size_t worker) {
Activation(activation, c1.Row(task), static_cast<const T*>(nullptr),
c1.Cols(), ctx.profiler, worker);
Activation(activation, c1.Row(task),
static_cast<const typename Mat2::T*>(nullptr), c1.Cols(),
ctx.profiler, worker);
});
}
}

View File

@@ -335,7 +335,7 @@ void PrefillVit(const ModelConfig& model_config, const WeightsPtrs& weights,
// Apply soft embedding norm before input projection.
CallUpcasted(&weights.mm_embed_norm, [&](const auto* weights_t) {
RMSNormInplace(weights_t->PackedScale1(), 0, activations.x.Row(0),
RMSNormInplace(weights_t->PackedScale1(), activations.x.Row(0),
vit_model_dim, env.ctx.profiler, hwy::Profiler::Thread());
});
}

View File

@@ -750,7 +750,7 @@ class DotStats {
void CheckMuls() const {
// Comp2 is between Compensated and Kahan.
ASSERT_INSIDE(kComp2, 1.001, s_muls[kComp2].Mean(), 1.4);
ASSERT_INSIDE(kComp2, 1.001f, s_muls[kComp2].Max(), 2.4f);
ASSERT_INSIDE(kComp2, 1.001f, s_muls[kComp2].Max(), 6.8f);
ASSERT_INSIDE(kComp2, 1.0, s_muls[kComp2].GeometricMean(), 1.2);
// Compensated and Double are very accurate.

View File

@@ -119,7 +119,7 @@ void AssertClose(const MatPtrT<TA>& A, const MatPtrT<TB>& B,
const double eps_bf16 = hwy::ConvertScalarTo<double>(hwy::Epsilon<BF16>());
const double eps_f32 = hwy::ConvertScalarTo<double>(hwy::Epsilon<float>());
// Dot() uses double-precision summation.
double tolerance = 12 * norm * eps_f32;
double tolerance = 20 * norm * eps_f32;
// If B is F32, Dot() promotes F32 or even F64, but MatMul demotes the F32 to
// BF16, so add extra tolerance.
if (IsF32<TB>()) {

View File

@@ -127,12 +127,13 @@ HWY_INLINE hn::Vec<D> Gelu(D d, hn::Vec<D> v) {
}
// Activation already has a profiler zone.
static HWY_NOINLINE HWY_MAYBE_UNUSED void Gelu(float* HWY_RESTRICT x,
size_t size) {
template <typename T>
static HWY_NOINLINE HWY_MAYBE_UNUSED void Gelu(T* HWY_RESTRICT x, size_t size) {
namespace hn = hwy::HWY_NAMESPACE;
using D = hn::ScalableTag<float>;
hn::Transform(D(), x, size,
[](D d, hn::Vec<D> v) HWY_ATTR { return Gelu(d, v); });
using DF = hn::ScalableTag<float>;
using VF = hn::Vec<DF>;
DecompressAndCompressInplace(
DF(), x, size, [](DF d, VF v) HWY_ATTR -> VF { return Gelu(d, v); });
}
template <class D, HWY_IF_F32_D(D)>
@@ -179,13 +180,15 @@ HWY_INLINE hn::Vec<D> Sigmoid(D d, hn::Vec<D> v) {
}
// Sigmoid using the logistic function 1 / (1 + exp(-x[i]))
static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(float* HWY_RESTRICT x,
template <typename T>
static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(T* HWY_RESTRICT x,
size_t size) {
PROFILER_ZONE("ops.Sigmoid");
namespace hn = hwy::HWY_NAMESPACE;
using D = hn::ScalableTag<float>;
hn::Transform(D(), x, size,
[](D d, hn::Vec<D> v) HWY_ATTR { return Sigmoid(d, v); });
using DF = hn::ScalableTag<float>;
using VF = hn::Vec<DF>;
DecompressAndCompressInplace(
DF(), x, size, [](DF d, VF v) HWY_ATTR -> VF { return Sigmoid(d, v); });
}
namespace detail {
@@ -205,71 +208,53 @@ float RMSNormMul(const VT* HWY_RESTRICT x, const size_t size, hwy::Profiler& p,
} // namespace detail
// `w_ofs` is the offset within `weight`, required for NuqStream.
template <typename XT, typename WT, typename OT>
HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm(const XT* HWY_RESTRICT x,
const WT* HWY_RESTRICT weight,
size_t w_ofs, OT* HWY_RESTRICT out,
OT* HWY_RESTRICT out,
const size_t size, hwy::Profiler& p,
const size_t worker) {
static const auto zone = p.AddZone("Ops.RMSNorm");
PROFILER_ZONE3(p, worker, zone);
namespace hn = hwy::HWY_NAMESPACE;
const hn::ScalableTag<float> df;
using VF = hn::Vec<decltype(df)>;
const size_t NF = hn::Lanes(df);
using DF = hn::ScalableTag<float>;
using VF = hn::Vec<DF>;
const VF mul = hn::Set(df, detail::RMSNormMul(x, size, p, worker));
const VF mul = hn::Set(DF(), detail::RMSNormMul(x, size, p, worker));
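// Pass the vector to the lambda via pointer: sizeless vector types (SVE/RVV)
// cannot be captured by value.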
const VF* HWY_RESTRICT pmul = &mul;
const auto packed_x = MakeSpan(x, size);
const auto packed_w = MakeSpan(weight, w_ofs + size);
const auto packed_out = MakeSpan(out, size);
HWY_DASSERT(size % (2 * NF) == 0);
for (size_t i = 0; i < size; i += 2 * NF) {
VF x0, x1, w0, w1;
Decompress2(df, packed_x, i, x0, x1);
Decompress2(df, packed_w, w_ofs + i, w0, w1);
const VF m0 = hn::Mul(mul, x0);
const VF m1 = hn::Mul(mul, x1);
// (1+weight) * m = m + weight*m = one FMA.
const VF out0 = hn::MulAdd(m0, w0, m0);
const VF out1 = hn::MulAdd(m1, w1, m1);
Compress2(df, out0, out1, packed_out, i);
}
Decompress2AndCompressTo(DF(), out, size, x, weight,
[pmul](DF /*df*/, VF vx, VF vw) HWY_ATTR -> VF {
const VF m = hn::Mul(*pmul, vx);
// (1+weight) * m = m + weight*m = one FMA.
return hn::MulAdd(m, vw, m);
});
}
// Same as RMSNorm, but in-place: RMSNorm's HWY_RESTRICT forbids passing the
// same pointer for input and output.
template <typename WT, typename XT>
HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace(
const WT* HWY_RESTRICT weight, size_t w_ofs, XT* HWY_RESTRICT inout,
const size_t size, hwy::Profiler& p, const size_t worker) {
HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace(const WT* HWY_RESTRICT weight,
XT* HWY_RESTRICT inout,
const size_t size,
hwy::Profiler& p,
const size_t worker) {
static const auto zone = p.AddZone("Ops.RMSNormInplace");
PROFILER_ZONE3(p, worker, zone);
namespace hn = hwy::HWY_NAMESPACE;
const hn::ScalableTag<float> df;
using VF = hn::Vec<decltype(df)>;
const size_t NF = hn::Lanes(df);
using DF = hn::ScalableTag<float>;
using VF = hn::Vec<DF>;
const VF mul = hn::Set(df, detail::RMSNormMul(inout, size, p, worker));
const VF mul = hn::Set(DF(), detail::RMSNormMul(inout, size, p, worker));
const VF* HWY_RESTRICT pmul = &mul;
const auto packed_w = MakeSpan(weight, w_ofs + size);
const auto packed_x = MakeSpan(inout, size);
HWY_DASSERT(size % (2 * NF) == 0);
for (size_t i = 0; i < size; i += 2 * NF) {
VF x0, x1, w0, w1;
Decompress2(df, packed_x, i, x0, x1);
Decompress2(df, packed_w, w_ofs + i, w0, w1);
const VF m0 = hn::Mul(mul, x0);
const VF m1 = hn::Mul(mul, x1);
// (1+weight) * m = m + weight*m = one FMA.
const VF out0 = hn::MulAdd(m0, w0, m0);
const VF out1 = hn::MulAdd(m1, w1, m1);
Compress2(df, out0, out1, packed_x, i);
}
Decompress1AndCompressInplace(DF(), inout, size, weight,
[pmul](DF /*df*/, VF vx, VF vw) HWY_ATTR -> VF {
const VF m = hn::Mul(*pmul, vx);
// (1+weight) * m = m + weight*m = one FMA.
return hn::MulAdd(m, vw, m);
});
}
// Computes mean mu and mean of squares mu2 of a vector. Used in LayerNorm.
@@ -301,9 +286,9 @@ HWY_NOINLINE void LayerNorm(const XT* x, const WT* HWY_RESTRICT scale,
PROFILER_ZONE("ops.LayerNorm");
namespace hn = hwy::HWY_NAMESPACE;
const hn::ScalableTag<float> df;
using VF = hn::Vec<decltype(df)>;
const size_t NF = hn::Lanes(df);
using DF = hn::ScalableTag<float>;
const DF df;
using VF = hn::Vec<DF>;
double mu, mu2;
ComputeMoments(x, size, mu, mu2);
@@ -315,56 +300,13 @@ HWY_NOINLINE void LayerNorm(const XT* x, const WT* HWY_RESTRICT scale,
const VF* HWY_RESTRICT pmu = &vmu;
const VF* HWY_RESTRICT pvar = &vvar;
const auto packed_x = MakeSpan(x, size);
const auto packed_scale = MakeSpan(scale, size);
const auto packed_bias = MakeSpan(bias, size);
const auto packed_out = MakeSpan(out, size);
// Loop body for one vector, called from main loop and remainder loop.
const auto norm = [pmu, pvar](VF x, VF s, VF add) HWY_ATTR -> VF {
const VF centered = hn::Sub(x, *pmu);
const VF mul = hn::Mul(s, *pvar);
return hn::MulAdd(centered, mul, add);
};
size_t i = 0;
if (size >= 2 * NF) {
for (; i <= size - 2 * NF; i += 2 * NF) {
VF x0, x1, s0, s1, add0, add1;
Decompress2(df, packed_x, i, x0, x1);
Decompress2(df, packed_scale, i, s0, s1);
Decompress2(df, packed_bias, i, add0, add1);
const VF n0 = norm(x0, s0, add0);
const VF n1 = norm(x1, s1, add1);
Compress2(df, n0, n1, packed_out, i);
}
}
const size_t remaining = size - i;
HWY_DASSERT(remaining < 2 * NF);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf_scale[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf_bias[2 * hn::MaxLanes(df)];
// Ensure the second vectors are zeroed even if remaining <= NF.
hn::Store(hn::Zero(df), df, buf_x + NF);
hn::Store(hn::Zero(df), df, buf_scale + NF);
hn::Store(hn::Zero(df), df, buf_bias + NF);
HWY_ALIGN OT buf_out[2 * hn::MaxLanes(df)];
DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
DecompressAndZeroPad(df, packed_scale, i, buf_scale, remaining);
DecompressAndZeroPad(df, packed_bias, i, buf_bias, remaining);
const VF x0 = hn::Load(df, buf_x);
const VF x1 = hn::Load(df, buf_x + NF);
const VF s0 = hn::Load(df, buf_scale);
const VF s1 = hn::Load(df, buf_scale + NF);
const VF add0 = hn::Load(df, buf_bias);
const VF add1 = hn::Load(df, buf_bias + NF);
const VF n0 = norm(x0, s0, add0);
const VF n1 = norm(x1, s1, add1);
Compress2(df, n0, n1, MakeSpan(buf_out, 2 * NF), 0);
hwy::CopyBytes(buf_out, out + i, remaining * sizeof(OT));
}
Decompress3AndCompressTo(DF(), out, size, x, scale, bias,
[pmu, pvar](DF /*df*/, VF x, VF s, VF add)
HWY_ATTR -> VF {
const VF centered = hn::Sub(x, *pmu);
const VF mul = hn::Mul(s, *pvar);
return hn::MulAdd(centered, mul, add);
});
}
static HWY_NOINLINE HWY_MAYBE_UNUSED void AddAbsolutePositionalEmbeddings(
@@ -541,40 +483,11 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void AddFrom(const XT* HWY_RESTRICT x,
PROFILER_ZONE3(p, worker, zone);
namespace hn = hwy::HWY_NAMESPACE;
const hn::ScalableTag<float> df;
const size_t NF = hn::Lanes(df);
using VF = hn::Vec<decltype(df)>;
const auto packed_x = MakeSpan(x, size);
size_t i = 0;
if (size >= 2 * NF) {
for (; i <= size - 2 * NF; i += 2 * NF) {
VF x0, x1;
Decompress2(df, packed_x, i, x0, x1);
VF out0 = hn::Load(df, out + i);
VF out1 = hn::Load(df, out + i + NF);
hn::Store(hn::Add(x0, out0), df, out + i);
hn::Store(hn::Add(x1, out1), df, out + i + NF);
}
}
const size_t remaining = size - i;
const size_t remaining1 = remaining - HWY_MIN(remaining, NF);
HWY_DASSERT(remaining < 2 * NF);
HWY_DASSERT(remaining1 < NF);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
// Ensure the second vector is zeroed even if remaining <= NF.
hn::Store(hn::Zero(df), df, buf_x + NF);
DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
const VF x0 = hn::Load(df, buf_x);
const VF x1 = hn::Load(df, buf_x + NF);
const VF out0 = hn::LoadN(df, out + i, remaining);
const VF out1 = hn::LoadN(df, out + i + NF, remaining1);
hn::StoreN(hn::Add(x0, out0), df, out + i, remaining);
hn::StoreN(hn::Add(x1, out1), df, out + i + NF, remaining1);
}
using DF = hn::ScalableTag<float>;
using VF = hn::Vec<DF>;
Decompress1AndCompressInplace(DF(), out, size, x,
[&](DF /*df*/, VF out, VF x)
HWY_ATTR -> VF { return hn::Add(x, out); });
}
// Simple loops unless/until batch sizes are large enough to parallelize.
@@ -588,7 +501,7 @@ void RMSNormBatched(const MatPtrT<XT>& activations, const MatPtr& weights,
CallUpcasted(&weights, [&](const auto* weights_t) {
SmallParallelFor(
activations.Rows(), ctx.pools, [&](uint64_t token_idx, size_t worker) {
RMSNorm(activations.Row(token_idx), weights_t->PackedScale1(), 0,
RMSNorm(activations.Row(token_idx), weights_t->PackedScale1(),
out.Row(token_idx), activations.Cols(), ctx.profiler, worker);
});
});
@@ -603,7 +516,7 @@ void RMSNormInplaceBatched(const MatPtr& weights, MatPtrT<XT>& inout,
CallUpcasted(&weights, [&](const auto* weights_t) {
SmallParallelFor(
inout.Rows(), ctx.pools, [&](uint64_t token_idx, size_t worker) {
RMSNormInplace(weights_t->PackedScale1(), 0, inout.Row(token_idx),
RMSNormInplace(weights_t->PackedScale1(), inout.Row(token_idx),
inout.Cols(), ctx.profiler, worker);
});
});
@@ -645,38 +558,15 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConst(const float c, XT* HWY_RESTRICT x,
static const auto zone = p.AddZone("Ops.MulByConst");
PROFILER_ZONE3(p, worker, zone);
namespace hn = hwy::HWY_NAMESPACE;
const hn::ScalableTag<float> df;
const size_t NF = hn::Lanes(df);
using VF = hn::Vec<decltype(df)>;
using DF = hn::ScalableTag<float>;
using VF = hn::Vec<DF>;
const VF v_c = hn::Set(df, c);
const auto packed_x = MakeSpan(x, size);
const VF vc = hn::Set(DF(), c);
const VF* HWY_RESTRICT pc = &vc;
size_t i = 0;
if (size >= 2 * NF) {
for (; i <= size - 2 * NF; i += 2 * NF) {
VF x0, x1;
Decompress2(df, packed_x, i, x0, x1);
x0 = hn::Mul(x0, v_c);
x1 = hn::Mul(x1, v_c);
Compress2(df, x0, x1, packed_x, i);
}
}
const size_t remaining = size - i;
HWY_DASSERT(remaining < 2 * NF);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
// Ensure the second vector is zeroed even if remaining <= NF.
hn::Store(hn::Zero(df), df, buf_x + NF);
DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
VF x0 = hn::Load(df, buf_x);
VF x1 = hn::Load(df, buf_x + NF);
x0 = hn::Mul(x0, v_c);
x1 = hn::Mul(x1, v_c);
Compress2(df, x0, x1, MakeSpan(buf_x, 2 * NF), 0);
hwy::CopyBytes(buf_x, x + i, remaining * sizeof(XT));
}
DecompressAndCompressInplace(DF(), x, size,
[pc](DF /*df*/, VF x)
HWY_ATTR -> VF { return hn::Mul(x, *pc); });
}
// Same as above, but with a separate output. Same as below without the add.
@@ -687,42 +577,18 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstTo(
static const auto zone = p.AddZone("Ops.MulByConstTo");
PROFILER_ZONE3(p, worker, zone);
namespace hn = hwy::HWY_NAMESPACE;
const hn::ScalableTag<float> df;
const size_t NF = hn::Lanes(df);
using VF = hn::Vec<decltype(df)>;
using DF = hn::ScalableTag<float>;
using VF = hn::Vec<DF>;
const VF v_c = hn::Set(df, c);
const auto packed_x = MakeSpan(x, size);
const auto packed_out = MakeSpan(out, size);
const VF vc = hn::Set(DF(), c);
const VF* HWY_RESTRICT pc = &vc;
size_t i = 0;
if (size >= 2 * NF) {
for (; i <= size - 2 * NF; i += 2 * NF) {
VF x0, x1;
Decompress2(df, packed_x, i, x0, x1);
const VF out0 = hn::Mul(x0, v_c);
const VF out1 = hn::Mul(x1, v_c);
Compress2(df, out0, out1, packed_out, i);
}
}
const size_t remaining = size - i;
HWY_DASSERT(remaining < 2 * NF);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
// Ensure the second vector is zeroed even if remaining <= NF.
hn::Store(hn::Zero(df), df, buf_x + NF);
DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
const VF x0 = hn::Load(df, buf_x);
const VF x1 = hn::Load(df, buf_x + NF);
const VF out0 = hn::Mul(x0, v_c);
const VF out1 = hn::Mul(x1, v_c);
Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
hwy::CopyBytes(buf_out, out + i, remaining * sizeof(OT));
}
Decompress1AndCompressTo(DF(), out, size, x,
[pc](DF /*df*/, VF x)
HWY_ATTR -> VF { return hn::Mul(x, *pc); });
}
// out[i] += x[i] * c.
template <typename XT, typename OT>
HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAdd(
const float c, const XT* HWY_RESTRICT x, OT* HWY_RESTRICT out,
@@ -730,45 +596,16 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAdd(
static const auto zone = p.AddZone("Ops.MulByConstAndAdd");
PROFILER_ZONE3(p, worker, zone);
namespace hn = hwy::HWY_NAMESPACE;
const hn::ScalableTag<float> df;
const size_t NF = hn::Lanes(df);
using VF = hn::Vec<decltype(df)>;
using DF = hn::ScalableTag<float>;
using VF = hn::Vec<DF>;
const VF v_c = hn::Set(df, c);
const auto packed_x = MakeSpan(x, size);
const auto packed_out = MakeSpan(out, size);
const VF vc = hn::Set(DF(), c);
const VF* HWY_RESTRICT pc = &vc;
size_t i = 0;
if (size >= 2 * NF) {
for (; i <= size - 2 * NF; i += 2 * NF) {
VF x0, x1, out0, out1;
Decompress2(df, packed_x, i, x0, x1);
Decompress2(df, packed_out, i, out0, out1);
out0 = hn::MulAdd(x0, v_c, out0);
out1 = hn::MulAdd(x1, v_c, out1);
Compress2(df, out0, out1, packed_out, i);
}
}
const size_t remaining = size - i;
HWY_DASSERT(remaining < 2 * NF);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
// Ensure the second vectors are zeroed even if remaining <= NF.
hn::Store(hn::Zero(df), df, buf_x + NF);
hn::Store(hn::Zero(df), df, buf_out + NF);
DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
DecompressAndZeroPad(df, packed_out, i, buf_out, remaining);
const VF x0 = hn::Load(df, buf_x);
const VF x1 = hn::Load(df, buf_x + NF);
VF out0 = hn::Load(df, buf_out);
VF out1 = hn::Load(df, buf_out + NF);
out0 = hn::MulAdd(x0, v_c, out0);
out1 = hn::MulAdd(x1, v_c, out1);
Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
hwy::CopyBytes(buf_out, out + i, remaining * sizeof(OT));
}
Decompress1AndCompressInplace(DF(), out, size, x,
[&](DF /*df*/, VF out, VF x) HWY_ATTR -> VF {
return hn::MulAdd(x, *pc, out);
});
}
// See below for a specialized version for top-1 sampling.
@@ -913,15 +750,18 @@ static HWY_NOINLINE void LogitsSoftCap(const float cap, float* HWY_RESTRICT x,
PROFILER_ZONE3(p, worker, zone);
namespace hn = hwy::HWY_NAMESPACE;
using D = hn::ScalableTag<float>;
using V = hn::Vec<D>;
using DF = hn::ScalableTag<float>;
using VF = hn::Vec<DF>;
const float inv_cap = 1.0f / cap;
const VF vcap = hn::Set(DF(), cap);
const VF vinv_cap = hn::Set(DF(), 1.0f / cap);
const VF* HWY_RESTRICT pcap = &vcap;
const VF* HWY_RESTRICT pinv_cap = &vinv_cap;
hn::Transform(D(), x, size, [cap, inv_cap](D d, V v) HWY_ATTR {
return hn::Mul(hn::Set(d, cap),
hn::Tanh(d, hn::Mul(v, hn::Set(d, inv_cap))));
});
DecompressAndCompressInplace(
DF(), x, size, [pcap, pinv_cap](DF d, VF v) HWY_ATTR -> VF {
return hn::Mul(*pcap, hn::Tanh(d, hn::Mul(v, *pinv_cap)));
});
}
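For reference (not part of the diff): this computes cap * tanh(x / cap), which is approximately x for |x| much smaller than cap and saturates toward +/-cap for large |x|.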
// Calls LogitsSoftCap if cap != 0.0f.

View File

@@ -47,6 +47,7 @@
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
// After highway.h
#include "compression/test_util-inl.h"
#include "ops/ops-inl.h"
#include "hwy/tests/test_util-inl.h"
@@ -83,48 +84,6 @@ T Random(hwy::RandomState& rng) {
HWY_MAX(hwy::ConvertScalarTo<double>(hwy::LowestValue<T>()), val));
}
HWY_NOINLINE void SimpleAddFrom(const float* HWY_RESTRICT other,
float* HWY_RESTRICT x, size_t size) {
for (size_t i = 0; i < size; ++i) {
x[i] += other[i];
}
}
HWY_NOINLINE void SimpleMulBy(const float* HWY_RESTRICT other,
float* HWY_RESTRICT x, size_t size) {
for (size_t i = 0; i < size; ++i) {
x[i] *= other[i];
}
}
HWY_NOINLINE void SimpleMulByConst(float c, float* HWY_RESTRICT x,
size_t size) {
for (size_t i = 0; i < size; ++i) {
x[i] *= c;
}
}
HWY_NOINLINE void SimpleMulByConstAndAdd(float c, const float* HWY_RESTRICT x,
float* HWY_RESTRICT out, size_t size) {
for (size_t i = 0; i < size; ++i) {
out[i] += x[i] * c;
}
}
HWY_NOINLINE void SimpleSoftmax(float* HWY_RESTRICT x, size_t size) {
HWY_DASSERT(size != 0);
float sum = 0.0;
const float maxval = *std::max_element(x, x + size);
for (size_t i = 0; i < size; ++i) {
x[i] = std::exp(x[i] - maxval);
sum += x[i];
}
const float scale = 1.0f / sum;
for (size_t i = 0; i < size; ++i) {
x[i] *= scale;
}
}
template <size_t k>
HWY_NOINLINE std::discrete_distribution<int> SourceCreateDistribution(
std::array<float, k>& top_k, float temperature) {
@@ -141,7 +100,8 @@ HWY_NOINLINE std::discrete_distribution<int> SourceCreateDistribution(
return std::discrete_distribution<int>(std::begin(top_k), std::end(top_k));
}
struct TestAddFrom {
class TestAddFrom {
public:
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
hwy::RandomState& rng) {
@@ -171,9 +131,24 @@ struct TestAddFrom {
hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__,
__LINE__);
}
private:
template <typename T1, typename T2>
static HWY_NOINLINE void SimpleAddFrom(const T1* HWY_RESTRICT other,
T2* HWY_RESTRICT x, size_t size) {
for (size_t i = 0; i < size; ++i) {
x[i] = hwy::ConvertScalarTo<T2>(hwy::ConvertScalarTo<float>(x[i]) +
hwy::ConvertScalarTo<float>(other[i]));
}
}
};
struct TestMulByConstAndAdd {
void TestAllAddFrom() {
hn::ForPartialVectors<ForeachCountAndMisalign<TestAddFrom>>()(float());
}
class TestMulByConstAndAdd {
public:
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
hwy::RandomState& rng) {
@@ -204,9 +179,27 @@ struct TestMulByConstAndAdd {
hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__,
__LINE__);
}
private:
template <typename T1, typename T2>
static HWY_NOINLINE void SimpleMulByConstAndAdd(float c,
const T1* HWY_RESTRICT x,
T2* HWY_RESTRICT out,
size_t size) {
for (size_t i = 0; i < size; ++i) {
out[i] = hwy::ConvertScalarTo<T2>(hwy::ConvertScalarTo<float>(out[i]) +
hwy::ConvertScalarTo<float>(x[i]) * c);
}
}
};
struct TestMulByConst {
void TestAllMulByConstAndAdd() {
hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConstAndAdd>>()(
float());
}
class TestMulByConst {
public:
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
hwy::RandomState& rng) {
@@ -234,9 +227,61 @@ struct TestMulByConst {
hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__,
__LINE__);
}
private:
template <typename T1>
static HWY_NOINLINE void SimpleMulByConst(float c, T1* HWY_RESTRICT x,
size_t size) {
for (size_t i = 0; i < size; ++i) {
x[i] = hwy::ConvertScalarTo<T1>(hwy::ConvertScalarTo<float>(x[i]) * c);
}
}
};
struct TestSoftmax {
void TestAllMulByConst() {
hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConst>>()(float());
}
struct TestMulByConstTo {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
hwy::RandomState& rng) {
if (misalign_b == 0) return;
using T = hn::TFromD<D>;
hwy::AlignedFreeUniquePtr<T[]> px =
hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
hwy::AlignedFreeUniquePtr<T[]> pe =
hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
hwy::AlignedFreeUniquePtr<T[]> pactual =
hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
HWY_ASSERT(px && pe && pactual);
T* x = px.get() + misalign_a;
T* e = pe.get() + misalign_a;
T* actual = pactual.get() + misalign_a;
T constant = Random<T>(rng);
for (size_t i = 0; i < count; ++i) {
x[i] = Random<T>(rng);
e[i] = hwy::ConvertScalarTo<T>(hwy::ConvertScalarTo<float>(x[i]) *
hwy::ConvertScalarTo<float>(constant));
}
MulByConstTo(constant, x, actual, count, hwy::Profiler::Get(),
/*worker=*/0);
hwy::AssertArraySimilar(e, actual, count, hwy::TargetName(HWY_TARGET),
__FILE__, __LINE__);
}
};
void TestAllMulByConstTo() {
hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConstTo>>()(float());
}
class TestSoftmax {
public:
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
hwy::RandomState& rng) {
@@ -270,8 +315,27 @@ struct TestSoftmax {
}
ASSERT_NEAR(sum, 1.0, 1e-6);
}
private:
static HWY_NOINLINE void SimpleSoftmax(float* HWY_RESTRICT x, size_t size) {
HWY_DASSERT(size != 0);
float sum = 0.0;
const float maxval = *std::max_element(x, x + size);
for (size_t i = 0; i < size; ++i) {
x[i] = std::exp(x[i] - maxval);
sum += x[i];
}
const float scale = 1.0f / sum;
for (size_t i = 0; i < size; ++i) {
x[i] *= scale;
}
}
};
void TestAllSoftmax() {
hn::ForPartialVectors<ForeachCountAndMisalign<TestSoftmax>>()(float());
}
template <size_t k>
struct TestCreateDistribution {
void operator()(hwy::RandomState& rng) {
@@ -291,43 +355,60 @@ struct TestCreateDistribution {
}
};
void TestAllAddFrom() {
hn::ForPartialVectors<ForeachCountAndMisalign<TestAddFrom>>()(float());
}
void TestAllMulByConst() {
hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConst>>()(float());
}
void TestAllMulByConstAndAdd() {
hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConstAndAdd>>()(
float());
}
void TestAllSoftmax() {
hn::ForPartialVectors<ForeachCountAndMisalign<TestSoftmax>>()(float());
}
void TestAllCreateDistribution() {
TestCreateDistribution<2048>();
TestCreateDistribution<5000>();
}
void TestSigmoid() {
std::vector<float> values;
for (int i = -150; i <= 150; ++i) {
values.push_back(.1f * i);
}
std::vector<float> result = values;
Sigmoid(result.data(), result.size());
struct TestSigmoid {
template <typename T, class D>
void operator()(T, D) const {
std::vector<T> values;
for (int i = -150; i <= 150; ++i) {
values.push_back(hwy::ConvertScalarTo<T>(.1f * i));
}
std::vector<T> result = values;
Sigmoid(result.data(), result.size());
for (size_t i = 0; i < values.size(); i++) {
const float max_error = 0.00007;
float value = values[i];
float approx = result[i];
float expected = (1 / (1 + std::exp(-values[i])));
EXPECT_NEAR(approx, expected, max_error) << "Input: " << value;
for (size_t i = 0; i < values.size(); i++) {
const float max_error = IsBF16<T>() ? 0.2f : 0.00007f;
const float value = hwy::ConvertScalarTo<float>(values[i]);
const float actual = hwy::ConvertScalarTo<float>(result[i]);
const float expected = (1 / (1 + std::exp(-value)));
EXPECT_NEAR(expected, actual, max_error)
<< (IsBF16<T>() ? "bf16" : "float");
}
}
};
static HWY_NOINLINE void TestAllSigmoid() {
ForeachActivationType1<TestSigmoid>(hn::ScalableTag<float>());
}
struct TestGelu {
template <typename T, class D>
void operator()(T, D) const {
std::vector<T> values;
for (int i = -150; i <= 150; ++i) {
values.push_back(hwy::ConvertScalarTo<T>(.1f * i));
}
std::vector<T> result = values;
Gelu(result.data(), result.size());
for (size_t i = 0; i < values.size(); i++) {
const float max_error = IsBF16<T>() ? 0.2f : 0.00007f;
const float x = hwy::ConvertScalarTo<float>(values[i]);
const float actual = hwy::ConvertScalarTo<float>(result[i]);
const float expected =
x * (0.5f + 0.5f * tanh(x * (0.79788f + 0.035677f * x * x)));
EXPECT_NEAR(expected, actual, max_error)
<< (IsBF16<T>() ? "bf16" : "float");
}
}
};
static HWY_NOINLINE void TestAllGelu() {
ForeachActivationType1<TestGelu>(hn::ScalableTag<float>());
}
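For reference (not part of the diff): the expected value above is the standard tanh approximation GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). Factoring x out of the tanh argument yields the constants in the test: sqrt(2/pi) ~= 0.79788 and 0.79788 * 0.044715 ~= 0.035677.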
static HWY_NOINLINE HWY_MAYBE_UNUSED void ScalarRopeAndMulBy(
@@ -421,7 +502,8 @@ void TestRopeAndMulBy() {
}
template <typename T>
HWY_NOINLINE float ScalarSquaredL2(const T* HWY_RESTRICT a, size_t size) {
static HWY_NOINLINE float ScalarSquaredL2(const T* HWY_RESTRICT a,
size_t size) {
double sum = 0.0;
for (size_t i = 0; i < size; ++i) {
const float f = hwy::ConvertScalarTo<float>(a[i]);
@@ -431,9 +513,11 @@ HWY_NOINLINE float ScalarSquaredL2(const T* HWY_RESTRICT a, size_t size) {
}
// Supports bf16 and f32 inputs/outputs, which can be in-place.
// Shared between TestRMSNorm and TestRMSNormInplace.
template <typename XT, typename WT, typename OT>
HWY_NOINLINE void ScalarRMSNorm(const XT* x, const WT* HWY_RESTRICT weight,
OT* out, size_t size) {
static HWY_NOINLINE void ScalarRMSNorm(const XT* x,
const WT* HWY_RESTRICT weight, OT* out,
size_t size) {
constexpr float kEps = 1e-6f;
float ss = ScalarSquaredL2(x, size);
ss = 1.0f / sqrtf(ss / StaticCast<float>(size) + kEps);
@@ -445,42 +529,73 @@ HWY_NOINLINE void ScalarRMSNorm(const XT* x, const WT* HWY_RESTRICT weight,
}
}
template <typename XT, typename WT, typename OT>
void TestRMSNorm(hwy::RandomState& rng) {
constexpr size_t kSize = 128;
HWY_ALIGN XT vec[kSize];
HWY_ALIGN WT weight[kSize];
HWY_ALIGN OT expected[kSize];
HWY_ALIGN OT actual[kSize];
struct TestRMSNorm {
template <typename XT, typename WT, typename OT, class D>
void operator()(XT, WT, OT, D) const {
hwy::RandomState rng;
for (size_t i = 0; i < kSize; ++i) {
vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
}
constexpr size_t kSize = 128;
HWY_ALIGN XT vec[kSize];
HWY_ALIGN WT weight[kSize];
HWY_ALIGN OT expected[kSize];
HWY_ALIGN OT actual[kSize];
ScalarRMSNorm(vec, weight, expected, kSize);
RMSNorm(vec, weight, 0, actual, kSize, hwy::Profiler::Get(), /*worker=*/0);
for (size_t i = 0; i < kSize; ++i) {
vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
}
for (size_t i = 0; i < kSize; i++) {
const float e = hwy::ConvertScalarTo<float>(expected[i]);
const float a = hwy::ConvertScalarTo<float>(actual[i]);
if (!IsNear(e, a, 1e-5f)) {
HWY_ABORT("RMSNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
TypeName<WT>(), TypeName<OT>(), i, e, a);
ScalarRMSNorm(vec, weight, expected, kSize);
RMSNorm(vec, weight, actual, kSize, hwy::Profiler::Get(), /*worker=*/0);
for (size_t i = 0; i < kSize; i++) {
const float e = hwy::ConvertScalarTo<float>(expected[i]);
const float a = hwy::ConvertScalarTo<float>(actual[i]);
if (!IsNear(e, a, 1e-5f)) {
HWY_ABORT("RMSNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
TypeName<WT>(), TypeName<OT>(), i, e, a);
}
}
}
}
};
void TestAllRMSNorm() {
hwy::RandomState rng;
TestRMSNorm<float, float, float>(rng);
TestRMSNorm<float, float, BF16>(rng);
TestRMSNorm<float, BF16, float>(rng);
TestRMSNorm<float, BF16, BF16>(rng);
TestRMSNorm<BF16, float, float>(rng);
TestRMSNorm<BF16, float, BF16>(rng);
TestRMSNorm<BF16, BF16, float>(rng);
TestRMSNorm<BF16, BF16, BF16>(rng);
ForeachActivationType3<TestRMSNorm>(hn::ScalableTag<float>());
}
struct TestRMSNormInplace {
template <typename XT, typename WT, class D>
void operator()(XT, WT, D) const {
hwy::RandomState rng;
constexpr size_t kSize = 128;
HWY_ALIGN XT expected[kSize];
HWY_ALIGN XT actual[kSize];
HWY_ALIGN WT weight[kSize];
for (size_t i = 0; i < kSize; ++i) {
expected[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
actual[i] = expected[i];
weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
}
ScalarRMSNorm(expected, weight, expected, kSize);
RMSNormInplace(weight, actual, kSize, hwy::Profiler::Get(),
/*worker=*/0);
for (size_t i = 0; i < kSize; i++) {
const float e = hwy::ConvertScalarTo<float>(expected[i]);
const float a = hwy::ConvertScalarTo<float>(actual[i]);
if (!IsNear(e, a, 1e-5f)) {
HWY_ABORT("RMSNormInplace %s %s mismatch at %zu: %E %E\n",
TypeName<XT>(), TypeName<WT>(), i, e, a);
}
}
}
};
void TestAllRMSNormInplace() {
ForeachActivationType2<TestRMSNormInplace>(hn::ScalableTag<float>());
}
void TestLayerNormSimple() {
@@ -497,91 +612,92 @@ void TestLayerNormSimple() {
for (size_t i = 0; i < kSize; i++) {
const float max_error = 1e-6f;
float value = values[i];
float res = result[i];
// out = (x - 0.0) * 1.2 * 0.9999995 + 0.1 = 1.2999994 / -1.0999994;
float expected = (i % 2 == 0) ? 1.2999994f : -1.0999994f;
EXPECT_NEAR(res, expected, max_error) << "Input: " << value;
EXPECT_NEAR(res, expected, max_error);
}
}
// Computes mean mu and mean of squares mu2 of a vector. Used in
// ScalarLayerNorm.
template <typename T>
HWY_NOINLINE void ScalarMus(const T* HWY_RESTRICT a, size_t size, double& mu,
double& mu2) {
HWY_ASSERT(size > 0);
double sum = 0.0;
double sum2 = 0.0;
for (size_t i = 0; i < size; ++i) {
const float f = hwy::ConvertScalarTo<float>(a[i]);
sum += f;
sum2 += f * f;
}
mu = sum / size;
mu2 = sum2 / size;
}
class TestLayerNorm {
public:
template <typename XT, typename WT, typename OT, class D>
void operator()(XT, WT, OT, D) const {
hwy::RandomState rng;
constexpr size_t kSize = 128;
XT vec[kSize];
WT weight[kSize];
WT bias[kSize];
OT expected[kSize];
OT actual[kSize];
// Compare py/flax/linen/normalization.py.
// out = (x - mean) * scale * rsqrt(var + epsilon) + bias
template <typename XT, typename WT, typename OT>
HWY_NOINLINE void ScalarLayerNorm(const XT* x, const WT* HWY_RESTRICT scale,
const WT* HWY_RESTRICT bias, OT* out,
size_t size) {
constexpr double kEps = 1e-6;
double mu, mu2;
ScalarMus(x, size, mu, mu2);
double var = mu2 - mu * mu;
constexpr double kZero = 0.0;
var = HWY_MAX(var, kZero);
var = 1.0 / sqrt(var + kEps);
for (size_t j = 0; j < size; j++) {
const float v = hwy::ConvertScalarTo<float>(x[j]);
const float s = hwy::ConvertScalarTo<float>(scale[j]);
const float b = hwy::ConvertScalarTo<float>(bias[j]);
out[j] = hwy::ConvertScalarTo<OT>((v - mu) * s * var + b);
}
}
for (size_t i = 0; i < kSize; ++i) {
vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
bias[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
}
template <typename XT, typename WT, typename OT>
void TestLayerNorm(hwy::RandomState& rng) {
constexpr size_t kSize = 128;
XT vec[kSize];
WT weight[kSize];
WT bias[kSize];
OT expected[kSize];
OT actual[kSize];
double expected_mu, expected_mu2;
ScalarMus(vec, kSize, expected_mu, expected_mu2);
double actual_mu, actual_mu2;
ComputeMoments(vec, kSize, actual_mu, actual_mu2);
for (size_t i = 0; i < kSize; ++i) {
vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
bias[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
}
ScalarLayerNorm(vec, weight, bias, expected, kSize);
LayerNorm(vec, weight, bias, actual, kSize);
double expected_mu, expected_mu2;
ScalarMus(vec, kSize, expected_mu, expected_mu2);
double actual_mu, actual_mu2;
ComputeMoments(vec, kSize, actual_mu, actual_mu2);
ScalarLayerNorm(vec, weight, bias, expected, kSize);
LayerNorm(vec, weight, bias, actual, kSize);
for (size_t i = 0; i < kSize; i++) {
const float e = hwy::ConvertScalarTo<float>(expected[i]);
const float a = hwy::ConvertScalarTo<float>(actual[i]);
if (!IsNear(e, a, 1e-5f)) {
HWY_ABORT("LayerNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
TypeName<WT>(), TypeName<OT>(), i, e, a);
for (size_t i = 0; i < kSize; i++) {
const float e = hwy::ConvertScalarTo<float>(expected[i]);
const float a = hwy::ConvertScalarTo<float>(actual[i]);
if (!IsNear(e, a, 1e-5f)) {
HWY_ABORT("LayerNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
TypeName<WT>(), TypeName<OT>(), i, e, a);
}
}
}
}
private:
// Computes mean mu and mean of squares mu2 of a vector. Used in
// ScalarLayerNorm.
template <typename T>
static HWY_NOINLINE void ScalarMus(const T* HWY_RESTRICT a, size_t size,
double& mu, double& mu2) {
HWY_ASSERT(size > 0);
double sum = 0.0;
double sum2 = 0.0;
for (size_t i = 0; i < size; ++i) {
const float f = hwy::ConvertScalarTo<float>(a[i]);
sum += f;
sum2 += f * f;
}
mu = sum / size;
mu2 = sum2 / size;
}
// Compare py/flax/linen/normalization.py.
// out = (x - mean) * scale * rsqrt(var + epsilon) + bias
template <typename XT, typename WT, typename OT>
static HWY_NOINLINE void ScalarLayerNorm(const XT* x,
const WT* HWY_RESTRICT scale,
const WT* HWY_RESTRICT bias, OT* out,
size_t size) {
constexpr double kEps = 1e-6;
double mu, mu2;
ScalarMus(x, size, mu, mu2);
double var = mu2 - mu * mu;
constexpr double kZero = 0.0;
var = HWY_MAX(var, kZero);
var = 1.0 / sqrt(var + kEps);
for (size_t j = 0; j < size; j++) {
const float v = hwy::ConvertScalarTo<float>(x[j]);
const float s = hwy::ConvertScalarTo<float>(scale[j]);
const float b = hwy::ConvertScalarTo<float>(bias[j]);
out[j] = hwy::ConvertScalarTo<OT>((v - mu) * s * var + b);
}
}
};
void TestAllLayerNorm() {
hwy::RandomState rng;
TestLayerNorm<float, float, float>(rng);
TestLayerNorm<float, float, BF16>(rng);
TestLayerNorm<float, BF16, float>(rng);
TestLayerNorm<float, BF16, BF16>(rng);
ForeachActivationType3<TestLayerNorm>(hn::ScalableTag<float>());
}
void TestSampleTopK() {
@@ -646,12 +762,15 @@ namespace gcpp {
HWY_BEFORE_TEST(OpsTest);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllAddFrom);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConst);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstTo);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstAndAdd);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSoftmax);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllCreateDistribution);
HWY_EXPORT_AND_TEST_P(OpsTest, TestSigmoid);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSigmoid);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllGelu);
HWY_EXPORT_AND_TEST_P(OpsTest, TestRopeAndMulBy);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllRMSNorm);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllRMSNormInplace);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllLayerNorm);
HWY_EXPORT_AND_TEST_P(OpsTest, TestLayerNormSimple);
HWY_EXPORT_AND_TEST_P(OpsTest, TestSampleTopK);