diff --git a/BUILD.bazel b/BUILD.bazel index 2fc9e60..862555c 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -48,15 +48,6 @@ cc_library( ], ) -# Avoids circular dependency: fp_arith-inl -> compress-inl -> ops-inl -cc_library( - name = "fp_arith", - textual_hdrs = ["ops/fp_arith-inl.h"], - deps = [ - "@hwy//:hwy", - ], -) - cc_library( name = "ops", hdrs = [ @@ -64,13 +55,13 @@ cc_library( ], textual_hdrs = [ "ops/dot-inl.h", + "ops/fp_arith-inl.h", "ops/matmul-inl.h", "ops/matvec-inl.h", "ops/ops-inl.h", ], deps = [ ":allocator", - ":fp_arith", ":threading", "//compression:compress", "//compression:sfp", @@ -97,14 +88,17 @@ cc_test( ":common", ":gemma_lib", ":ops", + ":test_util", ":threading", "@googletest//:gtest_main", # buildcleaner: keep "//compression:compress", + "//compression:test_util", "@hwy//:hwy", "@hwy//:hwy_test_util", "@hwy//:nanobenchmark", #buildcleaner: keep "@hwy//:profiler", "@hwy//:stats", + "@hwy//:thread_pool", ], ) @@ -468,6 +462,7 @@ cc_library( ":ops", ":prompt", ":weights", + "@hwy//:dot", "@hwy//:hwy", # base.h "@hwy//:thread_pool", ], diff --git a/CMakeLists.txt b/CMakeLists.txt index 84832ff..c49bdf9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,10 +44,10 @@ set(SOURCES compression/io_win.cc compression/io.cc compression/io.h - compression/nuq.h compression/nuq-inl.h compression/sfp-inl.h compression/shared.h + compression/test_util-inl.h compression/weights_raw.h backprop/activations.h backprop/backward.cc diff --git a/backprop/backward-inl.h b/backprop/backward-inl.h index 76bd87e..df7c6fb 100644 --- a/backprop/backward-inl.h +++ b/backprop/backward-inl.h @@ -46,6 +46,7 @@ // After highway.h #include "ops/matmul-inl.h" #include "ops/ops-inl.h" +#include "hwy/contrib/dot/dot-inl.h" HWY_BEFORE_NAMESPACE(); namespace gcpp { diff --git a/backprop/forward-inl.h b/backprop/forward-inl.h index 838b042..c799cf4 100644 --- a/backprop/forward-inl.h +++ b/backprop/forward-inl.h @@ -52,11 +52,14 @@ namespace HWY_NAMESPACE { template void InputEmbedding(const ArrayT& weights, const std::vector& prompt, const float scaling, float* HWY_RESTRICT output, - size_t model_dim) { + size_t model_dim, size_t vocab_size) { + const hn::ScalableTag df; HWY_ASSERT(!prompt.empty()); for (size_t pos = 0; pos < prompt.size() - 1; ++pos) { int token = prompt[pos]; - Decompress(weights, token * model_dim, output + pos * model_dim, model_dim); + DecompressAndZeroPad(df, MakeSpan(weights.data(), model_dim * vocab_size), + token * model_dim, output + pos * model_dim, + model_dim); MulByConst(scaling, output + pos * model_dim, model_dim); } } @@ -245,7 +248,7 @@ float CrossEntropyLossForwardPass(const std::vector& prompt, const size_t num_tokens = prompt.size() - 1; InputEmbedding(weights.embedder_input_embedding, prompt, kEmbScaling, - forward.layers[0].input.data(), kModelDim); + forward.layers[0].input.data(), kModelDim, kVocabSize); for (size_t layer = 0; layer < kLayers; ++layer) { auto type = TConfig::kLayerConfig[layer]; diff --git a/compression/BUILD b/compression/BUILD index a437a17..f3767bf 100644 --- a/compression/BUILD +++ b/compression/BUILD @@ -50,7 +50,10 @@ cc_library( cc_library( name = "distortion", - hdrs = ["distortion.h"], + hdrs = [ + "distortion.h", + "shared.h", + ], deps = [ "@hwy//:hwy", "@hwy//:stats", @@ -64,29 +67,43 @@ cc_test( srcs = ["distortion_test.cc"], deps = [ ":distortion", - ":shared", - "@googletest//:gtest_main", + "@googletest//:gtest_main", # buildcleaner: keep "//:test_util", - "@hwy//:hwy", "@hwy//:hwy_test_util", "@hwy//:nanobenchmark", # 
Unpredictable1 ], ) cc_library( - name = "shared", + name = "sfp", hdrs = ["shared.h"], + textual_hdrs = ["sfp-inl.h"], deps = [ "@hwy//:hwy", ], ) cc_library( - name = "sfp", - textual_hdrs = ["sfp-inl.h"], + name = "nuq", + hdrs = ["shared.h"], + textual_hdrs = ["nuq-inl.h"], deps = [ - ":shared", + ":sfp", "@hwy//:hwy", + "@hwy//hwy/contrib/sort:vqsort", + ], +) + +cc_library( + name = "test_util", + textual_hdrs = [ + "test_util-inl.h", + ], + deps = [ + ":compress", + ":distortion", + "@hwy//:hwy", + "@hwy//:hwy_test_util", ], ) @@ -102,9 +119,7 @@ cc_test( deps = [ ":distortion", ":sfp", - ":shared", - "@googletest//:gtest_main", - "//:ops", + "@googletest//:gtest_main", # buildcleaner: keep "//:test_util", "@hwy//:hwy", "@hwy//:hwy_test_util", @@ -112,18 +127,6 @@ cc_test( ], ) -cc_library( - name = "nuq", - hdrs = ["nuq.h"], - textual_hdrs = ["nuq-inl.h"], - deps = [ - ":sfp", - ":shared", - "@hwy//:hwy", - "@hwy//hwy/contrib/sort:vqsort", - ], -) - cc_test( name = "nuq_test", size = "small", @@ -138,8 +141,7 @@ cc_test( ":distortion", ":nuq", ":sfp", - ":shared", - "@googletest//:gtest_main", + "@googletest//:gtest_main", # buildcleaner: keep "//:test_util", "@hwy//:hwy", "@hwy//:hwy_test_util", @@ -149,19 +151,20 @@ cc_test( cc_library( name = "compress", - hdrs = ["compress.h"], - textual_hdrs = [ - "compress-inl.h", + hdrs = [ + "compress.h", + "shared.h", ], + textual_hdrs = ["compress-inl.h"], deps = [ ":blob_store", ":distortion", ":io", ":nuq", ":sfp", - ":shared", - "//:fp_arith", "@hwy//:hwy", + "@hwy//:nanobenchmark", + "@hwy//:profiler", "@hwy//:stats", "@hwy//:thread_pool", ], @@ -170,6 +173,7 @@ cc_library( cc_test( name = "compress_test", size = "small", + timeout = "long", srcs = ["compress_test.cc"], features = ["fully_static_link"], linkstatic = True, @@ -179,11 +183,11 @@ cc_test( deps = [ ":compress", ":distortion", - "@googletest//:gtest_main", + ":test_util", + "@googletest//:gtest_main", # buildcleaner: keep "//:test_util", "@hwy//:hwy", "@hwy//:hwy_test_util", - "@hwy//:nanobenchmark", "@hwy//:thread_pool", ], ) @@ -193,11 +197,9 @@ cc_library( name = "analyze", textual_hdrs = ["analyze.h"], deps = [ - ":distortion", ":nuq", ":sfp", "@hwy//:hwy", - "@hwy//:nanobenchmark", # timer "@hwy//:stats", "@hwy//:thread_pool", "@hwy//hwy/contrib/sort:vqsort", @@ -210,7 +212,6 @@ cc_library( deps = [ "//:allocator", "//:common", - "//compression:compress", "@hwy//:hwy", "@hwy//:thread_pool", ], @@ -221,15 +222,13 @@ cc_binary( srcs = ["compress_weights.cc"], deps = [ ":compress", - ":shared", + ":io", ":weights_raw", - # Placeholder for internal dep, do not remove., + "//:allocator", "//:args", "//:common", - "//:gemma_lib", "//:weights", "@hwy//:hwy", - "@hwy//:nanobenchmark", "@hwy//:profiler", "@hwy//:thread_pool", ], diff --git a/compression/analyze.h b/compression/analyze.h index 342f2f2..38537db 100644 --- a/compression/analyze.h +++ b/compression/analyze.h @@ -26,12 +26,10 @@ #include // std::abs #include -#include "compression/distortion.h" -#include "compression/nuq.h" +#include "compression/shared.h" #include "hwy/base.h" #include "hwy/contrib/thread_pool/thread_pool.h" #include "hwy/stats.h" -#include "hwy/timer.h" #endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_ANALYZE_H_ @@ -55,6 +53,7 @@ namespace HWY_NAMESPACE { class PerThread { public: void NotifyGroup(const float* group) { + constexpr size_t kGroupSize = NuqStream::kGroupSize; hwy::Stats s_group; for (size_t i = 0; i < kGroupSize; ++i) { // Skip zero so we can see the lowest actual magnitude @@ -158,7 
+157,7 @@ class PerThread { class PerLayer { public: void NotifyGroup(const float* group) { - for (size_t i = 0; i < kGroupSize; ++i) { + for (size_t i = 0; i < NuqStream::kGroupSize; ++i) { s_layer_.Notify(group[i]); } } @@ -197,8 +196,8 @@ static HWY_NOINLINE void Analyze(const char* caption, float* mat, size_t layers, const float* layer = &mat[idx_layer * weights_per_layer]; // For each whole group in the layer for (size_t group_start = 0; - group_start + kGroupSize <= weights_per_layer; - group_start += kGroupSize) { + group_start + NuqStream::kGroupSize <= weights_per_layer; + group_start += NuqStream::kGroupSize) { const float* group = layer + group_start; per_layer[idx_layer].NotifyGroup(group); self.NotifyGroup(group); @@ -210,7 +209,7 @@ static HWY_NOINLINE void Analyze(const char* caption, float* mat, size_t layers, const int skip = hwy::Stats::kNoGeomean; fprintf(stderr, "\n------------%s\n", caption); - for (size_t i = 1; i < pool.NumThreads(); ++i) { + for (size_t i = 1; i < pool.NumWorkers(); ++i) { tls[0].Assimilate(tls[i]); } tls[0].PrintAll(); diff --git a/compression/compress-inl.h b/compression/compress-inl.h index 9579e18..e4ea1a1 100644 --- a/compression/compress-inl.h +++ b/compression/compress-inl.h @@ -21,7 +21,6 @@ #include #include -#include #include // lroundf, only if COMPRESS_STATS #include "compression/blob_store.h" @@ -42,133 +41,146 @@ #define THIRD_PARTY_GEMMA_CPP_COMPRESS_TOGGLE #endif +#include "hwy/highway.h" +// After highway.h #include "compression/nuq-inl.h" #include "compression/sfp-inl.h" -#include "hwy/highway.h" +#include "hwy/profiler.h" // also uses SIMD HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; -namespace detail { - -// Adapters to store two f32 vectors to f32 or bf16; avoids duplicating -// RMSNorm and RMSNormInplace for the two output types. -template -void Store2(DF df, hn::Vec v0, hn::Vec v1, float* HWY_RESTRICT out) { - const size_t NF = hn::Lanes(df); - hn::StoreU(v0, df, out); - hn::StoreU(v1, df, out + NF); -} - -template -void Store2(DF df, hn::Vec v0, hn::Vec v1, BF16* HWY_RESTRICT out) { - const hn::Repartition dbf; - hn::StoreU(hn::OrderedDemote2To(dbf, v0, v1), dbf, out); -} - -} // namespace detail - // Enables generic code independent of compression type. template // primary, must specialize struct CompressTraits {}; -// Useful for backprop/, where weights are currently f32. +// Used by backprop/, where weights are currently f32; also MatMul for f32 +// weights or activations, if native `ReorderWidenMulAccumulate` is available. 
template <> struct CompressTraits { - using MatT = float; - static const char* Name() { return "f32"; } - static constexpr bool kSupportsEvenOdd = false; // unnecessary + using Packed = float; - template - static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in, - size_t num, CompressPerThread& tls, - size_t /*out_capacity*/, - MatT* HWY_RESTRICT out, size_t out_ofs) { + template > + static HWY_INLINE void Compress(DF /*df*/, const float* HWY_RESTRICT raw, + size_t num, CompressPerThread& /*tls*/, + const PackedSpan& packed, + const size_t packed_ofs) { + hwy::CopyBytes(raw, packed.ptr + packed_ofs, num * sizeof(raw[0])); + } + + template > + static void Store2(DF df, VF raw0, VF raw1, const PackedSpan& packed, + const size_t packed_ofs) { + const size_t NF = hn::Lanes(df); + hn::StoreU(raw0, df, packed.ptr + packed_ofs); + hn::StoreU(raw1, df, packed.ptr + packed_ofs + NF); + } + + template > + static HWY_INLINE void Load2(DBF16 dbf16, + const PackedSpan& packed, + const size_t packed_ofs, VBF16& raw0, + VBF16& raw1) { + const hn::Repartition df; using VF = hn::Vec; - const size_t N = hn::Lanes(df); - HWY_DASSERT(num >= 2 * N && num % (2 * N) == 0); + const size_t NF = hn::Lanes(df); + const VF f0 = hn::LoadU(df, packed.ptr + packed_ofs + 0 * NF); + const VF f1 = hn::LoadU(df, packed.ptr + packed_ofs + 1 * NF); + const VF f2 = hn::LoadU(df, packed.ptr + packed_ofs + 2 * NF); + const VF f3 = hn::LoadU(df, packed.ptr + packed_ofs + 3 * NF); + raw0 = hn::OrderedDemote2To(dbf16, f0, f1); + raw1 = hn::OrderedDemote2To(dbf16, f2, f3); + } - for (size_t i = 0; i < num; i += 2 * N) { - const VF in0 = hn::LoadU(df, in + i); - const VF in1 = hn::LoadU(df, in + i + N); - hn::StoreU(in0, df, out + out_ofs + i); - hn::StoreU(in1, df, out + out_ofs + i + N); + template > + static HWY_INLINE void Load2(DF df, const PackedSpan& packed, + const size_t packed_ofs, VF& raw0, VF& raw1) { + const size_t N = hn::Lanes(df); + raw0 = hn::LoadU(df, packed.ptr + packed_ofs); + raw1 = hn::LoadU(df, packed.ptr + packed_ofs + N); + } + + template + static HWY_INLINE void DecompressAndZeroPad( + DBF dbf, const PackedSpan& packed, const size_t packed_ofs, + BF16* HWY_RESTRICT raw, size_t num) { + const hn::Repartition df; + using VF = hn::Vec; + using VBF = hn::Vec; + const size_t NF = hn::Lanes(df); + + size_t i = 0; + if (num >= 2 * NF) { + for (; i <= num - 2 * NF; i += 2 * NF) { + const VF f0 = hn::LoadU(df, packed.ptr + packed_ofs + i); + const VF f1 = hn::LoadU(df, packed.ptr + packed_ofs + i + NF); + hn::StoreU(hn::OrderedDemote2To(dbf, f0, f1), dbf, raw + i); + } + } + const size_t remaining = num - i; + HWY_DASSERT(remaining < 2 * NF); + if (HWY_UNLIKELY(remaining != 0)) { + const size_t remaining2 = remaining - HWY_MIN(remaining, NF); + const VF f0 = hn::LoadN(df, packed.ptr + packed_ofs + i, remaining); + const VF f1 = hn::LoadN(df, packed.ptr + packed_ofs + i + NF, remaining2); + hn::StoreU(hn::OrderedDemote2To(dbf, f0, f1), dbf, raw + i); } } template - static HWY_INLINE void Decompress2(DF df, const MatT* HWY_RESTRICT in, - size_t in_ofs, hn::Vec& f0, - hn::Vec& f1) { - const size_t N = hn::Lanes(df); - f0 = hn::LoadU(df, in + in_ofs); - f1 = hn::LoadU(df, in + in_ofs + N); - } - - // Called by MatMul for f32 weights or activations if native - // `ReorderWidenMulAccumulate` is available. 
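// The DecompressAndZeroPad members introduced in this hunk share one contract:
// write `num` decoded values to `raw`, then zero-fill up to the next multiple
// of the vector length so callers may always load whole vectors. A plain-C++
// scalar analogue of that contract (hypothetical helper, `lanes` standing in
// for hn::Lanes(d); not the SIMD code in this patch):

#include <algorithm>
#include <cstddef>

template <typename T>
void DecompressAndZeroPadScalar(const T* packed, size_t packed_ofs,
                                T* raw, size_t num, size_t lanes) {
  std::copy(packed + packed_ofs, packed + packed_ofs + num, raw);
  const size_t padded = (num + lanes - 1) / lanes * lanes;  // round up
  std::fill(raw + num, raw + padded, T{0});                 // zero padding
}
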
- template > - static HWY_INLINE void Decompress2(DBF16 dbf16, const MatT* HWY_RESTRICT in, - size_t in_ofs, VBF16& v0, VBF16& v1) { - const hn::Repartition df; + static HWY_INLINE void DecompressAndZeroPad( + DF df, const PackedSpan& packed, const size_t packed_ofs, + float* HWY_RESTRICT raw, size_t num) { using VF = hn::Vec; const size_t NF = hn::Lanes(df); - const VF f0 = hn::LoadU(df, in + in_ofs + 0 * NF); - const VF f1 = hn::LoadU(df, in + in_ofs + 1 * NF); - const VF f2 = hn::LoadU(df, in + in_ofs + 2 * NF); - const VF f3 = hn::LoadU(df, in + in_ofs + 3 * NF); - v0 = hn::OrderedDemote2To(dbf16, f0, f1); - v1 = hn::OrderedDemote2To(dbf16, f2, f3); - } - template - static HWY_INLINE void Decompress(DF df, size_t /*in_capacity*/, - const MatT* HWY_RESTRICT in, size_t in_ofs, - float* HWY_RESTRICT out, size_t num) { - using VF = hn::Vec; - const size_t N = hn::Lanes(df); - - for (size_t i = 0; i < num; i += N) { - const VF v = hn::LoadU(df, in + in_ofs + i); - hn::StoreU(v, df, out + i); + size_t i = 0; + if (num >= NF) { + for (; i <= num - NF; i += NF) { + const VF vf = hn::LoadU(df, packed.ptr + packed_ofs + i); + hn::StoreU(vf, df, raw + i); + } + } + const size_t remaining = num - i; + HWY_DASSERT(remaining < NF); + if (HWY_UNLIKELY(remaining != 0)) { + const VF vf = hn::LoadN(df, packed.ptr + packed_ofs + i, remaining); + hn::StoreU(vf, df, raw + i); // adds zero padding } } }; template <> -struct CompressTraits { - using MatT = hwy::bfloat16_t; - static const char* Name() { return "bf16"; } - static constexpr bool kSupportsEvenOdd = true; +struct CompressTraits { + using Packed = BF16; - template - static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in, + // Note: it is fine for the lower 16 mantissa bits of `raw` to be nonzero + // because we round rather than truncate. + template > + static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT raw, size_t num, CompressPerThread& tls, - size_t /*out_capacity*/, - MatT* HWY_RESTRICT out, size_t out_ofs) { + const PackedSpan& packed, + const size_t packed_ofs) { const hn::RebindToUnsigned du; - const hn::Repartition dbf; - using VF = hn::Vec; - const size_t N = hn::Lanes(df); - - hn::Vec or_sum = hn::Zero(du); + const hn::Repartition dbf; + const size_t NF = hn::Lanes(df); size_t i = 0; - if (num >= 2 * N) { - for (; i <= num - 2 * N; i += 2 * N) { - const VF in0 = hn::LoadU(df, in + i); - const VF in1 = hn::LoadU(df, in + i + N); + if (num >= 2 * NF) { + for (; i <= num - 2 * NF; i += 2 * NF) { + const VF raw0 = hn::LoadU(df, raw + i); + const VF raw1 = hn::LoadU(df, raw + i + NF); - // Sticky bits so we can warn if any lower bits were set. 
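// The new comment in this hunk relies on OrderedDemote2To rounding rather than
// truncating, which is why the old sticky-bit warning is being removed. As a
// scalar illustration of round-to-nearest-even from f32 to BF16 (hypothetical
// helper; ignores NaN/Inf handling and is not the Highway implementation):

#include <cstdint>
#include <cstring>

static inline uint16_t F32ToBF16RoundScalar(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  const uint32_t lsb = (bits >> 16) & 1u;  // ties round to even
  bits += 0x7FFFu + lsb;                   // round at the BF16 boundary
  return static_cast<uint16_t>(bits >> 16);
}
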
- or_sum = hn::Or3(or_sum, hn::BitCast(du, in0), hn::BitCast(du, in1)); - hn::StoreU(hn::OrderedDemote2To(dbf, in0, in1), dbf, out + out_ofs + i); + hn::StoreU(hn::OrderedDemote2To(dbf, raw0, raw1), dbf, + packed.ptr + packed_ofs + i); if (COMPRESS_STATS) { DistortionStats stats; - for (size_t j = 0; j < 2 * N; ++j) { - stats.Notify(in[i + j], hwy::F32FromBF16(out[out_ofs + i + j])); + for (size_t j = 0; j < 2 * NF; ++j) { + stats.Notify(raw[i + j], + hwy::F32FromBF16(packed.ptr[packed_ofs + i + j])); } tls.stats.Notify(stats); } @@ -176,270 +188,248 @@ struct CompressTraits { } const size_t remaining = num - i; + HWY_DASSERT(remaining < 2 * NF); if (remaining != 0) { - const VF in0 = hn::LoadN(df, in + i, remaining); - const size_t remaining1 = remaining - HWY_MIN(remaining, N / 2); - const VF in1 = hn::LoadN(df, in + i + N, remaining1); + const VF raw0 = hn::LoadN(df, raw + i, remaining); + const size_t remaining1 = remaining - HWY_MIN(remaining, NF); + const VF raw1 = hn::LoadN(df, raw + i + NF, remaining1); - // Sticky bits so we can warn if any lower bits were set. - or_sum = hn::Or3(or_sum, hn::BitCast(du, in0), hn::BitCast(du, in1)); - hn::StoreU(hn::OrderedDemote2To(dbf, in0, in1), dbf, out + out_ofs + i); + hn::StoreN(hn::OrderedDemote2To(dbf, raw0, raw1), dbf, + packed.ptr + packed_ofs + i, remaining); if (COMPRESS_STATS) { DistortionStats stats; for (size_t j = 0; j < remaining; ++j) { - stats.Notify(in[i + j], hwy::F32FromBF16(out[out_ofs + i + j])); + stats.Notify(raw[i + j], + hwy::F32FromBF16(packed.ptr[packed_ofs + i + j])); } tls.stats.Notify(stats); } } - - // If the lower 16 bits are not zero, we should implement rounding. - or_sum = hn::And(or_sum, hn::Set(du, 0xFFFF)); - if (!hn::AllTrue(du, hn::Eq(or_sum, hn::Zero(du)))) { - // fprintf(stderr, "Warning: Lossy truncation."); - } } - template - static HWY_INLINE void Decompress2(DF df, const MatT* HWY_RESTRICT in, - size_t in_ofs, hn::Vec& f0, - hn::Vec& f1) { - const hn::Repartition dbf; - using VBF = hn::Vec; - const VBF in16 = hn::LoadU(dbf, in + in_ofs); - f0 = hn::PromoteLowerTo(df, in16); - f1 = hn::PromoteUpperTo(df, in16); + template > + static void Store2(DF df, VF raw0, VF raw1, const PackedSpan& packed, + const size_t packed_ofs) { + const hn::Repartition dbf; + hn::StoreU(hn::OrderedDemote2To(dbf, raw0, raw1), dbf, + packed.ptr + packed_ofs); } template - static HWY_INLINE void Decompress2(DBF16 dbf16, const MatT* HWY_RESTRICT in, - size_t in_ofs, hn::Vec& v0, - hn::Vec& v1) { - v0 = hn::LoadU(dbf16, in + in_ofs); - v1 = hn::LoadU(dbf16, in + in_ofs + hn::Lanes(dbf16)); + static HWY_INLINE void Load2(DBF16 dbf16, + const PackedSpan& packed, + const size_t packed_ofs, hn::Vec& raw0, + hn::Vec& raw1) { + const size_t N16 = hn::Lanes(dbf16); + raw0 = hn::LoadU(dbf16, packed.ptr + packed_ofs); + raw1 = hn::LoadU(dbf16, packed.ptr + packed_ofs + N16); } template - static HWY_INLINE void Decompress(DF df, size_t /*in_capacity*/, - const MatT* HWY_RESTRICT in, size_t in_ofs, - float* HWY_RESTRICT out, size_t num) { - const hn::Repartition dbf; + static HWY_INLINE void Load2(DF df, const PackedSpan& packed, + const size_t packed_ofs, hn::Vec& raw0, + hn::Vec& raw1) { + const hn::Repartition dbf; + using VBF = hn::Vec; + const VBF packed0 = hn::LoadU(dbf, packed.ptr + packed_ofs); + raw0 = hn::PromoteLowerTo(df, packed0); + raw1 = hn::PromoteUpperTo(df, packed0); + } + + template + static HWY_INLINE void DecompressAndZeroPad( + DBF dbf, const PackedSpan& packed, const size_t packed_ofs, + BF16* HWY_RESTRICT raw, 
size_t num) { using VBF = hn::Vec; - using VF = hn::Vec; const size_t N16 = hn::Lanes(dbf); size_t i = 0; if (num >= N16) { for (i = 0; i <= num - N16; i += N16) { - VF in0, in1; - Decompress2(df, in, in_ofs + i, in0, in1); - hn::StoreU(in0, df, out + i); - hn::StoreU(in1, df, out + i + N16 / 2); + const VBF packed0 = hn::LoadU(dbf, packed.ptr + packed_ofs + i); + hn::StoreU(packed0, dbf, raw + i); } } const size_t remaining = num - i; - if (remaining != 0) { - const VBF in16 = hn::LoadN(dbf, in + in_ofs + i, remaining); - const VF in0 = hn::PromoteLowerTo(df, in16); - const VF in1 = hn::PromoteUpperTo(df, in16); - hn::StoreN(in0, df, out + i, remaining); - // Avoid wraparound, potentially store nothing. - const size_t remaining1 = remaining - HWY_MIN(remaining, N16 / 2); - hn::StoreN(in1, df, out + i + N16 / 2, remaining1); + HWY_DASSERT(remaining < N16); + if (HWY_UNLIKELY(remaining != 0)) { + const VBF packed0 = + hn::LoadN(dbf, packed.ptr + packed_ofs + i, remaining); + hn::StoreU(packed0, dbf, raw + i); } } - // Computes the dot product of an even-odd deinterleaved, f32 `vec_aligned` - // and a column- major matrix `in`. `vec_aligned` should be aligned and - // alternate even-indexed `hn::Lanes(df32)` elements followed by odd-indexed - // `hn::Lanes(df32)` elements. template - static HWY_INLINE float DotEO( - const DF df32, const hwy::bfloat16_t* HWY_RESTRICT in, size_t in_ofs, - const float* HWY_RESTRICT vec_aligned, size_t num) { - HWY_DASSERT(num >= (hn::Lanes(df32) * 2) && - (num % (hn::Lanes(df32) * 2)) == 0); - HWY_DASSERT((in_ofs % (hn::Lanes(df32) * 2)) == 0); - HWY_DASSERT(hn::IsAligned(df32, vec_aligned)); + static HWY_INLINE void DecompressAndZeroPad( + DF df, const PackedSpan& packed, const size_t packed_ofs, + float* HWY_RESTRICT raw, size_t num) { + const hn::Repartition dbf; + using VF = hn::Vec; + using VBF = hn::Vec; + const size_t NF = hn::Lanes(df); - const hn::Repartition dbf16; - using VF32 = decltype(Zero(df32)); - const size_t N = Lanes(dbf16); - - VF32 sum0 = Zero(df32); - VF32 sum1 = Zero(df32); - VF32 sum2 = Zero(df32); - VF32 sum3 = Zero(df32); - - for (size_t i = 0; i < num; /* i += 2 * N */) { - const auto interleaved0 = hn::LoadU(dbf16, in + in_ofs + i); - const VF32 ae0 = Load(df32, vec_aligned + i); - const VF32 ao0 = Load(df32, vec_aligned + i + (N / 2)); - sum0 = hn::MulAdd(ae0, hn::PromoteEvenTo(df32, interleaved0), sum0); - sum1 = hn::MulAdd(ao0, hn::PromoteOddTo(df32, interleaved0), sum1); - i += N; - - const auto interleaved1 = hn::LoadU(dbf16, in + in_ofs + i); - const VF32 ae1 = Load(df32, vec_aligned + i); - const VF32 ao1 = Load(df32, vec_aligned + i + (N / 2)); - sum2 = hn::MulAdd(ae1, hn::PromoteEvenTo(df32, interleaved1), sum2); - sum3 = hn::MulAdd(ao1, hn::PromoteOddTo(df32, interleaved1), sum3); - i += N; + size_t i = 0; + if (num >= 2 * NF) { + for (i = 0; i <= num - 2 * NF; i += 2 * NF) { + VF raw0, raw1; + Load2(df, packed, packed_ofs + i, raw0, raw1); + hn::StoreU(raw0, df, raw + i); + hn::StoreU(raw1, df, raw + i + NF); + } } - sum0 = hn::Add(sum0, sum1); - sum2 = hn::Add(sum2, sum3); - sum0 = hn::Add(sum0, sum2); - return hn::ReduceSum(df32, sum0); + const size_t remaining = num - i; + HWY_DASSERT(remaining < 2 * NF); + if (HWY_UNLIKELY(remaining != 0)) { + const VBF packed0 = + hn::LoadN(dbf, packed.ptr + packed_ofs + i, remaining); + const VF raw0 = hn::PromoteLowerTo(df, packed0); + const VF raw1 = hn::PromoteUpperTo(df, packed0); + // If at most one vector, the first store adds zero padding. 
Check before + // storing the second, because callers only pad to one vector. + hn::StoreU(raw0, df, raw + i); + if (remaining >= NF) hn::StoreU(raw1, df, raw + i + NF); + } } }; // Switching floating point: 8-bit, 2..3 mantissa bits. template <> struct CompressTraits { - using MatT = SfpStream; - static const char* Name() { return "sfp"; } - static constexpr bool kSupportsEvenOdd = true; + using Packed = SfpStream; - // Callers are responsible for scaling `in` such that its magnitudes do not - // exceed 1.875. See CompressedArray::scale(). + // Callers are responsible for scaling `raw` such that its magnitudes do not + // exceed `SfpStream::kMax`. See CompressedArray::scale(). template - static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in, + static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT raw, size_t num, CompressPerThread& tls, - size_t /*out_capacity*/, - MatT* HWY_RESTRICT out, size_t out_ofs) { - SfpCodec::Enc(df, in, num, out + out_ofs); + const PackedSpan& packed, + const size_t packed_ofs) { + SfpCodec::Enc(df, raw, num, packed.ptr + packed_ofs); if (COMPRESS_STATS) { - const hn::Repartition dbf; - auto distorted = hwy::AllocateAligned(num); - SfpCodec::Dec(dbf, out + out_ofs, num, distorted.get()); + const hn::Repartition dbf; + auto distorted = + hwy::AllocateAligned(hwy::RoundUpTo(num, hn::Lanes(dbf))); + SfpCodec::DecompressAndZeroPad(dbf, MakeConst(packed), packed_ofs, + distorted.get(), num); DistortionStats stats; for (size_t i = 0; i < num; ++i) { - stats.Notify(in[i], hwy::F32FromBF16(distorted[i])); + stats.Notify(raw[i], hwy::F32FromBF16(distorted[i])); } tls.stats.Notify(stats); } } - template // f32 or bf16 - static HWY_INLINE void Decompress2(D d, const MatT* HWY_RESTRICT in, - size_t in_ofs, hn::Vec& v0, - hn::Vec& v1) { + template // Caller checks this is f32 or bf16 + static HWY_INLINE void Load2(D d, const PackedSpan& packed, + const size_t packed_ofs, hn::Vec& raw0, + hn::Vec& raw1) { const hn::Twice> d8; using V8 = hn::Vec; - const V8 packed = hn::LoadU(d8, &in->byte + in_ofs); - SfpCodec::Dec2(d, packed, v0, v1); + const V8 v8 = hn::LoadU(d8, &packed.ptr->byte + packed_ofs); + SfpCodec::Dec2(d, v8, raw0, raw1); } - template - static HWY_INLINE void Decompress(D d, size_t /*in_capacity*/, - const MatT* HWY_RESTRICT in, size_t in_ofs, - OutT* HWY_RESTRICT out, size_t num) { - SfpCodec::Dec(d, in + in_ofs, num, out); - } + // Store2 is not yet implemented. - // Computes the dot product of an even-odd deinterleaved, f32 or bf16 - // `vec_aligned` and a column-major matrix `in`. `vec_aligned` should be - // aligned and alternate even-indexed `hn::Lanes(df)` elements followed by - // odd-indexed `hn::Lanes(df)` elements. 
- template - static HWY_INLINE float DotEO(const DF df, const MatT* HWY_RESTRICT in, - size_t in_ofs, - const VecT* HWY_RESTRICT vec_aligned, - size_t num) { - HWY_DASSERT(num >= (hn::Lanes(df) * 2) && (num % (hn::Lanes(df) * 2)) == 0); - HWY_DASSERT((in_ofs % (hn::Lanes(df) * 2)) == 0); - HWY_DASSERT(hn::IsAligned(df, vec_aligned)); - - using VF = hn::Vec; - VF sum0 = hn::Zero(df); - VF sum1 = hn::Zero(df); - VF sum2 = hn::Zero(df); - VF sum3 = hn::Zero(df); - - SfpCodec::DotEO(df, in + in_ofs, num, vec_aligned, sum0, sum1, sum2, sum3); - - // Reduction tree: sum of all accumulators, then their lanes - sum0 = hn::Add(sum0, sum1); - sum2 = hn::Add(sum2, sum3); - sum0 = hn::Add(sum0, sum2); - return hn::ReduceSum(df, sum0); + template + static HWY_INLINE void DecompressAndZeroPad( + D d, const PackedSpan& packed, const size_t packed_ofs, + Raw* HWY_RESTRICT raw, const size_t num) { + SfpCodec::DecompressAndZeroPad(d, packed, packed_ofs, raw, num); } }; // Nonuniform quantization, 4.5 bits per element, two separate streams. template <> struct CompressTraits { - using MatT = NuqStream; - static const char* Name() { return "nuq"; } - static constexpr bool kSupportsEvenOdd = false; + using Packed = NuqStream; template - static HWY_INLINE void Compress(DF df, const float* in, size_t num, - CompressPerThread& tls, size_t out_capacity, - MatT* out, size_t out_ofs) { - NuqCodec::Enc(df, in, num, tls.buf, out_capacity, out, out_ofs); + static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT raw, + size_t num, CompressPerThread& tls, + const PackedSpan& packed, + const size_t packed_ofs) { + NuqCodec::Enc(df, raw, num, tls.buf, packed, packed_ofs); if (COMPRESS_STATS) { for (size_t i = 0; i < num; ++i) { - tls.stats.NotifyIn(static_cast(lroundf(in[i] * 100.0f + 500.0f))); + tls.stats.NotifyIn(static_cast(lroundf(raw[i] * 100.0f + 500.0f))); } - const hn::Repartition dbf; - auto distorted = hwy::AllocateAligned(num); - NuqCodec::Dec(dbf, out_capacity, out, out_ofs, distorted.get(), num); + const hn::Repartition dbf; + const size_t N16 = hn::Lanes(dbf); + auto distorted = hwy::AllocateAligned(hwy::RoundUpTo(num, N16)); + NuqCodec::DecompressAndZeroPad(dbf, MakeConst(packed), packed_ofs, + distorted.get(), num); DistortionStats stats; for (size_t i = 0; i < num; ++i) { - stats.Notify(in[i], hwy::F32FromBF16(distorted[i])); + stats.Notify(raw[i], hwy::F32FromBF16(distorted[i])); } tls.stats.Notify(stats); } } - template - static HWY_INLINE void Decompress(D d, size_t in_capacity, const MatT* in, - size_t in_ofs, OutT* out, size_t num) { - NuqCodec::Dec(d, in_capacity, in, in_ofs, out, num); + template // Caller checks this is f32 or bf16 + static HWY_INLINE void Load2(D d, const PackedSpan& packed, + const size_t packed_ofs, hn::Vec& raw0, + hn::Vec& raw1) { + const hn::Twice> d8; + using V8 = hn::Vec; + NuqCodec::Dec2(d, packed, packed_ofs, raw0, raw1); + } + + // Store2 is not yet implemented. + + template + static HWY_INLINE void DecompressAndZeroPad( + D d, const PackedSpan& packed, const size_t packed_ofs, + Raw* raw, const size_t num) { + NuqCodec::DecompressAndZeroPad(d, packed, packed_ofs, raw, num); } }; -// Compresses `num` inputs to `out` starting at `out_ofs`. This can be used for -// compressing sub-regions of an array. 
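// For orientation before the new signatures below: a hedged sketch of the
// PackedSpan-based round trip, mirroring compress_test.cc later in this patch.
// It assumes it lives in a SIMD-dispatched translation unit (so hn:: and the
// per-target includes are available); names not taken from the patch are
// illustrative only.

void ExampleRoundTrip() {
  const hn::ScalableTag<float> df;
  CompressWorkingSet work;
  hwy::ThreadPool pool(0);
  const size_t num = 512;
  auto raw = hwy::AllocateAligned<float>(num);
  // ... fill raw[0, num) with magnitudes <= SfpStream::kMax ...
  auto packed = hwy::AllocateAligned<SfpStream>(num);
  auto dec = hwy::AllocateAligned<float>(hwy::RoundUpTo(num, hn::Lanes(df)));
  const auto packed_span = MakeSpan(packed.get(), num);
  Compress(raw.get(), num, work, packed_span, /*packed_ofs=*/0, pool);
  DecompressAndZeroPad(df, MakeConst(packed_span), /*packed_ofs=*/0,
                       dec.get(), num);
}
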
-template -HWY_NOINLINE void Compress(const float* in, size_t num, - CompressWorkingSet& work, size_t out_capacity, - MatT* out, size_t out_ofs, hwy::ThreadPool& pool) { - HWY_DASSERT(out_ofs + num <= out_capacity); - work.tls.resize(pool.NumThreads()); +// Compresses `num` elements of `raw` to `packed` starting at `packed_ofs`, +// which is useful for compressing sub-regions of an array. +template +HWY_NOINLINE void Compress(const float* HWY_RESTRICT raw, size_t num, + CompressWorkingSet& work, + const PackedSpan& packed, + const size_t packed_ofs, hwy::ThreadPool& pool) { + packed.BoundsCheck(packed_ofs, num); + work.tls.resize(pool.NumWorkers()); if (COMPRESS_STATS) { for (auto& tls : work.tls) { tls.stats.Reset(); } } - const double t0 = hwy::platform::Now(); + const bool want_bench = num > 1024 * 1024 || COMPRESS_STATS; + const double t0 = want_bench ? hwy::platform::Now() : 0.0; - using Traits = CompressTraits; + using Traits = CompressTraits; constexpr size_t kBatch = 8192; const size_t num_batches = hwy::DivCeil(num, kBatch); pool.Run(0, num_batches, [&](const uint32_t idx_batch, size_t thread) HWY_ATTR { const hn::ScalableTag df; - const size_t in_ofs = idx_batch * kBatch; + const size_t my_pos = idx_batch * kBatch; const size_t my_num = - idx_batch == num_batches - 1 ? (num - in_ofs) : kBatch; - Traits::Compress(df, in + in_ofs, my_num, work.tls[thread], - out_capacity, out, out_ofs + in_ofs); + idx_batch == num_batches - 1 ? (num - my_pos) : kBatch; + Traits::Compress(df, raw + my_pos, my_num, work.tls[thread], + packed, packed_ofs + my_pos); }); - const double t1 = hwy::platform::Now(); - const double mb = static_cast(num) * sizeof(in[0]) * 1E-6; - const double mbps = mb / (t1 - t0); - fprintf(stderr, "Compress %.1f MB/s\n", mbps); + if (want_bench) { // Avoids log spam in tests + const double t1 = hwy::platform::Now(); + const double mb = static_cast(num) * sizeof(raw[0]) * 1E-6; + const double mbps = mb / (t1 - t0); + fprintf(stderr, "Compress %.1f MB/s\n", mbps); + } if (COMPRESS_STATS) { for (size_t i = 1; i < work.tls.size(); ++i) { @@ -449,53 +439,182 @@ HWY_NOINLINE void Compress(const float* in, size_t num, } } -// Compresses an entire std::array into `out`, which is assumed to have exactly -// that much capacity. -template -HWY_INLINE void Compress(const std::array& in, - CompressWorkingSet& work, - CompressedArray& compressed, - hwy::ThreadPool& pool) { - Compress(in.data(), kCapacity, work, kCapacity, compressed.data(), 0, pool); +// Adapter that compresses into `CompressedArray`. `raw` must already be scaled +// to fit the value range, if `Packed` is `SfpStream`. +template +HWY_INLINE void CompressScaled(const float* HWY_RESTRICT raw, size_t num, + CompressWorkingSet& work, + CompressedArray& compressed, + hwy::ThreadPool& pool) { + Compress(raw, num, work, MakeSpan(compressed.data(), kCapacity), + /*packed_ofs=*/0, pool); } -// Decompresses `num` values from `compressed` starting at `compressed_ofs`. -template -HWY_NOINLINE void Decompress(const ArrayT& compressed, size_t compressed_ofs, - OutT* out, size_t num) { - HWY_DASSERT(compressed_ofs + num <= compressed.size()); - const hn::ScalableTag d; - using Traits = CompressTraits; - Traits::Decompress(d, compressed.size(), compressed.data(), compressed_ofs, - out, num); +// Stores two f32 vectors to f32 or bf16; avoids duplicating RMSNorm and +// RMSNormInplace for the two output types. 
+template > +void Compress2(DF df, VF raw0, VF raw1, const PackedSpan& packed, + const size_t packed_ofs) { + static_assert(hwy::IsSameEither()); + packed.BoundsCheck(packed_ofs, 2 * hn::Lanes(df)); + using Traits = CompressTraits; + Traits::Store2(df, raw0, raw1, packed, packed_ofs); } -// As above, but with threading and benchmarking. -template -HWY_INLINE void Decompress(const CompressedArray& compressed, - size_t compressed_ofs, OutT* out, size_t num, - hwy::ThreadPool& pool) { - HWY_DASSERT(compressed_ofs + num <= compressed.size()); - const double t0 = hwy::platform::Now(); +// Decompresses from any type of `packed`, to two float or BF16 vectors. +template > +HWY_INLINE void Decompress2(DRaw d, const PackedSpan& packed, + const size_t packed_ofs, VRaw& raw0, VRaw& raw1) { + using TRaw = hn::TFromD; + static_assert(hwy::IsSameEither()); + packed.BoundsCheck(packed_ofs, 2 * hn::Lanes(d)); + using Traits = CompressTraits>; + Traits::Load2(d, MakeConst(packed), packed_ofs, raw0, raw1); +} - using Traits = CompressTraits; - constexpr size_t kBatch = 8192; - const size_t num_batches = hwy::DivCeil(num, kBatch); - pool.Run( - 0, num_batches, [&](const uint32_t idx_batch, size_t thread) HWY_ATTR { - const hn::ScalableTag d; +// Decompresses from any type of `packed`, starting at (any) `packed_ofs`, to +// (any) `num` elements in `raw`, then appends `[0, hn::Lanes(d))` zeroes as +// required to round `num` up to one vector, if it is not already. The caller is +// responsible for scaling `raw` to the original range because `EmbedToken` +// also wants to scale the decompressed elements. +template > +HWY_NOINLINE void DecompressAndZeroPad(DRaw d, const PackedSpan& packed, + const size_t packed_ofs, TRaw* raw, + size_t num) { + static_assert(hwy::IsSameEither()); + using Traits = CompressTraits>; + packed.BoundsCheck(packed_ofs, num); + Traits::DecompressAndZeroPad(d, MakeConst(packed), packed_ofs, raw, num); +} - const size_t ofs = idx_batch * kBatch; - const size_t batch = - idx_batch == num_batches - 1 ? (num - ofs) : kBatch; - Traits::Decompress(d, compressed.size(), compressed.data(), - compressed_ofs + ofs, out + ofs, batch); - }); +// Decompresses to the type specified by `D` from each of two arrays in groups +// of four vectors, passes them to `kernel.Update4`, zero-pads to a vector +// multiple, then calls `kernel.Update1` for the remaining vectors. Returns +// `kernel.Reduce`. +// +// This is useful for implementing dot products, and similar to +// `hwy/contrib/unroller`, but also supports compressed types with simpler +// remainder handling thanks to `DecompressAndZeroPad`. +// +// `w` can be any packed type, including NUQ, which requires a separate `w_ofs` +// rather than pointer arithmetic. `vec_aligned` can also be any type, but +// typically float or BF16. We omit a `v_ofs` because it is 0 in our use cases. +// `num`, the number of elements to process, need not be a vector multiple. +// +// `kernel` is const& so we can pass an rvalue argument, but can contain +// mutable state, though not vectors (see highway.h). We pass in the four +// loaded vectors plus eight *f32* state vectors, independent of `D`. 
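// A scalar analogue of the kernel contract described above: Update4 for the
// unrolled blocks, Update1 for the zero-padded tail, Reduce at the end. The
// comp* state is shown here as Kahan-Babuska (Neumaier) compensation, which is
// one natural reading of it; this is a hypothetical reference, not the kernels
// used by the dot products in ops/.

#include <cmath>

struct ScalarDotKernel {
  // One compensated multiply-accumulate, the scalar counterpart of Update1.
  void Update1(float w, float v, float& sum, float& comp) const {
    const float x = w * v;
    const float t = sum + x;
    comp += (std::fabs(sum) >= std::fabs(x)) ? (sum - t) + x : (x - t) + sum;
    sum = t;
  }
  // Four independent accumulators, the scalar counterpart of Update4.
  void Update4(const float* w, const float* v, float* sum, float* comp) const {
    for (int k = 0; k < 4; ++k) Update1(w[k], v[k], sum[k], comp[k]);
  }
  // Folds the compensation back in, the scalar counterpart of Reduce.
  float Reduce(const float* sum, const float* comp) const {
    float total = 0.0f;
    for (int k = 0; k < 4; ++k) total += sum[k] + comp[k];
    return total;
  }
};
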
+template +HWY_INLINE float DecompressAndCall(D d, const PackedSpan& w, + const size_t w_ofs, + const VecT* HWY_RESTRICT vec_aligned, + const size_t num, const Kernel& kernel) { + PROFILER_FUNC; - const double t1 = hwy::platform::Now(); - const double mb = num * sizeof(MatT) * 1E-6; - const double mbps = mb / (t1 - t0); - fprintf(stderr, "Decompress %.1f MB/s\n", mbps); + HWY_DASSERT(hn::IsAligned(hn::Repartition(), vec_aligned)); + const auto v_span = MakeSpan(vec_aligned, num); + + // Decompressed inputs + using V = hn::Vec; + V w0, w1, w2, w3, v0, v1, v2, v3; + + // State for Kernel + const hn::Repartition df; + using VF = hn::Vec; + VF sum0 = hn::Zero(df); + VF sum1 = hn::Zero(df); + VF sum2 = hn::Zero(df); + VF sum3 = hn::Zero(df); + VF comp0 = hn::Zero(df); + VF comp1 = hn::Zero(df); + VF comp2 = hn::Zero(df); + VF comp3 = hn::Zero(df); + + const size_t N = hn::Lanes(d); + size_t i = 0; + if (num >= 4 * N) { + for (; i <= num - 4 * N; i += 4 * N) { + Decompress2(d, w, w_ofs + i + 0 * N, w0, w1); + Decompress2(d, w, w_ofs + i + 2 * N, w2, w3); + Decompress2(d, v_span, i + 0 * N, v0, v1); + Decompress2(d, v_span, i + 2 * N, v2, v3); + + kernel.Update4(d, w0, w1, w2, w3, v0, v1, v2, v3, sum0, sum1, sum2, sum3, + comp0, comp1, comp2, comp3); + } + } + + size_t remaining = num - i; + HWY_DASSERT(remaining < 4 * N); + if (HWY_UNLIKELY(remaining != 0)) { + using T = hn::TFromD; + HWY_ALIGN T padded_w[4 * hn::MaxLanes(d)]; + HWY_ALIGN T padded_v[4 * hn::MaxLanes(d)]; + DecompressAndZeroPad(d, w, w_ofs + i, padded_w, remaining); + DecompressAndZeroPad(d, v_span, i, padded_v, remaining); + + // 1..4 whole vectors, possibly zero-padded. + for (size_t padded_pos = 0; padded_pos < remaining; padded_pos += N) { + const V w0 = hn::Load(d, padded_w + padded_pos); + const V v0 = hn::Load(d, padded_v + padded_pos); + kernel.Update1(d, w0, v0, sum0, comp0); + } + } + + return kernel.Reduce(df, sum0, sum1, sum2, sum3, comp0, comp1, comp2, comp3); +} + +// Same as above, but single input array. Used by RMSNorm. +template +HWY_INLINE float DecompressAndCall(D d, const VecT* HWY_RESTRICT vec_aligned, + const size_t num, const Kernel& kernel) { + PROFILER_FUNC; + + HWY_DASSERT(hn::IsAligned(hn::Repartition(), vec_aligned)); + const auto v_span = MakeSpan(vec_aligned, num); + + // Decompressed inputs + using V = hn::Vec; + V v0, v1, v2, v3; + + // State for Kernel + const hn::Repartition df; + using VF = hn::Vec; + VF sum0 = hn::Zero(d); + VF sum1 = hn::Zero(d); + VF sum2 = hn::Zero(d); + VF sum3 = hn::Zero(d); + VF comp0 = hn::Zero(d); + VF comp1 = hn::Zero(d); + VF comp2 = hn::Zero(d); + VF comp3 = hn::Zero(d); + + const size_t N = hn::Lanes(d); + size_t i = 0; + if (num >= 4 * N) { + for (; i <= num - 4 * N; i += 4 * N) { + Decompress2(d, v_span, i + 0 * N, v0, v1); + Decompress2(d, v_span, i + 2 * N, v2, v3); + + kernel.Update4(d, v0, v1, v2, v3, v0, v1, v2, v3, sum0, sum1, sum2, sum3, + comp0, comp1, comp2, comp3); + } + } + + size_t remaining = num - i; + HWY_DASSERT(remaining < 4 * N); + if (HWY_UNLIKELY(remaining != 0)) { + HWY_ALIGN float padded_v[4 * hn::MaxLanes(d)]; + DecompressAndZeroPad(d, v_span, i, padded_v, remaining); + + // 1..4 whole vectors, possibly zero-padded. 
+ for (size_t padded_pos = 0; padded_pos < remaining; padded_pos += N) { + const VF v0 = hn::Load(d, padded_v + padded_pos); + kernel.Update1(d, v0, v0, sum0, comp0); + } + } + + return kernel.Reduce(d, sum0, sum1, sum2, sum3, comp0, comp1, comp2, comp3); } // Functor called for each tensor, which compresses and stores them along with @@ -504,21 +623,22 @@ class Compressor { public: explicit Compressor(hwy::ThreadPool& pool) : pool_(pool) {} - template + template void operator()(const char* name, const float* weights, - CompressedArray& compressed) { + CompressedArray& compressed) { Insert(name, weights, kCapacity, work_, compressed.CompressedSize(), compressed.data(), 0, pool_); } - template + template void Insert(const char* name, const float* weights, size_t weights_count, - CompressWorkingSet& work, size_t out_capacity, MatT* out, - size_t out_ofs, hwy::ThreadPool& pool) { + CompressWorkingSet& work, size_t out_capacity, Packed* packed, + size_t packed_ofs, hwy::ThreadPool& pool) { fprintf(stderr, "Regenerating %s (%zuM), please wait\n", name, weights_count / (1000 * 1000)); - Compress(weights, weights_count, work_, weights_count, out, 0, pool_); - writer_.Add(CacheKey(name), out, out_capacity); + Compress(weights, weights_count, work_, + PackedSpan{packed, weights_count}, 0, pool_); + writer_.Add(CacheKey(name), packed, out_capacity); } void AddScales(const float* scales, size_t len) { diff --git a/compression/compress.h b/compression/compress.h index 2dd77dd..cfd512f 100644 --- a/compression/compress.h +++ b/compression/compress.h @@ -29,7 +29,6 @@ // IWYU pragma: begin_exports #include "compression/blob_store.h" #include "compression/io.h" -#include "compression/nuq.h" #include "compression/shared.h" // IWYU pragma: end_exports #include "compression/distortion.h" @@ -41,22 +40,6 @@ namespace gcpp { -static inline const char* TypeName(float) { return "f32"; } -static inline const char* TypeName(BF16) { return "b16"; } -static inline const char* TypeName(SfpStream) { return "sfp"; } -static inline const char* TypeName(NuqStream) { return "nuq"; } - -// Returns the number of `MatT` elements required to store `capacity` values, -// which must not be zero. -template -constexpr size_t CompressedArrayElements(size_t capacity) { - if constexpr (hwy::IsSame, NuqStream>()) { - return NuqStream::PackedEnd(capacity); - } else { - return capacity; - } -} - // Compressed representation of floating-point elements. The array length may // differ from the number of elements. Associated operations such as Dot are // implemented in SIMD code and are thus non-member functions. @@ -152,8 +135,8 @@ struct CompressStats { #endif // COMPRESS_STATS struct CompressPerThread { + NuqStream::ClusterBuf buf; CompressStats stats; - ClusterBuf buf; }; struct CompressWorkingSet { diff --git a/compression/compress_test.cc b/compression/compress_test.cc index 00e678c..52883d4 100644 --- a/compression/compress_test.cc +++ b/compression/compress_test.cc @@ -12,3 +12,198 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + +// SFP uses ConcatEven/Odd which are not supported; skip SVE for faster tests. 
+#ifndef HWY_DISABLED_TARGETS +#define HWY_DISABLED_TARGETS (HWY_SCALAR | HWY_SVE) +#endif + +#include "compression/compress.h" + +#include +#include + +#include "compression/distortion.h" +#include "util/test_util.h" +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" +#include "hwy/contrib/thread_pool/thread_pool.h" +#include "hwy/tests/hwy_gtest.h" +// clang-format off +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "compression/compress_test.cc" // NOLINT +// clang-format on +#include "hwy/foreach_target.h" // IWYU pragma: keep +#include "hwy/highway.h" +// After highway.h +#include "compression/test_util-inl.h" + +HWY_BEFORE_NAMESPACE(); +namespace gcpp { +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; + +// Calls Compress and Decompress2 and verifies the distortion/error. +template +struct TestDecompress2T { + template + HWY_INLINE void operator()(T /*unused*/, D d) { + const size_t N = hn::Lanes(d); + CompressWorkingSet work; + hwy::ThreadPool pool(0); + hwy::RandomState rng; + + const size_t num = 2 * N; + const size_t packed_num = CompressedArrayElements(num); + auto raw = hwy::AllocateAligned(num); // Compress requires f32 + auto packed = hwy::AllocateAligned(packed_num); + auto dec = hwy::AllocateAligned(num); + HWY_ASSERT(raw && packed && dec); + const auto packed_span = MakeSpan(packed.get(), packed_num); + + hwy::Stats in_stats; + for (size_t i = 0; i < num; ++i) { + raw[i] = static_cast(RandomGaussian(rng)); + in_stats.Notify(raw[i]); + } + // Short inputs fail VerifyGaussian. + + const size_t packed_ofs = 0; + Compress(raw.get(), num, work, packed_span, packed_ofs, pool); + hn::Vec raw0, raw1; + Decompress2(d, MakeConst(packed_span), packed_ofs, raw0, raw1); + hn::Store(raw0, d, dec.get()); + hn::Store(raw1, d, dec.get() + N); + + DistortionStats stats; + for (size_t i = 0; i < num; ++i) { + stats.Notify(raw[i], hwy::ConvertScalarTo(dec[i])); + } + + if constexpr (false) { + fprintf(stderr, "%s %s: %zu: %f %f %f %f\n", TypeName(), + TypeName(), num, stats.SumL1(), stats.GeomeanValueDivL1(), + stats.WeightedAverageL1(), stats.L1().Max()); + } + + constexpr bool kFromFloat = hwy::IsSame(); + constexpr bool kToFloat = hwy::IsSame(); + if constexpr (kFromFloat && kToFloat) { // Lossless + HWY_ASSERT(stats.NumExact() == num); + HWY_ASSERT(stats.SumL1() == 0.0f); + HWY_ASSERT(stats.L1().Max() == 0.0f); + } else if constexpr (hwy::IsSame() || + (kFromFloat && hwy::IsSame())) { + // Small roundoff error. BF16 to float is not lossless because the + // comparison is with float `raw`, prior to the Compress to BF16. 
+ HWY_ASSERT(stats.L1().Max() <= 2E-3f); + HWY_ASSERT(IsInside(3E-4, 2E-3, stats.WeightedAverageL1())); + HWY_ASSERT(IsInside(600.0, 900.0, stats.GeomeanValueDivL1())); + } else if constexpr (hwy::IsSame()) { + HWY_ASSERT(stats.SumL1() <= 0.4f); + HWY_ASSERT(stats.L1().Max() <= 0.04f); + HWY_ASSERT(IsInside(0.01, 0.03, stats.WeightedAverageL1())); + HWY_ASSERT(IsInside(48.0, 72.0, stats.GeomeanValueDivL1())); + } else if constexpr (hwy::IsSame()) { + static_assert(NuqStream::kGroupSize == 256, "Update expected"); + HWY_ASSERT(stats.SumL1() <= 1.2f); + HWY_ASSERT(stats.L1().Max() <= 0.08f); + HWY_ASSERT(IsInside(0.02, 0.05, stats.WeightedAverageL1())); + HWY_ASSERT(IsInside(18.0, 62.0, stats.GeomeanValueDivL1())); + } else { + HWY_ABORT("Unhandled type requested by ForeachPackedAndRawType"); + } + } +}; + +void TestAllDecompress2() { ForeachPackedAndRawType(); } + +// Calls Compress and DecompressAndZeroPad for all short lengths and verifies +// the distortion/error. +template +struct TestShortLengthsT { + template + HWY_INLINE void operator()(T /*unused*/, D d) { + const size_t N = hn::Lanes(d); + CompressWorkingSet work; + hwy::ThreadPool pool(0); + hwy::RandomState rng; + + for (size_t num = 1; num < 5 * hn::Lanes(d); ++num) { + const size_t packed_num = CompressedArrayElements(num); + + auto raw = hwy::AllocateAligned(num); // Compress requires f32 + auto packed = hwy::AllocateAligned(packed_num); + auto dec = hwy::AllocateAligned(hwy::RoundUpTo(num, N)); + HWY_ASSERT(raw && packed && dec); + const auto packed_span = MakeSpan(packed.get(), packed_num); + + hwy::Stats in_stats; + for (size_t i = 0; i < num; ++i) { + raw[i] = static_cast(RandomGaussian(rng)); + in_stats.Notify(raw[i]); + } + // Short inputs fail VerifyGaussian. + + const size_t packed_ofs = 0; + Compress(raw.get(), num, work, packed_span, packed_ofs, pool); + DecompressAndZeroPad(d, MakeConst(packed_span), packed_ofs, dec.get(), + num); + + DistortionStats stats; + for (size_t i = 0; i < num; ++i) { + stats.Notify(raw[i], hwy::ConvertScalarTo(dec[i])); + } + + if constexpr (false) { + fprintf(stderr, "%s %s: %zu: %f %f %f %f\n", TypeName(), + TypeName(), num, stats.SumL1(), stats.GeomeanValueDivL1(), + stats.WeightedAverageL1(), stats.L1().Max()); + } + + constexpr bool kFromFloat = hwy::IsSame(); + constexpr bool kToFloat = hwy::IsSame(); + if constexpr (kFromFloat && kToFloat) { // Lossless + HWY_ASSERT(stats.NumExact() == num); + HWY_ASSERT(stats.SumL1() == 0.0f); + HWY_ASSERT(stats.L1().Max() == 0.0f); + } else if (hwy::IsSame() || + (kFromFloat && hwy::IsSame())) { + // Small roundoff error. BF16 to float is not lossless because the + // comparison is with float `raw`, prior to the Compress to BF16. 
+ HWY_ASSERT(stats.L1().Max() <= 4E-3f); + HWY_ASSERT(IsInside(1E-5, 3E-3, stats.WeightedAverageL1())); + HWY_ASSERT(IsInside(300.0, 2200.0, stats.GeomeanValueDivL1())); + } else if (hwy::IsSame()) { + HWY_ASSERT(stats.SumL1() <= 1.3f); + HWY_ASSERT(stats.L1().Max() <= 0.08f); + HWY_ASSERT(IsInside(7E-5, 0.05, stats.WeightedAverageL1())); + HWY_ASSERT(IsInside(28.0, 200.0, stats.GeomeanValueDivL1())); + } else if (hwy::IsSame()) { + static_assert(NuqStream::kGroupSize == 256, "Update expected"); + HWY_ASSERT(stats.SumL1() <= 4.6f); + HWY_ASSERT(stats.L1().Max() <= 0.14f); + HWY_ASSERT(IsInside(7E-5, 0.06, stats.WeightedAverageL1())); + HWY_ASSERT(IsInside(11.0, 180.0, stats.GeomeanValueDivL1())); + } else { + HWY_ABORT("Unhandled type requested by ForeachPackedAndRawType"); + } + } + } +}; + +void TestAllShortLengths() { ForeachPackedAndRawType(); } + +// NOLINTNEXTLINE(google-readability-namespace-comments) +} // namespace HWY_NAMESPACE +} // namespace gcpp +HWY_AFTER_NAMESPACE(); + +#if HWY_ONCE +namespace gcpp { +HWY_BEFORE_TEST(CompressTest); +HWY_EXPORT_AND_TEST_P(CompressTest, TestAllDecompress2); +HWY_EXPORT_AND_TEST_P(CompressTest, TestAllShortLengths); +HWY_AFTER_TEST(); +} // namespace gcpp +#endif // HWY_ONCE diff --git a/compression/compress_weights.cc b/compression/compress_weights.cc index 3106db9..a36b35c 100644 --- a/compression/compress_weights.cc +++ b/compression/compress_weights.cc @@ -54,21 +54,6 @@ namespace gcpp { constexpr bool kDryRunFread = false; namespace { -float ScaleWeights(float* data, size_t len) { - float maxabs = 0.0; - for (size_t i = 0; i < len; ++i) { - maxabs = std::max(maxabs, std::abs(data[i])); - } - if (maxabs <= kMaxSFP) { - return 1.0f; - } - const float scale = maxabs / kMaxSFP; - const float inv_scale = 1.0f / scale; - for (size_t i = 0; i < len; ++i) { - data[i] *= inv_scale; - } - return scale; -} #define READ_WEIGHTS(name) \ do { \ diff --git a/compression/distortion_test.cc b/compression/distortion_test.cc index 00e026a..9350b5b 100644 --- a/compression/distortion_test.cc +++ b/compression/distortion_test.cc @@ -17,7 +17,7 @@ #include -#include "compression/shared.h" +#include "compression/shared.h" // SfpStream::kMax #include "util/test_util.h" #include "hwy/nanobenchmark.h" #include "hwy/tests/hwy_gtest.h" @@ -75,13 +75,15 @@ TEST(DistortionTest, TestDilution) { HWY_ASSERT(IsNear(0.001, stats.WeightedAverageL1())); // Now add a large difference: - stats.Notify(kMaxSFP - 0.0625f, kMaxSFP); // max magnitude, 3-bit mantissa + stats.Notify(SfpStream::kMax - 0.0625f, + SfpStream::kMax); // max magnitude, 3-bit mantissa // .. WeightedAverageL1 is closer to it. HWY_ASSERT(IsInside(0.020, 0.025, stats.WeightedAverageL1())); // Add a small and large difference: stats.Notify((1.75f - 0.125f) / 1024, 1.75f / 1024); // small, 2-bit mantissa - stats.Notify(-kMaxSFP + 0.0625f, -kMaxSFP); // larger negative + stats.Notify(-SfpStream::kMax + 0.0625f, + -SfpStream::kMax); // larger negative // .. SNR is still barely affected. HWY_ASSERT(IsInside(890.0, 900.0, stats.GeomeanValueDivL1())); // .. WeightedAverageL1 is higher after another large error. diff --git a/compression/nuq-inl.h b/compression/nuq-inl.h index 11e9204..f8fa467 100644 --- a/compression/nuq-inl.h +++ b/compression/nuq-inl.h @@ -19,11 +19,15 @@ #include #include +#include -#include "compression/nuq.h" #include "compression/shared.h" #include "hwy/base.h" +#if HWY_IS_MSAN +#include +#endif + #endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_INL_H_ // Actual per-target include guard. 
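// Where the "4.5 bits per element" noted for NuqStream earlier in this patch
// comes from, under the layout this file implies: per 256-element group, a
// table of cluster centers encoded as 1-byte SFP values plus a 4-bit index per
// value. The constants below only restate that arithmetic and are illustrative
// (the kSketch* names are not part of the patch).

#include <cstddef>

constexpr size_t kSketchClusters = 16;                      // 4-bit indices
constexpr size_t kSketchGroupSize = 256;                    // NuqStream::kGroupSize
constexpr size_t kSketchTableBytes = kSketchClusters;       // 1-byte SFP centers
constexpr size_t kSketchIndexBytes = kSketchGroupSize / 2;  // two nibbles per byte
constexpr double kSketchBitsPerValue =
    (kSketchTableBytes + kSketchIndexBytes) * 8.0 / kSketchGroupSize;  // 4.5
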
@@ -40,17 +44,24 @@ #include "compression/sfp-inl.h" #include "hwy/contrib/sort/vqsort-inl.h" -#ifndef HWY_IF_CONSTEXPR -#define HWY_IF_CONSTEXPR if -#endif - HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; +static inline void MaybeCheckInitialized(const void* ptr, size_t size) { +#if HWY_IS_MSAN + __msan_check_mem_is_initialized(ptr, size); +#else + (void)ptr; + (void)size; +#endif +} + // For internal use by NuqCodec. class NuqClustering { + static constexpr size_t kGroupSize = NuqStream::kGroupSize; + // To go from sorted order back to the original order in O(1), we store the // original index in the lower bits of the float32 mantissa, which means they // are sorted alongside the value. @@ -88,11 +99,13 @@ class NuqClustering { explicit ClusterCost(const float* HWY_RESTRICT sorted) { double cumsum = 0.0; double cumsum2 = 0.0; - cumsum_[0] = cumsum2_[0] = 0.0; + dcumsum_[0] = 0.0; + cumsum_[0] = cumsum2_[0] = 0.0f; for (size_t i = 0; i < kGroupSize; ++i) { const float x = FloatPayload::Clear(sorted[i]); cumsum += x; cumsum2 += static_cast(x) * x; + dcumsum_[1 + i] = cumsum; cumsum_[1 + i] = static_cast(cumsum); cumsum2_[1 + i] = static_cast(cumsum2); } @@ -132,8 +145,10 @@ class NuqClustering { } // Returns cost (L2 norm) for a single cluster, used for backtracking. - float SumOfSorted(size_t first, size_t last) const { - return cumsum_[last + 1] - cumsum_[first]; + double SumOfSorted(size_t first, size_t last) const { + HWY_DASSERT(first < kGroupSize); + HWY_DASSERT(last < kGroupSize); + return dcumsum_[last + 1] - dcumsum_[first]; } // Returns vector of costs of clustering first..last + i with their means. @@ -199,6 +214,8 @@ class NuqClustering { float cumsum2_[kGroupSize + 1 + kMaxLanes]; float len_[kMaxLanes + kGroupSize + 1 + kMaxLanes]; // = vlen[i] float inv_len_[kMaxLanes + kGroupSize + 1 + kMaxLanes]; // = 1 / vlen[i] + + double dcumsum_[kGroupSize + 1]; // for SumOfSorted }; // Dynamic programming step: returns costs of clustering 0..last+i, where the @@ -206,18 +223,17 @@ class NuqClustering { // `first`, and `last`; vectorized across `last`. `first` may be greater than // `last`. `valid[i]` is `first <= last + i`. template , class MF = hn::Mask> - static HWY_INLINE VF ClusterDynProg(DF df, const AlignedMatrix& D, - const ClusterCost& cc, - const size_t idx_cluster, - const size_t first, const size_t last, - const MF valid) { + static HWY_INLINE VF + ClusterDynProg(DF df, const NuqStream::AlignedMatrix& costs, + const ClusterCost& cc, const size_t idx_cluster, + const size_t first, const size_t last, const MF valid) { HWY_DASSERT(idx_cluster != 0); HWY_DASSERT(0 != first && first < kGroupSize); HWY_DASSERT(last < kGroupSize); HWY_DASSERT(last % hn::Lanes(df) == 0); // Called in steps of N // Cost of clustering 0..first-1 with one fewer cluster than now. - const VF prev = hn::Set(df, D(idx_cluster - 1, first - 1)); + const VF prev = hn::Set(df, costs(idx_cluster - 1, first - 1)); // Eq2: add to that the cost of another cluster from first..last. return hn::Add(prev, cc.SumCosts(df, first, last, valid)); } @@ -237,7 +253,8 @@ class NuqClustering { // as implemented in FAISS, for our kGroupSize of 256. 
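// A plain scalar rendering of the exact 1-D clustering that ClusterExactL2
// vectorizes below: prefix sums give each candidate cluster's L2 cost in O(1),
// and the dynamic program chooses the best start of the rightmost cluster for
// every (cluster count, end) pair, then backtracks. Hypothetical reference
// code assuming already-sorted input; not the SIMD implementation.

#include <cstddef>
#include <vector>

static void ClusterExactL2Scalar(const std::vector<float>& sorted,
                                 size_t num_clusters,
                                 std::vector<float>& centers) {
  const size_t n = sorted.size();
  std::vector<double> sum(n + 1, 0.0), sum2(n + 1, 0.0);
  for (size_t i = 0; i < n; ++i) {
    sum[i + 1] = sum[i] + sorted[i];
    sum2[i + 1] = sum2[i] + double(sorted[i]) * sorted[i];
  }
  // L2 cost of one cluster covering sorted[first..last], via prefix sums.
  const auto cost = [&](size_t first, size_t last) {
    const double s = sum[last + 1] - sum[first];
    const double s2 = sum2[last + 1] - sum2[first];
    return s2 - s * s / double(last - first + 1);
  };
  // costs[c][m]: best cost of clustering sorted[0..m] into c+1 clusters.
  // argmin[c][m]: start index of the rightmost cluster in that solution.
  std::vector<std::vector<double>> costs(num_clusters, std::vector<double>(n));
  std::vector<std::vector<size_t>> argmin(num_clusters,
                                          std::vector<size_t>(n, 0));
  for (size_t m = 0; m < n; ++m) costs[0][m] = cost(0, m);
  for (size_t c = 1; c < num_clusters; ++c) {
    for (size_t m = 0; m < n; ++m) {
      costs[c][m] = costs[c - 1][m];  // allow leaving clusters unused
      argmin[c][m] = argmin[c - 1][m];
      for (size_t first = 1; first <= m; ++first) {
        const double d = costs[c - 1][first - 1] + cost(first, m);
        if (d < costs[c][m]) {
          costs[c][m] = d;
          argmin[c][m] = first;
        }
      }
    }
  }
  // Backtrack: each cluster is [argmin[c][last], last]; its center is the mean.
  centers.assign(num_clusters, 0.0f);
  size_t last = n - 1;
  for (size_t c = num_clusters; c-- > 0;) {
    const size_t first = argmin[c][last];
    centers[c] =
        float((sum[last + 1] - sum[first]) / double(last - first + 1));
    if (first == 0) break;  // remaining (lower) clusters are unused
    last = first - 1;
  }
}
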
template static HWY_NOINLINE size_t ClusterExactL2(DF df, const float* HWY_RESTRICT x, - size_t num, ClusterBuf& buf, + size_t num, + NuqStream::ClusterBuf& buf, float* HWY_RESTRICT centers, uint16_t* HWY_RESTRICT indices) { HWY_DASSERT(num <= kGroupSize); @@ -268,31 +285,34 @@ class NuqClustering { ClusterCost cc(sorted_and_i); // ignores payload bits. // Reference: https://arxiv.org/abs/1701.07204 - // D[k-1][m] is the lowest cost of clustering x1..m into k clusters. - AlignedMatrix& D = buf.d; - // T[k][m] is the starting index within sorted_and_i[] of the k-th cluster. - AlignedMatrix& T = buf.t; + // costs[k-1][m] is the lowest cost of clustering x1..m into k clusters. + NuqStream::AlignedMatrix& costs = buf.costs; + // argmin[k][m] is the starting index within sorted_and_i[] of the k-th + // cluster. + NuqStream::AlignedMatrix& argmin = buf.argmin; - // Fill first row of `D` and `T`: single cluster, iterate over all `last`. + // Fill first row of `costs` and `argmin`: single cluster, iterate over all + // `last`. { const size_t cluster_idx = 0; const size_t first = 0; const VI vfirst = hn::Set(di, static_cast(first)); const MF all_valid = hn::FirstN(df, N); // first <= last is always true for (size_t last = 0; last < kGroupSize; last += N) { - const VF costs = cc.SumCosts(df, first, last, all_valid); - hn::Store(costs, df, &D(cluster_idx, last)); - hn::Store(vfirst, di, &T(cluster_idx, last)); + const VF vcosts = cc.SumCosts(df, first, last, all_valid); + hn::Store(vcosts, df, &costs(cluster_idx, last)); + hn::Store(vfirst, di, &argmin(cluster_idx, last)); } } + constexpr size_t kClusters = NuqStream::kClusters; for (size_t cluster_idx = 1; cluster_idx < kClusters; ++cluster_idx) { // For vectors of `last + i` with `i < N`: for (size_t last = 0; last < kGroupSize; last += N) { const VI vlast = hn::Iota(di, static_cast(last)); - const VF prev_cost = hn::LoadU(df, &D(cluster_idx - 1, last)); + const VF prev_cost = hn::LoadU(df, &costs(cluster_idx - 1, last)); VF min = prev_cost; - VI arg = hn::LoadU(di, &T(cluster_idx - 1, last)); + VI arg = hn::LoadU(di, &argmin(cluster_idx - 1, last)); // For each `first` (j), which is the start of the rightmost of at least // two clusters, hence never zero. `first` also continues past `last` // because the last `vlast` lane is `last + N - 1`. @@ -300,7 +320,7 @@ class NuqClustering { const VI vfirst = hn::Set(di, static_cast(first)); const MF valid = hn::RebindMask(df, hn::Le(vfirst, vlast)); const VF c = - ClusterDynProg(df, D, cc, cluster_idx, first, last, valid); + ClusterDynProg(df, costs, cc, cluster_idx, first, last, valid); // Retain the min cost and the `first` that caused it. const MF less = hn::And(valid, hn::Lt(c, min)); @@ -309,21 +329,21 @@ class NuqClustering { } HWY_DASSERT(hn::AllTrue(df, hn::Le(min, prev_cost))); - hn::Store(min, df, &D(cluster_idx, last)); - hn::Store(arg, di, &T(cluster_idx, last)); + hn::Store(min, df, &costs(cluster_idx, last)); + hn::Store(arg, di, &argmin(cluster_idx, last)); } } - // Backtrack to find centers. Clusters are [T(k, last), last]. + // Backtrack to find centers. Clusters are [argmin(k, last), last]. size_t last = kGroupSize - 1; size_t unused_clusters = 0; for (size_t k = kClusters - 1; k < kClusters; --k) { - const size_t start = static_cast(T(k, last)); + const size_t start = static_cast(argmin(k, last)); // Center = mean, O(1) thanks to cumulative sums. 
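// The switch to a double SumOfSorted (backed by dcumsum_, earlier in this
// hunk) feeds the center computation just below. Presumably it matters because
// the mean is a difference of prefix sums: once a float prefix sum grows
// large, small addends are rounded away entirely. A tiny standalone
// illustration with hypothetical values, not taken from this patch:

#include <cstdio>

int main() {
  float fsum = 16777216.0f;   // 2^24: the float spacing here is 2.0
  double dsum = 16777216.0;
  fsum += 0.25f;              // rounded away; fsum is still 2^24
  dsum += 0.25;               // retained
  std::printf("float keeps %g, double keeps %g\n",
              fsum - 16777216.0f, dsum - 16777216.0);  // 0 vs 0.25
  return 0;
}
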
- const float sum = cc.SumOfSorted(start, last); + const double sum = cc.SumOfSorted(start, last); const int size = static_cast(last) - static_cast(start) + 1; HWY_DASSERT(0 < size && size <= static_cast(kGroupSize)); - centers[k] = sum / static_cast(size); + centers[k] = static_cast(sum / size); // We know the range inside sorted_and_i[]; translate to original indices, // which are stored inside each of the sorted_and_i mantissas. @@ -347,15 +367,34 @@ class NuqClustering { } if (HWY_IS_DEBUG_BUILD) { - // Centers are in ascending order. + // If centers are not in ascending order, print them. for (size_t i = unused_clusters + 1; i < kClusters; ++i) { - HWY_DASSERT(centers[i] >= centers[i - 1]); + if (centers[i] < centers[i - 1]) { + for (size_t i = 0; i < kClusters; ++i) { + fprintf(stderr, "%2zu: %.8f\n", i, centers[i]); + } + for (size_t i = 0; i < kGroupSize; ++i) { + fprintf(stderr, "%3zu: %.8f\n", i, + FloatPayload::Clear(sorted_and_i[i])); + } + for (size_t i = 0; i < num; ++i) { + fprintf(stderr, "%3zu: %.8f\n", i, x[i]); + } + HWY_ABORT("Centers not in ascending order at %zu; unused %zu\n", i, + unused_clusters); + } } } + + MaybeCheckInitialized(centers, kClusters * sizeof(centers[0])); return unused_clusters; } }; // NuqClustering +// Half-vector of u8 from u16/bf16. +template +using D8HFromD16 = hn::Half>; + // Bit-packing 4-bit values is trivial if we have 2 or 4 independent vectors: // simply shift+OR them together into a full vector of 8 or 16-bit lanes. // However, the order then depends on the vector length, which is unacceptable @@ -371,15 +410,15 @@ class NuqClustering { // operations which benefit from special-casing for target and vector length. class NibbleCodec { public: - // Packs four u16 vectors' lanes to nibbles within one vector, in order, and - // stores that vector to `out`. - template > - static HWY_INLINE void OrderedPackU16(D16 d16, V16 in0, V16 in1, V16 in2, - V16 in3, uint8_t* HWY_RESTRICT out) { - const hn::Repartition d8; + // Returns a byte vector whose nibbles are the lanes of four u16 vectors, in + // the same order. + template , + class D8 = hn::Repartition, class V8 = hn::Vec> + static HWY_INLINE V8 OrderedPackU16(D16 d16, V16 in0, V16 in1, V16 in2, + V16 in3) { + const D8 d8; const hn::Repartition d32; const hn::Repartition d64; - using V8 = hn::Vec; // Pairwise compaction of a single vector so nibbles are packed in-order. // v16 lanes hold a 4-bit value; OR together adjacent pairs into the lower @@ -393,14 +432,13 @@ class NibbleCodec { const V16 u8_1 = combine_u16_pair_to_8(in1); const V16 u8_2 = combine_u16_pair_to_8(in2); const V16 u8_3 = combine_u16_pair_to_8(in3); - V8 packed; if constexpr (HWY_TARGET <= HWY_AVX3_DL || !HWY_ARCH_X86) { // 8-bit ConcatEven is efficient. Let digits denote eight u8 lanes // of u8_1/0: ?d?3 ?c?2 / ?b?1 ?a?0. 8-bit ConcatEven = d3c2 b1a0, and // again with the second x2_1 gives 7654 3210. const V8 x2_0 = hn::ConcatEven(d8, BitCast(d8, u8_1), BitCast(d8, u8_0)); const V8 x2_1 = hn::ConcatEven(d8, BitCast(d8, u8_3), BitCast(d8, u8_2)); - packed = hn::ConcatEven(d8, x2_1, x2_0); + return hn::ConcatEven(d8, x2_1, x2_0); } else { // To avoid expensive 8-bit ConcatEven, compact pairs of u32 into the // lower 16 bits in each u64, with other bits undefined. @@ -416,70 +454,23 @@ class NibbleCodec { // u16 of every u64. This is the same as above but with 16-bit Concat. 
const V16 x2_0 = hn::ConcatEven(d16, u16_1, u16_0); const V16 x2_1 = hn::ConcatEven(d16, u16_3, u16_2); - packed = hn::BitCast(d8, hn::ConcatEven(d16, x2_1, x2_0)); + return hn::BitCast(d8, hn::ConcatEven(d16, x2_1, x2_0)); } - hn::StoreU(packed, d8, out); } - // Unpacks `Lanes(d16)` nibbles to u16 lanes. The first comes from the low - // nibble of packed[0], then its high nibble, then the next low nibble, etc. - template > - static HWY_INLINE V16 OrderedUnpackU16(D16 d16, const uint8_t* packed) { - const hn::Repartition d8; + // Unpacks nibbles from the `kHalf` (0 or 1) half of a half-vector of bytes. + // Thus we use a quarter of a vector of bytes and expand nibbles 4x into u16, + // which fills a whole vector. Its first lane comes from the low nibble of the + // first byte, the second from its high nibble, then the next low nibble, etc. + template , + class D8H = D8HFromD16, class V8H = hn::Vec> + static HWY_INLINE V16 OrderedUnpackU16(D16 d16, const V8H packed) { + const hn::Twice d8; // full vector using V8 = hn::Vec; - const hn::CappedTag d_load; - // We replicate each byte 4x, so that its two nibbles propagate to both - // u16 lanes that they will initialize. The only performance-portable op to - // replicate bytes is TableLookupBytes, which shuffles 128-bit blocks - // independently. Thus each block receives 4 packed bytes, replicates them - // 4x, shifts/masks, and casts to 8 u16 lanes. - // - // Loading 16 bytes via LoadDup128 only works on AVX3; for smaller vectors, - // it may trigger asan errors from overrunning the end. We thus special-case - // vector lengths, handling any non-constexpr, and constexpr <= 512 bit. - V8 rep4; - if constexpr (HWY_HAVE_SCALABLE) { - // Non constexpr length: 4 per whole block equals size/4. - const size_t num_bytes = HWY_MAX(1, hn::Lanes(d8) / 4); - const V8 bytes = hn::LoadN(d8, packed, num_bytes); - // Replicate bytes 4x: lowest 4 = 0, next 4 = 1 etc. - const V8 idx = hn::ShiftRight<2>(hn::Iota(d8, 0)); - rep4 = hn::TableLookupLanes(bytes, hn::IndicesFromVec(d8, idx)); - } else if (hn::MaxLanes(d16) <= 8) { // <= 128-bit - const V8 bytes = hn::ResizeBitCast(d8, hn::LoadU(d_load, packed)); - alignas(16) static constexpr uint8_t kRep4[16] = { - HWY_REP4(0), HWY_REP4(1), HWY_REP4(2), HWY_REP4(3)}; - rep4 = hn::TableLookupBytes(bytes, hn::Load(d8, kRep4)); - } else if (HWY_TARGET <= HWY_AVX3_DL || !HWY_ARCH_X86) { - // Plain load, can do 256..512-bit permute across blocks. - const V8 bytes = hn::ResizeBitCast(d8, hn::LoadU(d_load, packed)); - alignas(64) static constexpr uint8_t kRep4[64] = { - HWY_REP4(0), HWY_REP4(1), HWY_REP4(2), HWY_REP4(3), - HWY_REP4(4), HWY_REP4(5), HWY_REP4(6), HWY_REP4(7), - HWY_REP4(8), HWY_REP4(9), HWY_REP4(10), HWY_REP4(11), - HWY_REP4(12), HWY_REP4(13), HWY_REP4(14), HWY_REP4(15)}; - rep4 = hn::TableLookupLanes(bytes, hn::SetTableIndices(d8, kRep4)); - } else if (hn::MaxLanes(d16) == 16) { // 256-bit - const V8 bytes = hn::ResizeBitCast(d8, hn::LoadU(d_load, packed)); - // First copy to upper block for TableLookupBytes. This is slightly - // faster than 64-bit BroadcastLane. 
- const V8 bcast = hn::ConcatLowerLower(d8, bytes, bytes); - alignas(32) static constexpr uint8_t kRep4[32] = { - HWY_REP4(0), HWY_REP4(1), HWY_REP4(2), HWY_REP4(3), - HWY_REP4(4), HWY_REP4(5), HWY_REP4(6), HWY_REP4(7)}; - rep4 = hn::TableLookupBytes(bcast, hn::Load(d8, kRep4)); - } else if (hn::MaxLanes(d16) == 32) { // 512-bit - const V8 bytes = hn::LoadDup128(d8, packed); - alignas(64) static constexpr uint8_t kRep4[64] = { - HWY_REP4(0), HWY_REP4(1), HWY_REP4(2), HWY_REP4(3), - HWY_REP4(4), HWY_REP4(5), HWY_REP4(6), HWY_REP4(7), - HWY_REP4(8), HWY_REP4(9), HWY_REP4(10), HWY_REP4(11), - HWY_REP4(12), HWY_REP4(13), HWY_REP4(14), HWY_REP4(15)}; - rep4 = hn::TableLookupBytes(bytes, hn::Load(d8, kRep4)); - } else { - HWY_DASSERT(false); - } + // Replicate each byte 4x, so that its two nibbles propagate to both u16 + // lanes that they will initialize. + const V8 rep4 = Replicate4x(d8, hn::ResizeBitCast(d8, packed)); const V16 mask4 = hn::Set(d16, 0xF); const V16 u16 = BitCast(d16, rep4); @@ -490,10 +481,60 @@ class NibbleCodec { // zz z3 zz z2 | zz z1 zz z0 And (unpacked result) return hn::And(mask4, hn::OddEven(hn::ShiftRight<4>(u16), u16)); } + + private: + // Returns `bytes[0 + kHalf * N/2]` in lanes 0..3, `bytes[1 + kHalf * N/2]` in + // lanes 4..7, etc. We fuse `kHalf` into the tables, which avoids the caller + // having to pass in `UpperHalf(bytes)`. + template > + static HWY_INLINE V8 Replicate4x(D8 d8, V8 bytes) { + static_assert(kHalf <= 1); + const size_t N = hn::Lanes(d8); + constexpr size_t kMaxN = hn::MaxLanes(d8); + // For kHalf=1 and 512-bit vectors, kAdd would be 16, which is out of + // bounds for TableLookupBytes. We instead BroadcastBlock<1> there. + constexpr uint8_t kAdd = kMaxN < 64 ? kHalf * kMaxN / 4 : 0; + // The only performance-portable op to replicate bytes is TableLookupBytes, + // but this only works if vectors are 128-bit or we first BroadcastBlock, + // which only works for <= 512-bit vectors. For scalable vectors, we + // instead synthesize this table via Iota+ShiftRight. + alignas(64) static constexpr uint8_t kRep4[64] = { + HWY_REP4(kAdd + 0), HWY_REP4(kAdd + 1), HWY_REP4(kAdd + 2), + HWY_REP4(kAdd + 3), HWY_REP4(kAdd + 4), HWY_REP4(kAdd + 5), + HWY_REP4(kAdd + 6), HWY_REP4(kAdd + 7), HWY_REP4(kAdd + 8), + HWY_REP4(kAdd + 9), HWY_REP4(kAdd + 10), HWY_REP4(kAdd + 11), + HWY_REP4(kAdd + 12), HWY_REP4(kAdd + 13), HWY_REP4(kAdd + 14), + HWY_REP4(kAdd + 15)}; + + if constexpr (HWY_HAVE_SCALABLE) { + // Replicate bytes 4x: lowest 4 = 0, next 4 = 1 etc. This works for up to + // 1024-bit vectors: Iota is [128, 256), and [32, 64) after shifting. + // For larger vectors, this would overflow and we should instead add kAdd. + HWY_DASSERT(N <= 128); + const V8 iota = hn::Iota(d8, static_cast(kHalf * N)); + const V8 idx = hn::ShiftRight<2>(iota); + return hn::TableLookupLanes(bytes, hn::IndicesFromVec(d8, idx)); + } else if constexpr (kMaxN <= 16) { // <= 128-bit + // No BroadcastBlock, we anyway only have one block. + return hn::TableLookupBytes(bytes, hn::Load(d8, kRep4)); + } else if constexpr (HWY_TARGET <= HWY_AVX3_DL || !HWY_ARCH_X86) { + // No BroadcastBlock, can directly permute across blocks. + return hn::TableLookupLanes(bytes, hn::SetTableIndices(d8, kRep4)); + } else { // 256..512-bit, no efficient TableLookupLanes + static_assert(kMaxN <= 64); // Else BroadcastBlock does not work. + // See kAdd comment above. + constexpr size_t kBlock = (kMaxN == 64 && kHalf == 1) ? 
1 : 0; + bytes = hn::BroadcastBlock(bytes); + return hn::TableLookupBytes(bytes, hn::Load(d8, kRep4)); + } + } }; // Encode/decode functions. class NuqCodec { + static constexpr size_t kClusters = NuqStream::kClusters; + static constexpr size_t kGroupSize = NuqStream::kGroupSize; + // 256-bit vectors can hold 16 bf16, otherwise we require 2x128-bit. template static constexpr size_t NumTables(DU du) { @@ -508,308 +549,465 @@ class NuqCodec { hn::Vec* HWY_RESTRICT tbl1) { // Cap to the table size (kClusters) for decoding SFP - sufficient, and may // be faster than a large vector. - const hn::CappedTag d_table; + const hn::CappedTag d_table; // We ResizeCast tables to DU: if DU is bigger, table lookups will only // access lanes < kClusters. If DU is smaller (128-bit), we have 2 tables. HWY_DASSERT(hn::Lanes(du) >= hn::Lanes(d_table) || NumTables(du) == 2); - HWY_ALIGN hwy::bfloat16_t table[kClusters]; - SfpCodec::Dec(d_table, reinterpret_cast(centers), - kClusters, table); + HWY_ALIGN BF16 table[kClusters]; + SfpCodec::DecompressAndZeroPad( + d_table, + MakeSpan(reinterpret_cast(centers), kClusters), 0, + table, kClusters); // If we assume >= 128-bit vectors, we can use [Two]TableLookupLanes // instead of TableLookupBytes, which requires extra interleaving of lo/hi. HWY_DASSERT(hn::Lanes(du) >= 8); - HWY_IF_CONSTEXPR(NumTables(du) == 2) { + if constexpr (NumTables(du) == 2) { // Reduce cap for second half to avoid loading past the end of the table. - const hn::CappedTag d_table2; + const hn::CappedTag d_table2; *tbl1 = hn::ResizeBitCast(du, hn::LoadU(d_table2, table + kClusters / 2)); } return hn::ResizeBitCast(du, hn::Load(d_table, table)); } - // Unpacks per-weight indices and sets c0/c1 to the corresponding centers. - template - static HWY_INLINE void TableLookups(DU du, hn::Vec tbl0, hn::Vec tbl1, - const uint8_t* packed, hn::Vec& c0, - hn::Vec& c1) { - using V16 = hn::Vec; - const size_t N16 = hn::Lanes(du); - - const V16 idx0 = NibbleCodec::OrderedUnpackU16(du, packed); - const V16 idx1 = NibbleCodec::OrderedUnpackU16(du, packed + N16 / 2); + // Unpacks a half-vector of nibbles into two vectors of u16 indices and sets + // c0/c1 to the corresponding bf16 (stored in u16) centers from tbl0/tbl1. + template , class D8H = D8HFromD16, + class V8H = hn::Vec> + static HWY_INLINE void TableLookups(DU du, VU tbl0, VU tbl1, const V8H packed, + VU& c0, VU& c1) { + const VU idx0 = NibbleCodec::OrderedUnpackU16<0>(du, packed); + const VU idx1 = NibbleCodec::OrderedUnpackU16<1>(du, packed); const auto indices0 = hn::IndicesFromVec(du, idx0); const auto indices1 = hn::IndicesFromVec(du, idx1); - HWY_IF_CONSTEXPR(NumTables(du) == 1) { + if constexpr (NumTables(du) == 1) { (void)tbl1; c0 = hn::TableLookupLanes(tbl0, indices0); c1 = hn::TableLookupLanes(tbl0, indices1); } - HWY_IF_CONSTEXPR(NumTables(du) == 2) { // `else` is poorly formatted. + if constexpr (NumTables(du) == 2) { // `else` is poorly formatted. c0 = hn::TwoTablesLookupLanes(du, tbl0, tbl1, indices0); c1 = hn::TwoTablesLookupLanes(du, tbl0, tbl1, indices1); } } + // As above, but returns a single 16-bit output vector for f32 Dec2, thus + // packed is only a quarter-vector. + template , + class D8Q = hn::Half>, class V8Q = hn::Vec> + static HWY_INLINE VU TableLookups(DU du, VU tbl0, VU tbl1, const V8Q packed) { + const D8HFromD16 d8h; + // OrderedUnpackU16 expects a half-vector, but will only use the lower half + // of it. 
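+ // ZeroExtendVector supplies that half-vector: the quarter-vector we were
+ // given becomes its lower half and the upper half is zeroed (and ignored by
+ // the <0> unpack).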
+ const hn::Vec packed_h = hn::ZeroExtendVector(d8h, packed); + const VU idx0 = NibbleCodec::OrderedUnpackU16<0>(du, packed_h); + + const auto indices0 = hn::IndicesFromVec(du, idx0); + + if constexpr (NumTables(du) == 1) { + (void)tbl1; + return hn::TableLookupLanes(tbl0, indices0); + } + if constexpr (NumTables(du) == 2) { // `else` is poorly formatted. + return hn::TwoTablesLookupLanes(du, tbl0, tbl1, indices0); + } + } + public: - // Encodes `num` floats starting from `in`. `out` points to compressed - // storage for `out_capacity` values and `out_ofs` indicates the destination - // offset within it, in units of float values, for parallel encoding by - // multiple threads. `num`, `out_capacity`, and `out_ofs` must all be - // multiples of `kGroupSize`. Returns the total number of unused clusters, - // which is expected to be zero. + // Encodes `num` floats from `raw`. `packed` points to compressed storage and + // `packed_ofs` indicates the destination offset within it, in units of float + // values, for parallel encoding by multiple threads. Returns the total + // number of unused clusters, which is typically zero. template - static HWY_INLINE size_t Enc(DF df, const float* const in, const size_t num, - ClusterBuf& buf, const size_t out_capacity, - NuqStream* const out, const size_t out_ofs) { + static HWY_INLINE size_t Enc(DF df, const float* HWY_RESTRICT raw, + const size_t num, NuqStream::ClusterBuf& buf, + const PackedSpan& packed, + size_t packed_ofs) { const hn::Repartition d16; + const hn::Repartition d8; using V16 = hn::Vec; - + using V8 = hn::Vec; const size_t N16 = hn::Lanes(d16); - HWY_ASSERT(kGroupSize >= 4 * N16); - HWY_ASSERT(out_ofs + num <= out_capacity); - buf.Resize(num); - HWY_ASSERT(num % kGroupSize == 0); - HWY_ASSERT(out_capacity % kGroupSize == 0); - HWY_ASSERT(out_ofs % kGroupSize == 0); - const size_t num_groups = num / kGroupSize; - const size_t ofs_groups = out_ofs / kGroupSize; + HWY_ASSERT(packed_ofs % kGroupSize == 0); + const size_t ofs_groups = packed_ofs / kGroupSize; + const size_t num_groups = hwy::DivCeil(num, kGroupSize); + buf.Resize(num_groups); size_t unused_clusters = 0; for (size_t g = 0; g < num_groups; ++g) { - const float* HWY_RESTRICT g_in = in + g * kGroupSize; + const size_t g_num = HWY_MIN(num - g * kGroupSize, kGroupSize); + const float* HWY_RESTRICT g_in = raw + g * kGroupSize; float* HWY_RESTRICT g_centers = buf.centers.get() + g * kClusters; uint16_t* HWY_RESTRICT g_idx = buf.idx.get() + g * kGroupSize; - unused_clusters += NuqClustering::ClusterExactL2(df, g_in, kGroupSize, - buf, g_centers, g_idx); + unused_clusters += + NuqClustering::ClusterExactL2(df, g_in, g_num, buf, g_centers, g_idx); } - uint8_t* centers = &out->byte + ofs_groups * kClusters; + uint8_t* centers = &packed.ptr->byte + ofs_groups * kClusters; SfpCodec::Enc(df, buf.centers.get(), num_groups * kClusters, reinterpret_cast(centers)); - uint8_t* packed_start = &out->byte + NuqStream::PackedStart(out_capacity) + + uint8_t* packed_start = &packed.ptr->byte + + NuqStream::PackedStart(packed.num) + ofs_groups * kGroupSize / 2; + // All but the last group have no remainders. 
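+ // kGroupSize is a multiple of 4 * N16 (asserted below), so whole groups are
+ // packed in full four-vector steps; only the final, possibly short, group
+ // takes the remainder path.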
+ HWY_DASSERT(kGroupSize % (4 * N16) == 0); HWY_UNROLL(1) - for (size_t g = 0; g < num_groups; ++g) { + for (size_t g = 0; g < num_groups - 1; ++g) { const uint16_t* HWY_RESTRICT g_idx = buf.idx.get() + g * kGroupSize; uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2; HWY_UNROLL(1) for (size_t i = 0; i < kGroupSize; i += 4 * N16) { - const V16 idx0 = hn::LoadU(d16, g_idx + i + N16 * 0); - const V16 idx1 = hn::LoadU(d16, g_idx + i + N16 * 1); - const V16 idx2 = hn::LoadU(d16, g_idx + i + N16 * 2); - const V16 idx3 = hn::LoadU(d16, g_idx + i + N16 * 3); - NibbleCodec::OrderedPackU16(d16, idx0, idx1, idx2, idx3, - g_packed + i / 2); + const V16 idx0 = hn::LoadU(d16, g_idx + i + 0 * N16); + const V16 idx1 = hn::LoadU(d16, g_idx + i + 1 * N16); + const V16 idx2 = hn::LoadU(d16, g_idx + i + 2 * N16); + const V16 idx3 = hn::LoadU(d16, g_idx + i + 3 * N16); + const V8 nibbles = + NibbleCodec::OrderedPackU16(d16, idx0, idx1, idx2, idx3); + hn::StoreU(nibbles, d8, g_packed + i / 2); + } + } + + // Last group may have remainders. + { + HWY_DASSERT(num_groups != 0); + const size_t g = num_groups - 1; + const size_t g_num = num - g * kGroupSize; + HWY_DASSERT(g_num <= kGroupSize); + const uint16_t* HWY_RESTRICT g_idx = buf.idx.get() + g * kGroupSize; + uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2; + + size_t i = 0; + if (g_num >= 4 * N16) { + HWY_UNROLL(1) + for (; i <= g_num - 4 * N16; i += 4 * N16) { + const V16 idx0 = hn::LoadU(d16, g_idx + i + 0 * N16); + const V16 idx1 = hn::LoadU(d16, g_idx + i + 1 * N16); + const V16 idx2 = hn::LoadU(d16, g_idx + i + 2 * N16); + const V16 idx3 = hn::LoadU(d16, g_idx + i + 3 * N16); + const V8 nibbles = + NibbleCodec::OrderedPackU16(d16, idx0, idx1, idx2, idx3); + hn::StoreU(nibbles, d8, g_packed + i / 2); + } + } + + const size_t remaining = g_num - i; + HWY_DASSERT(remaining < 4 * N16); + if (HWY_UNLIKELY(remaining != 0)) { + const V16 idx0 = hn::LoadU(d16, g_idx + i + 0 * N16); + const V16 idx1 = hn::LoadU(d16, g_idx + i + 1 * N16); + const V16 idx2 = hn::LoadU(d16, g_idx + i + 2 * N16); + const V16 idx3 = hn::LoadU(d16, g_idx + i + 3 * N16); + const V8 nibbles = + NibbleCodec::OrderedPackU16(d16, idx0, idx1, idx2, idx3); + // i is even, but remaining might not be. + hn::StoreN(nibbles, d8, g_packed + i / 2, hwy::DivCeil(remaining, 2)); } } return unused_clusters; } - // Decodes `num` values from the stream `in`, starting at the offset `in_ofs` - // (in units of values), to bf16 in `out`. `in_capacity`, `in_ofs` and `num` - // must all be multiples of `kGroupSize`. + // Decompresses to two bf16 vectors. `packed_ofs` must be a multiple of two + // vectors so that we only have to load one group's table. 
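+ // Per the stream layout, group g's kClusters SFP-encoded centers start at
+ // byte offset g * kClusters, and its packed nibbles follow
+ // PackedStart(packed.num) at byte offset g * kGroupSize / 2; the offsets
+ // computed below are exactly these plus the within-group position.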
template - static HWY_INLINE void Dec(DBF dbf, const size_t in_capacity, - const NuqStream* const in, const size_t in_ofs, - hwy::bfloat16_t* const out, const size_t num) { + static HWY_INLINE void Dec2(DBF dbf, + const PackedSpan& packed, + const size_t packed_ofs, hn::Vec& raw0, + hn::Vec& raw1) { const hn::RebindToUnsigned d16; + const D8HFromD16 d8h; using V16 = hn::Vec; + using V8H = hn::Vec; - const size_t N16 = hn::Lanes(d16); - HWY_DASSERT(kGroupSize >= 4 * N16); + const size_t within_group = packed_ofs % kGroupSize; + HWY_DASSERT(within_group % (2 * hn::Lanes(d16)) == 0); + const size_t ofs_in_groups = packed_ofs / kGroupSize; + const uint8_t* table = &packed.ptr->byte + ofs_in_groups * kClusters; + const uint8_t* indices = + &packed.ptr->byte + NuqStream::PackedStart(packed.num) + + hwy::DivCeil(ofs_in_groups * kGroupSize + within_group, 2); - HWY_DASSERT(in_ofs + num <= in_capacity); - HWY_DASSERT(in_capacity % kGroupSize == 0); - HWY_DASSERT(in_ofs % kGroupSize == 0); - HWY_DASSERT(num % kGroupSize == 0); - const size_t num_groups = num / kGroupSize; - const size_t ofs_groups = in_ofs / kGroupSize; - const uint8_t* tables = &in->byte + ofs_groups * kClusters; - const uint8_t* packed_start = &in->byte + - NuqStream::PackedStart(in_capacity) + - ofs_groups * kGroupSize / 2; + V16 tbl1 = Zero(d16); + const V16 tbl0 = LoadTable(d16, table, &tbl1); - HWY_UNROLL(1) - for (size_t g = 0; g < num_groups; ++g) { - const uint8_t* g_centers = tables + g * kClusters; - const uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2; - hwy::bfloat16_t* HWY_RESTRICT g_out = out + g * kGroupSize; + const V8H nibbles = hn::LoadU(d8h, indices); - V16 tbl1 = Zero(d16); - const V16 tbl0 = LoadTable(d16, g_centers, &tbl1); - - HWY_UNROLL(1) - for (size_t i = 0; i < kGroupSize; i += 2 * N16) { - V16 c0, c1; - TableLookups(d16, tbl0, tbl1, g_packed + i / 2, c0, c1); - hn::StoreU(BitCast(dbf, c0), dbf, g_out + i + N16 * 0); - hn::StoreU(BitCast(dbf, c1), dbf, g_out + i + N16 * 1); - } - } + V16 c0, c1; + TableLookups(d16, tbl0, tbl1, nibbles, c0, c1); + raw0 = BitCast(dbf, c0); + raw1 = BitCast(dbf, c1); } - // Decodes `num` values from the stream `in`, starting at the offset - // `in_ofs` (in units of values), to f32 in `out`. `in_capacity`, - // `in_ofs` and `num` must all be multiples of `kGroupSize`. + // Decompresses to two f32 vectors. `packed_ofs` must be a multiple of two + // vectors so that we only have to load one group's table. 
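+ // Here both f32 output vectors are promoted from a single vector of bf16
+ // centers, so only a quarter-vector of nibbles is loaded.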
template - static HWY_INLINE void Dec(DF df, const size_t in_capacity, - const NuqStream* const in, const size_t in_ofs, - float* const out, const size_t num) { - const hn::Repartition dbf; + static HWY_INLINE void Dec2(DF df, const PackedSpan& packed, + const size_t packed_ofs, hn::Vec& raw0, + hn::Vec& raw1) { + const hn::Repartition dbf; const hn::RebindToUnsigned d16; + const hn::Half> d8q; + using V8Q = hn::Vec; using V16 = hn::Vec; - using VF = hn::Vec; - - const size_t NF = hn::Lanes(df); - HWY_DASSERT(kGroupSize >= 4 * NF); - - HWY_DASSERT(in_ofs + num <= in_capacity); - HWY_DASSERT(in_capacity % kGroupSize == 0); - HWY_DASSERT(in_ofs % kGroupSize == 0); - HWY_DASSERT(num % kGroupSize == 0); - const size_t ofs_groups = in_ofs / kGroupSize; - const size_t num_groups = num / kGroupSize; - const uint8_t* tables = &in->byte + ofs_groups * kClusters; - const uint8_t* packed_start = &in->byte + - NuqStream::PackedStart(in_capacity) + - ofs_groups * kGroupSize / 2; - - HWY_UNROLL(1) - for (size_t g = 0; g < num_groups; ++g) { - const uint8_t* g_centers = tables + g * kClusters; - const uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2; - float* HWY_RESTRICT g_out = out + g * kGroupSize; - - V16 tbl1 = Zero(d16); - const V16 tbl0 = LoadTable(d16, g_centers, &tbl1); - - HWY_UNROLL(1) - for (size_t i = 0; i < kGroupSize; i += 4 * NF) { - V16 c0, c1; - TableLookups(d16, tbl0, tbl1, g_packed + i / 2, c0, c1); - const VF f0 = hn::PromoteLowerTo(df, BitCast(dbf, c0)); - const VF f1 = hn::PromoteUpperTo(df, BitCast(dbf, c0)); - const VF f2 = hn::PromoteLowerTo(df, BitCast(dbf, c1)); - const VF f3 = hn::PromoteUpperTo(df, BitCast(dbf, c1)); - hn::StoreU(f0, df, g_out + i + NF * 0); - hn::StoreU(f1, df, g_out + i + NF * 1); - hn::StoreU(f2, df, g_out + i + NF * 2); - hn::StoreU(f3, df, g_out + i + NF * 3); - } - } - } - - // Accumulates into `sum0..3` dot products of decoded values with `num` bf16 - // from `vec_aligned`. DF is f32 because sum0..3 are also f32. `in_capacity`, - // `in_ofs` and `num` must all be multiples of `kGroupSize`. 
- template - static HWY_INLINE void Dot(DF df, const size_t in_capacity, - const NuqStream* const in, const size_t in_ofs, - const hwy::bfloat16_t* const vec_aligned, - const size_t num, hn::Vec& sum0, - hn::Vec& sum1, hn::Vec& sum2, - hn::Vec& sum3) { - const hn::Repartition dbf; - const hn::RebindToUnsigned d16; - using VBF = hn::Vec; - using V16 = hn::Vec; - const size_t N16 = hn::Lanes(d16); - HWY_DASSERT(kGroupSize >= 4 * N16); - - HWY_DASSERT(in_ofs + num <= in_capacity); - HWY_DASSERT(in_capacity % kGroupSize == 0); - HWY_DASSERT(in_ofs % kGroupSize == 0); - HWY_DASSERT(num % kGroupSize == 0); - const size_t ofs_groups = in_ofs / kGroupSize; - const size_t num_groups = num / kGroupSize; - const uint8_t* tables = &in->byte + ofs_groups * kClusters; - const uint8_t* packed_start = &in->byte + - NuqStream::PackedStart(in_capacity) + - ofs_groups * kGroupSize / 2; - - HWY_UNROLL(1) - for (size_t g = 0; g < num_groups; ++g) { - const uint8_t* g_centers = tables + g * kClusters; - const uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2; - const hwy::bfloat16_t* HWY_RESTRICT g_in = vec_aligned + g * kGroupSize; - - V16 tbl1 = Zero(d16); - const V16 tbl0 = LoadTable(d16, g_centers, &tbl1); - - HWY_UNROLL(1) - for (size_t i = 0; i < kGroupSize; i += 2 * N16) { - V16 c0, c1; - TableLookups(d16, tbl0, tbl1, g_packed + i / 2, c0, c1); - const VBF in0 = hn::Load(dbf, g_in + i + N16 * 0); - const VBF in1 = hn::Load(dbf, g_in + i + N16 * 1); - sum0 = hn::ReorderWidenMulAccumulate(df, in0, BitCast(dbf, c0), sum0, - sum1); - sum2 = hn::ReorderWidenMulAccumulate(df, in1, BitCast(dbf, c1), sum2, - sum3); - } - } - } - - // Accumulates into `sum0..3` dot products of decoded values with `num` f32 - // from `vec_aligned`. `in_capacity`, `in_ofs` and `num` must all be - // multiples of `kGroupSize`. - template - static HWY_INLINE void Dot(DF df, const size_t in_capacity, - const NuqStream* const in, const size_t in_ofs, - const float* const vec_aligned, const size_t num, - hn::Vec& sum0, hn::Vec& sum1, - hn::Vec& sum2, hn::Vec& sum3) { - const hn::Repartition dbf; - const hn::RebindToUnsigned d16; using VF = hn::Vec; - using V16 = hn::Vec; - const size_t NF = hn::Lanes(df); - HWY_DASSERT(kGroupSize >= 4 * NF); - HWY_DASSERT(in_ofs + num <= in_capacity); - HWY_DASSERT(in_capacity % kGroupSize == 0); - HWY_DASSERT(in_ofs % kGroupSize == 0); - HWY_DASSERT(num % kGroupSize == 0); - const size_t ofs_groups = in_ofs / kGroupSize; - const size_t num_groups = num / kGroupSize; - const uint8_t* tables = &in->byte + ofs_groups * kClusters; - const uint8_t* packed_start = &in->byte + - NuqStream::PackedStart(in_capacity) + - ofs_groups * kGroupSize / 2; + const size_t within_group = packed_ofs % kGroupSize; + HWY_DASSERT(within_group % (2 * hn::Lanes(df)) == 0); + const size_t ofs_groups = packed_ofs / kGroupSize; + const uint8_t* table = &packed.ptr->byte + ofs_groups * kClusters; + const uint8_t* indices = + &packed.ptr->byte + NuqStream::PackedStart(packed.num) + + hwy::DivCeil(ofs_groups * kGroupSize + within_group, 2); + + V16 tbl1 = Zero(d16); + const V16 tbl0 = LoadTable(d16, table, &tbl1); + + // The single-vector TableLookups overload only calls OrderedUnpackU16<0>, + // which expects a quarter vector of bytes. 
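+ // (Internally, TableLookups zero-extends that quarter-vector to the
+ // half-vector argument that OrderedUnpackU16 takes; see the overload above.)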
+ const V8Q nibbles = hn::LoadU(d8q, indices); + + const V16 c0 = TableLookups(d16, tbl0, tbl1, nibbles); + raw0 = hn::PromoteLowerTo(df, BitCast(dbf, c0)); + raw1 = hn::PromoteUpperTo(df, BitCast(dbf, c0)); + } + + // Decompresses from `packed`, starting at (any) `packed_ofs`, to (any) `num` + // elements in `raw`, then appends `[0, hn::Lanes(d))` zeroes as required to + // round `num` up to one vector, if it is not already. + template > + static HWY_INLINE void DecompressAndZeroPad( + D d, const PackedSpan& packed, size_t packed_ofs, + Raw* HWY_RESTRICT raw, size_t num) { + // If unaligned, load elements from the first group and update the args, + // from which we compute new tables/indices below. + if (size_t within_group = packed_ofs % kGroupSize; within_group != 0) { + const size_t ofs_in_groups = packed_ofs / kGroupSize; + const uint8_t* tables = &packed.ptr->byte + ofs_in_groups * kClusters; + const uint8_t* indices = + &packed.ptr->byte + NuqStream::PackedStart(packed.num) + + hwy::DivCeil(ofs_in_groups * kGroupSize + within_group, 2); + const size_t remaining = HWY_MIN(num, kGroupSize - within_group); + DecPartialGroup(d, tables, indices, raw, remaining); + packed_ofs += remaining; + raw += remaining; + num -= remaining; + if (num == 0) return; + } + + HWY_DASSERT(packed_ofs % kGroupSize == 0); + const size_t ofs_in_groups = packed_ofs / kGroupSize; + const uint8_t* tables = &packed.ptr->byte + ofs_in_groups * kClusters; + const uint8_t* indices = &packed.ptr->byte + + NuqStream::PackedStart(packed.num) + + hwy::DivCeil(ofs_in_groups * kGroupSize, 2); + + const size_t num_groups = hwy::DivCeil(num, kGroupSize); + HWY_UNROLL(1) + for (size_t g = 0; g < num_groups - 1; ++g) { + DecWholeGroup(d, tables + g * kClusters, indices + g * kGroupSize / 2, + raw + g * kGroupSize); + } + + const size_t g = num_groups - 1; + DecPartialGroup(d, tables + g * kClusters, indices + g * kGroupSize / 2, + raw + g * kGroupSize, num - g * kGroupSize); + } + + private: + template + static HWY_INLINE void DecWholeGroup(DBF dbf, + const uint8_t* HWY_RESTRICT table, + const uint8_t* HWY_RESTRICT indices, + BF16* HWY_RESTRICT raw_bf) { + const hn::RebindToUnsigned d16; + const D8HFromD16 d8h; + using V16 = hn::Vec; + using V8H = hn::Vec; + const size_t N16 = hn::Lanes(d16); + + V16 tbl1 = Zero(d16); + const V16 tbl0 = LoadTable(d16, table, &tbl1); HWY_UNROLL(1) - for (size_t g = 0; g < num_groups; ++g) { - const uint8_t* g_centers = tables + g * kClusters; - const uint8_t* HWY_RESTRICT g_packed = packed_start + g * kGroupSize / 2; - const float* HWY_RESTRICT g_in = vec_aligned + g * kGroupSize; + for (size_t i = 0; i < kGroupSize; i += 2 * N16) { + const V8H nibbles = hn::LoadU(d8h, indices + i / 2); + V16 c0, c1; + TableLookups(d16, tbl0, tbl1, nibbles, c0, c1); + hn::StoreU(BitCast(dbf, c0), dbf, raw_bf + i + 0 * N16); + hn::StoreU(BitCast(dbf, c1), dbf, raw_bf + i + 1 * N16); + } + } - V16 tbl1 = Zero(d16); - const V16 tbl0 = LoadTable(d16, g_centers, &tbl1); + // Called for first and last group. 
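+ // `num` may be anything up to kGroupSize; the tail loads nibbles with LoadN,
+ // zeroes the lanes at and beyond `num`, and stores whole vectors, which
+ // never extend more than one vector past `num` (the caller's padding
+ // guarantee).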
+ template + static HWY_INLINE void DecPartialGroup(DBF dbf, + const uint8_t* HWY_RESTRICT table, + const uint8_t* HWY_RESTRICT indices, + BF16* HWY_RESTRICT raw_bf, + size_t num) { + HWY_DASSERT(num <= kGroupSize); + const hn::RebindToUnsigned d16; + const D8HFromD16 d8h; + using V16 = hn::Vec; + using V8H = hn::Vec; + const size_t N16 = hn::Lanes(d16); + + V16 tbl1 = Zero(d16); + const V16 tbl0 = LoadTable(d16, table, &tbl1); + + size_t i = 0; + + if (num >= 2 * N16) { HWY_UNROLL(1) - for (size_t i = 0; i < kGroupSize; i += 4 * NF) { + for (; i <= num - 2 * N16; i += 2 * N16) { + const V8H nibbles = hn::LoadU(d8h, indices + i / 2); V16 c0, c1; - TableLookups(d16, tbl0, tbl1, g_packed + i / 2, c0, c1); - const VF in0 = hn::LoadU(df, g_in + i + NF * 0); - const VF in1 = hn::LoadU(df, g_in + i + NF * 1); - const VF in2 = hn::LoadU(df, g_in + i + NF * 2); - const VF in3 = hn::LoadU(df, g_in + i + NF * 3); + TableLookups(d16, tbl0, tbl1, nibbles, c0, c1); + hn::StoreU(BitCast(dbf, c0), dbf, raw_bf + i + 0 * N16); + hn::StoreU(BitCast(dbf, c1), dbf, raw_bf + i + 1 * N16); + } + } + + const size_t remaining = num - i; + HWY_DASSERT(remaining < 2 * N16); + if (HWY_UNLIKELY(remaining != 0)) { + // i is even, but remaining might not be. + const V8H nibbles = + hn::LoadN(d8h, indices + i / 2, hwy::DivCeil(remaining, 2)); + + V16 c0, c1; + TableLookups(d16, tbl0, tbl1, nibbles, c0, c1); + // Out of bounds `nibbles` are 0, but this does not yet guarantee + // c0/c1 are, because centers[0] might not be 0. + c0 = hn::IfThenElseZero(hn::FirstN(d16, remaining), c0); + hn::StoreU(BitCast(dbf, c0), dbf, raw_bf + i); + // Callers only pad to one vector, so check before storing the second. + if (remaining > N16) { + c1 = hn::IfThenElseZero(hn::FirstN(d16, remaining - N16), c1); + hn::StoreU(BitCast(dbf, c1), dbf, raw_bf + i + N16); + } + } + } + + template + static HWY_INLINE void DecWholeGroup(DF df, const uint8_t* HWY_RESTRICT table, + const uint8_t* HWY_RESTRICT indices, + float* HWY_RESTRICT raw_f) { + const hn::Repartition dbf; + const hn::RebindToUnsigned d16; + const D8HFromD16 d8h; + using V16 = hn::Vec; + using V8H = hn::Vec; + using VF = hn::Vec; + const size_t NF = hn::Lanes(df); + + V16 tbl1 = Zero(d16); + const V16 tbl0 = LoadTable(d16, table, &tbl1); + + HWY_UNROLL(1) + for (size_t i = 0; i < kGroupSize; i += 4 * NF) { + const V8H nibbles = hn::LoadU(d8h, indices + i / 2); + V16 c0, c1; + TableLookups(d16, tbl0, tbl1, nibbles, c0, c1); + const VF f0 = hn::PromoteLowerTo(df, BitCast(dbf, c0)); + const VF f1 = hn::PromoteUpperTo(df, BitCast(dbf, c0)); + const VF f2 = hn::PromoteLowerTo(df, BitCast(dbf, c1)); + const VF f3 = hn::PromoteUpperTo(df, BitCast(dbf, c1)); + hn::StoreU(f0, df, raw_f + i + 0 * NF); + hn::StoreU(f1, df, raw_f + i + 1 * NF); + hn::StoreU(f2, df, raw_f + i + 2 * NF); + hn::StoreU(f3, df, raw_f + i + 3 * NF); + } + } + + // Called for first and last group. 
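+ // As above, but each step decodes four f32 vectors, so the tail below must
+ // decide per vector whether it still fits, zero-padding only the partially
+ // valid one.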
+ template + static HWY_INLINE void DecPartialGroup(DF df, + const uint8_t* HWY_RESTRICT table, + const uint8_t* HWY_RESTRICT indices, + float* HWY_RESTRICT raw_f, + const size_t num) { + HWY_DASSERT(num <= kGroupSize); + + const hn::Repartition dbf; + const hn::RebindToUnsigned d16; + const D8HFromD16 d8h; + using V16 = hn::Vec; + using V8H = hn::Vec; + using VF = hn::Vec; + const size_t NF = hn::Lanes(df); + + V16 tbl1 = Zero(d16); + const V16 tbl0 = LoadTable(d16, table, &tbl1); + + size_t i = 0; + + if (num >= 4 * NF) { + HWY_UNROLL(1) + for (; i <= num - 4 * NF; i += 4 * NF) { + const V8H nibbles = hn::LoadU(d8h, indices + i / 2); + V16 c0, c1; + TableLookups(d16, tbl0, tbl1, nibbles, c0, c1); const VF f0 = hn::PromoteLowerTo(df, BitCast(dbf, c0)); const VF f1 = hn::PromoteUpperTo(df, BitCast(dbf, c0)); const VF f2 = hn::PromoteLowerTo(df, BitCast(dbf, c1)); const VF f3 = hn::PromoteUpperTo(df, BitCast(dbf, c1)); - sum0 = hn::MulAdd(in0, f0, sum0); - sum1 = hn::MulAdd(in1, f1, sum1); - sum2 = hn::MulAdd(in2, f2, sum2); - sum3 = hn::MulAdd(in3, f3, sum3); + hn::StoreU(f0, df, raw_f + i + 0 * NF); + hn::StoreU(f1, df, raw_f + i + 1 * NF); + hn::StoreU(f2, df, raw_f + i + 2 * NF); + hn::StoreU(f3, df, raw_f + i + 3 * NF); + } + } + + const size_t remaining = num - i; + HWY_DASSERT(remaining < 4 * NF); + if (HWY_UNLIKELY(remaining != 0)) { + // i is even, but remaining might not be. + const V8H nibbles = + hn::LoadN(d8h, indices + i / 2, hwy::DivCeil(remaining, 2)); + + V16 c0, c1; + TableLookups(d16, tbl0, tbl1, nibbles, c0, c1); + const VF f0 = hn::PromoteLowerTo(df, BitCast(dbf, c0)); + const VF f1 = hn::PromoteUpperTo(df, BitCast(dbf, c0)); + const VF f2 = hn::PromoteLowerTo(df, BitCast(dbf, c1)); + const VF f3 = hn::PromoteUpperTo(df, BitCast(dbf, c1)); + // `raw_f` is only guaranteed to padded to NF, hence we cannot store all + // four vectors. We could conditionally store vectors either to `raw_f` + // or a buffer. However, we still have to mask because only `nibbles` + // are guaranteed to be 0, not c0/c1. Copying also involves branches, + // so we fully unroll the copy loop to avoid a buffer. We could also + // change the contract to pad to four vectors, but it would anyway be + // better to decompress to bf16. + if (remaining <= 1 * NF) { + const hn::Mask mask = hn::FirstN(df, remaining); + hn::StoreU(hn::IfThenElseZero(mask, f0), df, raw_f + i + 0 * NF); + return; + } + hn::StoreU(f0, df, raw_f + i + 0 * NF); + if (remaining <= 2 * NF) { + const hn::Mask mask = hn::FirstN(df, remaining - NF); + hn::StoreU(hn::IfThenElseZero(mask, f1), df, raw_f + i + 1 * NF); + return; + } + hn::StoreU(f1, df, raw_f + i + 1 * NF); + if (remaining <= 3 * NF) { + const hn::Mask mask = hn::FirstN(df, remaining - 2 * NF); + hn::StoreU(hn::IfThenElseZero(mask, f2), df, raw_f + i + 2 * NF); + return; + } + hn::StoreU(f2, df, raw_f + i + 2 * NF); + { + const hn::Mask mask = hn::FirstN(df, remaining - 3 * NF); + hn::StoreU(hn::IfThenElseZero(mask, f3), df, raw_f + i + 3 * NF); } } } diff --git a/compression/nuq.h b/compression/nuq.h deleted file mode 100644 index d7ae814..0000000 --- a/compression/nuq.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2023 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_H_ -#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_H_ - -// Non-uniform quantization: a compressed representation of f32 inputs that -// supports seeking at a granularity of kGroupSize, decoding to bf16/f32, and a -// fused decode/dot product with bf16/f32 vectors. - -#include -#include - -#include "hwy/aligned_allocator.h" -#include "hwy/base.h" // HWY_INLINE - -namespace gcpp { - -// 4-bit indices are a sweet spot in terms of quality per size. -static constexpr size_t kClusters = 16; - -// Number of weights that share a table. Larger = slower encode, higher error, -// smaller size (table amortized over more weights). This is the minimum -// granularity for seeking/decoding in the stream, and must be at least four -// times the number of bf16 elements per vector. -static constexpr size_t kGroupSize = 256; - -// Points to the *start* of a NUQ stream. Aligning the allocation (see -// aligned_allocator.h) may be speed up decoding but is not required. -// -// Layout: first one table of kClusters entries per group, in ascending order -// of group index, then two packed indices per byte. -// -// Indices are stored in-order to enable vector-length agnostic decode, because -// streams may be persisted to disk and used by other CPUs. -// -// To enable parallel encoding and decoding, Enc/Dec have `offset` parameters -// which refer to the stream, NOT the raw from/to pointers, which point directly -// to the source/destination. Offsets are in units of values, NOT compressed -// bytes within the stream. -#pragma pack(push, 1) -struct NuqStream { - // Returns offset of packed indices from the start of the stream. This matches - // the (padded) total table size because table entries are bytes. - static constexpr size_t PackedStart(size_t capacity) { - // Round up to avoid cache-line splits when loading indices. No effect on - // size as long as capacity / kGroupSize is a multiple of 4. - return hwy::RoundUpTo(hwy::DivCeil(capacity, kGroupSize) * kClusters, 64); - } - - // Returns number of NuqStream to allocate for the stream, which matches its - // size in bytes. `capacity` is already a multiple of `kGroupSize`. - static constexpr size_t PackedEnd(size_t capacity) { - return PackedStart(capacity) + hwy::DivCeil(capacity, 2); // 2x 4-bit/byte - } - - uint8_t byte; -}; -#pragma pack(pop) - -// Storage for dynamic programming. There are two matrices; we use separate -// allocations to avoid type punning. -template -class AlignedMatrix { - public: - AlignedMatrix() : mem_(hwy::AllocateAligned(kClusters * kGroupSize)) {} - - HWY_INLINE const T& operator()(size_t row, size_t col) const { - return mem_[row * kGroupSize + col]; - } - - HWY_INLINE T& operator()(size_t row, size_t col) { - return mem_[row * kGroupSize + col]; - } - - private: - hwy::AlignedFreeUniquePtr mem_; -}; - -// Reuse memory across calls to Enc to avoid per-call allocations. 
-struct ClusterBuf { - void Resize(size_t new_num) { - if (new_num < num) return; - - num = new_num; - const size_t num_groups = hwy::DivCeil(num, kGroupSize); - centers = hwy::AllocateAligned(num_groups * kClusters); - idx = hwy::AllocateAligned(hwy::RoundUpTo(num, kGroupSize)); - } - - AlignedMatrix d; - AlignedMatrix t; - - size_t num = 0; - hwy::AlignedFreeUniquePtr centers; - hwy::AlignedFreeUniquePtr idx; -}; - -} // namespace gcpp -#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_NUQ_H_ diff --git a/compression/nuq_test.cc b/compression/nuq_test.cc index 8c6175c..8cbce6c 100644 --- a/compression/nuq_test.cc +++ b/compression/nuq_test.cc @@ -18,8 +18,6 @@ #define HWY_DISABLED_TARGETS (HWY_SCALAR | HWY_SVE) #endif -#include "compression/nuq.h" - #include #include #include @@ -28,9 +26,11 @@ #include #include "compression/distortion.h" +#include "compression/shared.h" #include "util/test_util.h" #include "hwy/aligned_allocator.h" #include "hwy/base.h" +#include "hwy/tests/hwy_gtest.h" #include "hwy/tests/test_util.h" #include "hwy/timer.h" @@ -39,10 +39,9 @@ #define HWY_TARGET_INCLUDE "compression/nuq_test.cc" // NOLINT // clang-format on #include "hwy/foreach_target.h" // IWYU pragma: keep -// Other headers that include Highway must come after foreach_target.h -#include "compression/nuq-inl.h" #include "hwy/highway.h" -#include "hwy/tests/hwy_gtest.h" +// After highway.h +#include "compression/nuq-inl.h" #include "hwy/tests/test_util-inl.h" HWY_BEFORE_NAMESPACE(); @@ -50,6 +49,8 @@ namespace gcpp { namespace HWY_NAMESPACE { static constexpr size_t kTimingReps = hn::AdjustedReps(3); +static constexpr size_t kClusters = NuqStream::kClusters; +static constexpr size_t kGroupSize = NuqStream::kGroupSize; // All-equal inputs: only one cluster struct TestFlat { @@ -65,7 +66,7 @@ struct TestFlat { for (size_t i = 0; i < kGroupSize; ++i) { in[i] = 0.5f; } - ClusterBuf buf; + NuqStream::ClusterBuf buf; float centers[kClusters]; uint16_t indices[kGroupSize]; const size_t unused_clusters = NuqClustering::ClusterExactL2( @@ -107,7 +108,7 @@ struct TestPlateaus { std::mt19937 rng(rd()); std::shuffle(in.get(), in.get() + kGroupSize, rng); - ClusterBuf buf; + NuqStream::ClusterBuf buf; float centers[kClusters]; uint16_t indices[kGroupSize]; const size_t unused_clusters = NuqClustering::ClusterExactL2( @@ -154,7 +155,7 @@ struct TestRamp { std::mt19937 rng(rd()); std::shuffle(in.get(), in.get() + kGroupSize, rng); - ClusterBuf buf; + NuqStream::ClusterBuf buf; float centers[kClusters]; uint16_t indices[kGroupSize]; const size_t unused_clusters = NuqClustering::ClusterExactL2( @@ -199,7 +200,7 @@ struct TestNormal { } VerifyGaussian(in_stats); - ClusterBuf buf; + NuqStream::ClusterBuf buf; float centers[kClusters]; uint16_t indices[kGroupSize]; double elapsed = hwy::HighestValue(); @@ -239,7 +240,7 @@ struct TestOffset { template HWY_INLINE void operator()(T /*unused*/, D d) { const hn::Repartition df; - const size_t total = 10 * kGroupSize; + const size_t total = 10 * kGroupSize; // already padded const size_t kMidLen = 2 * kGroupSize; // length of middle piece auto in = hwy::AllocateAligned(total); // Enc() requires f32 @@ -247,6 +248,7 @@ struct TestOffset { auto dec2 = hwy::AllocateAligned(kMidLen); auto nuq = hwy::AllocateAligned(NuqStream::PackedEnd(total)); HWY_ASSERT(in && dec1 && dec2 && nuq); + const auto nuq_span = MakeSpan(nuq.get(), total); hwy::RandomState rng; for (size_t i = 0; i < total; ++i) { @@ -254,53 +256,72 @@ struct TestOffset { } // Encode + decode everything - ClusterBuf buf; - 
(void)NuqCodec::Enc(df, in.get(), total, buf, total, nuq.get(), 0); - NuqCodec::Dec(d, total, nuq.get(), 0, dec1.get(), total); + NuqStream::ClusterBuf buf; + (void)NuqCodec::Enc(df, in.get(), total, buf, nuq_span, 0); + NuqCodec::DecompressAndZeroPad(d, MakeConst(nuq_span), 0, dec1.get(), + total); // Overwrite middle with first inputs const size_t offset = 5 * kGroupSize; - (void)NuqCodec::Enc(df, in.get(), kMidLen, buf, total, nuq.get(), offset); + (void)NuqCodec::Enc(df, in.get(), kMidLen, buf, nuq_span, offset); // Decoded middle now matches previously decoded first - NuqCodec::Dec(d, total, nuq.get(), offset, dec2.get(), kMidLen); + NuqCodec::DecompressAndZeroPad(d, MakeConst(nuq_span), offset, dec2.get(), + kMidLen); for (size_t i = 0; i < kMidLen; ++i) { HWY_ASSERT(dec1[i] == dec2[i]); } } }; -void TestAllOffsetF32() { - const hn::ForGEVectors<128, TestOffset> test; - test(float()); -} - -void TestAllOffsetBF16() { - const hn::ForGEVectors<128, TestOffset> test; - test(hwy::bfloat16_t()); -} +void TestOffsetBF16() { hn::ForGEVectors<128, TestOffset>()(BF16()); } +void TestOffsetF32() { hn::ForGEVectors<128, TestOffset>()(float()); } struct TestNibble { template HWY_INLINE void operator()(T /*unused*/, D d) { + const hn::Repartition d8; + const hn::Half d8h; using V = hn::Vec; - const size_t N = hn::Lanes(d); - const size_t num = 4 * N; - auto bytes = hwy::AllocateAligned(num / 2); - HWY_ASSERT(bytes); - const V v0 = hn::And(hn::Iota(d, 0), hn::Set(d, 15)); - const V v1 = hn::Set(d, 1); - const V v2 = hn::OddEven(v1, hn::Zero(d)); - const V v3 = hn::Reverse(d, v0); - NibbleCodec::OrderedPackU16(d, v0, v1, v2, v3, bytes.get()); - const V out0 = NibbleCodec::OrderedUnpackU16(d, bytes.get() + 0 * N / 2); - const V out1 = NibbleCodec::OrderedUnpackU16(d, bytes.get() + 1 * N / 2); - const V out2 = NibbleCodec::OrderedUnpackU16(d, bytes.get() + 2 * N / 2); - const V out3 = NibbleCodec::OrderedUnpackU16(d, bytes.get() + 3 * N / 2); - HWY_ASSERT_VEC_EQ(d, v0, out0); - HWY_ASSERT_VEC_EQ(d, v1, out1); - HWY_ASSERT_VEC_EQ(d, v2, out2); - HWY_ASSERT_VEC_EQ(d, v3, out3); + using V8 = hn::Vec; + using V8H = hn::Vec; + const V mask = hn::Set(d, 15); + + { + const V v0 = hn::And(hn::Iota(d, 0), mask); + const V v1 = hn::Set(d, 1); + const V v2 = hn::OddEven(v1, hn::Zero(d)); + const V v3 = hn::Reverse(d, v0); + const V8 nibbles = NibbleCodec::OrderedPackU16(d, v0, v1, v2, v3); + const V8H nibbles0 = hn::LowerHalf(d8h, nibbles); + const V8H nibbles1 = hn::UpperHalf(d8h, nibbles); + const V out0 = NibbleCodec::OrderedUnpackU16<0>(d, nibbles0); + const V out1 = NibbleCodec::OrderedUnpackU16<1>(d, nibbles0); + const V out2 = NibbleCodec::OrderedUnpackU16<0>(d, nibbles1); + const V out3 = NibbleCodec::OrderedUnpackU16<1>(d, nibbles1); + HWY_ASSERT_VEC_EQ(d, v0, out0); + HWY_ASSERT_VEC_EQ(d, v1, out1); + HWY_ASSERT_VEC_EQ(d, v2, out2); + HWY_ASSERT_VEC_EQ(d, v3, out3); + } + // Same, but with different values in each lane. 
+ { + const V v0 = hn::And(hn::Iota(d, 0), mask); + const V v1 = hn::And(hn::Iota(d, 1), mask); + const V v2 = hn::And(hn::Iota(d, 2), mask); + const V v3 = hn::And(hn::Iota(d, 3), mask); + const V8 nibbles = NibbleCodec::OrderedPackU16(d, v0, v1, v2, v3); + const V8H nibbles0 = hn::LowerHalf(d8h, nibbles); + const V8H nibbles1 = hn::UpperHalf(d8h, nibbles); + const V out0 = NibbleCodec::OrderedUnpackU16<0>(d, nibbles0); + const V out1 = NibbleCodec::OrderedUnpackU16<1>(d, nibbles0); + const V out2 = NibbleCodec::OrderedUnpackU16<0>(d, nibbles1); + const V out3 = NibbleCodec::OrderedUnpackU16<1>(d, nibbles1); + HWY_ASSERT_VEC_EQ(d, v0, out0); + HWY_ASSERT_VEC_EQ(d, v1, out1); + HWY_ASSERT_VEC_EQ(d, v2, out2); + HWY_ASSERT_VEC_EQ(d, v3, out3); + } } }; @@ -309,15 +330,19 @@ void TestAllNibble() { test(uint16_t()); } -struct TestStream { +// Checks the distortion from an encode and decode round trip. Unlike +// `TestShortLengthsT` in compress_test, this covers large `num` and +// prints the enc/dec throughput. +struct TestEncDec { template HWY_INLINE void operator()(T /*unused*/, D d) { const hn::Repartition df; const size_t num = 4 * kGroupSize; auto in = hwy::AllocateAligned(num); // Enc() requires f32 - auto out = hwy::AllocateAligned(num); + auto out = hwy::AllocateAligned(num); // already padded auto nuq = hwy::AllocateAligned(NuqStream::PackedEnd(num)); HWY_ASSERT(in && out && nuq); + const auto nuq_span = MakeSpan(nuq.get(), num); hwy::RandomState rng; hwy::Stats in_stats; @@ -327,12 +352,12 @@ struct TestStream { } VerifyGaussian(in_stats); - ClusterBuf buf; + NuqStream::ClusterBuf buf; double elapsed = hwy::HighestValue(); for (size_t rep = 0; rep < kTimingReps; ++rep) { const double t0 = hwy::platform::Now(); const size_t unused_clusters = - NuqCodec::Enc(df, in.get(), num, buf, num, nuq.get(), 0); + NuqCodec::Enc(df, in.get(), num, buf, nuq_span, 0); HWY_ASSERT(unused_clusters == 0); const double t1 = hwy::platform::Now(); elapsed = HWY_MIN(elapsed, t1 - t0); @@ -343,7 +368,7 @@ struct TestStream { elapsed = hwy::HighestValue(); for (size_t rep = 0; rep < kTimingReps; ++rep) { const double t0 = hwy::platform::Now(); - NuqCodec::Dec(d, num, nuq.get(), 0, out.get(), num); + NuqCodec::DecompressAndZeroPad(d, MakeConst(nuq_span), 0, out.get(), num); const double t1 = hwy::platform::Now(); elapsed = HWY_MIN(elapsed, t1 - t0); } @@ -367,129 +392,8 @@ struct TestStream { } }; -void TestAllStreamF32() { - const hn::ForGEVectors<128, TestStream> test; - test(float()); -} - -void TestAllStreamBF16() { - const hn::ForGEVectors<128, TestStream> test; - test(hwy::bfloat16_t()); -} - -struct TestDot { - template - HWY_INLINE void operator()(T /*unused*/, D d) { - const hn::Repartition df; - const size_t num = 4 * kGroupSize; - auto in = hwy::AllocateAligned(num); - auto dec = hwy::AllocateAligned(num); - auto vec = hwy::AllocateAligned(num); - auto nuq = hwy::AllocateAligned(NuqStream::PackedEnd(num)); - HWY_ASSERT(in && dec && vec && nuq); - - // Generate inputs and verify their distribution. 
- hwy::RandomState rng; - hwy::Stats in_stats; - for (size_t i = 0; i < num; ++i) { - in[i] = static_cast(RandomGaussian(rng)); - in_stats.Notify(in[i]); - } - for (size_t i = 0; i < num; ++i) { - const float r = static_cast(RandomGaussian(rng)); - in_stats.Notify(r); - vec[i] = hwy::ConvertScalarTo(r); - } - VerifyGaussian(in_stats); - - ClusterBuf buf; - const size_t unused_clusters = - NuqCodec::Enc(df, in.get(), num, buf, num, nuq.get(), 0); - HWY_ASSERT(unused_clusters == 0); - - // Compute dot product without decompression. - float actual = 0.0f; - double elapsed = hwy::HighestValue(); - for (size_t rep = 0; rep < kTimingReps; ++rep) { - hn::Vec sum0 = hn::Zero(df); - hn::Vec sum1 = hn::Zero(df); - hn::Vec sum2 = hn::Zero(df); - hn::Vec sum3 = hn::Zero(df); - const double t0 = hwy::platform::Now(); - NuqCodec::Dot(df, num, nuq.get(), 0, vec.get(), num, sum0, sum1, sum2, - sum3); - const double t1 = hwy::platform::Now(); - elapsed = HWY_MIN(elapsed, t1 - t0); - sum0 = hn::Add(hn::Add(sum0, sum1), hn::Add(sum2, sum3)); - actual = hn::ReduceSum(df, sum0); - } - - NuqCodec::Dec(df, num, nuq.get(), 0, dec.get(), num); - fprintf(stderr, "Vec %zu Dec %.2f MB/s\n", Lanes(d) * sizeof(T), - num * sizeof(in[0]) * 1E-6 / elapsed); - - // Exact and decompressed dot products for comparison. - float exact = 0.0f; // using original input - float expected = 0.0f; // using decoded NUQ - DistortionStats dec_stats; - hwy::Stats ratios; - for (size_t i = 0; i < num; ++i) { - dec_stats.Notify(in[i], dec[i]); - const float v1 = hwy::ConvertScalarTo(vec[i]); - exact += in[i] * v1; - expected += dec[i] * v1; - if (expected != 0.0f) { - ratios.Notify(exact / expected); - } - } - const bool isBF = sizeof(T) == 2; - const double dec_snr = dec_stats.GeomeanValueDivL1(); - const double dec_wl1 = dec_stats.WeightedAverageL1(); - const double dot_snr = 1.0 / hwy::ScalarAbs(1.0 - ratios.GeometricMean()); - // exact and actual fluctuate due to the combination of NUQ imprecision, - // and whether vec[i] is negative or positive, so this is quite loose. - const float final_ratio = HWY_MIN(exact / actual, actual / exact); - if (HWY_ONCE) { - fprintf(stderr, "ratios %s\n", ratios.ToString().c_str()); - fprintf(stderr, - "exact %.3f e2 %.4f actual %.4f final_ratio %.3f dec_snr %.2f " - "dot_snr %.2f dec_wl1 %.4f\n", - exact, expected, actual, final_ratio, dec_snr, dot_snr, dec_wl1); - } - // Final values are not too far apart. - HWY_ASSERT(gcpp::IsInside(0.88f, 1.0f, final_ratio)); - // Decompressed and uncompressed dot should match exactly. - HWY_ASSERT(gcpp::IsNear(expected, actual, 1E-4f)); - // Geomean of ratios for each i should be very close to one. - HWY_ASSERT(dot_snr >= (isBF ? 17.7 : 14.3)); - - // dec[] is close to in[], but we already check that in TestStream with the - // same input distribution. - HWY_ASSERT(gcpp::IsNear(13.1, dec_snr, 0.1)); - HWY_ASSERT(gcpp::IsNear(0.034, dec_wl1, 0.001)); - HWY_ASSERT(gcpp::IsNear(23.5, dec_stats.SumL1(), 0.1)); - HWY_ASSERT(dec_stats.NumSignFlip() < num / kClusters); - HWY_ASSERT_EQ(0, dec_stats.NumExact()); - HWY_ASSERT_EQ(0, dec_stats.NumRoundedToZero()); - HWY_ASSERT_EQ(0.0, dec_stats.SumL1Rounded()); - // Absolute decode errors are in [0, 0.11], and somewhat right-tailed. 
- HWY_ASSERT(gcpp::IsInside(0.0f, 2E-5f, dec_stats.L1().Min())); - HWY_ASSERT(gcpp::IsInside(0.09f, 0.11f, dec_stats.L1().Max())); - HWY_ASSERT(gcpp::IsInside(0.02, 0.03, dec_stats.L1().Mean())); - HWY_ASSERT(gcpp::IsInside(1.0, 1.1, dec_stats.L1().Skewness())); - HWY_ASSERT(gcpp::IsInside(4.0, 5.0, dec_stats.L1().Kurtosis())); - static_assert(kGroupSize == 256, "Update expected*"); - } -}; - -void TestAllDotF32() { - const hn::ForGEVectors<128, TestDot> test; - test(float()); -} -void TestAllDotBF16() { - const hn::ForGEVectors<128, TestDot> test; - test(hwy::bfloat16_t()); -} +void TestEncDecBF16() { hn::ForGEVectors<128, TestEncDec>()(BF16()); } +void TestEncDecF32() { hn::ForGEVectors<128, TestEncDec>()(float()); } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -497,23 +401,17 @@ void TestAllDotBF16() { HWY_AFTER_NAMESPACE(); #if HWY_ONCE - namespace gcpp { HWY_BEFORE_TEST(NuqTest); HWY_EXPORT_AND_TEST_P(NuqTest, TestAllFlat); HWY_EXPORT_AND_TEST_P(NuqTest, TestAllPlateaus); HWY_EXPORT_AND_TEST_P(NuqTest, TestAllRamp); HWY_EXPORT_AND_TEST_P(NuqTest, TestAllNormal); -HWY_EXPORT_AND_TEST_P(NuqTest, TestAllOffsetF32); -HWY_EXPORT_AND_TEST_P(NuqTest, TestAllOffsetBF16); +HWY_EXPORT_AND_TEST_P(NuqTest, TestOffsetBF16); +HWY_EXPORT_AND_TEST_P(NuqTest, TestOffsetF32); HWY_EXPORT_AND_TEST_P(NuqTest, TestAllNibble); -HWY_EXPORT_AND_TEST_P(NuqTest, TestAllStreamF32); -HWY_EXPORT_AND_TEST_P(NuqTest, TestAllStreamBF16); -HWY_EXPORT_AND_TEST_P(NuqTest, TestAllDotF32); -HWY_EXPORT_AND_TEST_P(NuqTest, TestAllDotBF16); -#ifdef HWY_AFTER_TEST +HWY_EXPORT_AND_TEST_P(NuqTest, TestEncDecBF16); +HWY_EXPORT_AND_TEST_P(NuqTest, TestEncDecF32); HWY_AFTER_TEST(); -#endif } // namespace gcpp - -#endif +#endif // HWY_ONCE diff --git a/compression/sfp-inl.h b/compression/sfp-inl.h index 78c941f..1be84e9 100644 --- a/compression/sfp-inl.h +++ b/compression/sfp-inl.h @@ -52,9 +52,6 @@ HWY_INLINE hn::Mask SignedLt(DU du, hn::Vec a, hn::Vec b) { return SignedGt(du, b, a); } -// Saturated subtraction; returns 0 if the result would be negative. -static inline size_t SubOr0(size_t a, size_t b) { return a > b ? a - b : 0; } - // Encode/decode functions. class SfpCodec { public: @@ -260,9 +257,9 @@ class SfpCodec { } // Encodes `num` bf16 values from `in_bf` to `out_packed`. Their magnitude - // must be at most 1.875. + // must be at most SfpStream::kMax. template - static HWY_INLINE void Enc(DBF dbf, const hwy::bfloat16_t* HWY_RESTRICT in_bf, + static HWY_INLINE void Enc(DBF dbf, const BF16* HWY_RESTRICT in_bf, size_t num, SfpStream* HWY_RESTRICT out_packed) { const hn::Repartition d8; using V8 = hn::Vec; @@ -280,7 +277,7 @@ class SfpCodec { const size_t remaining = num - i; HWY_DASSERT(remaining < 2 * N16); if (remaining != 0) { - HWY_ALIGN hwy::bfloat16_t padded[2 * hn::MaxLanes(dbf)]; + HWY_ALIGN BF16 padded[2 * hn::MaxLanes(dbf)]; hwy::ZeroBytes(padded, sizeof(padded)); hwy::CopyBytes(in_bf + i, padded, remaining * sizeof(padded[0])); const V8 packed = Enc2B(dbf, padded); @@ -289,7 +286,7 @@ class SfpCodec { } // Encodes `num` f32 values from `in_f` to `packed`. Their magnitude - // must be at most 1.875. + // must be at most SfpStream::kMax. template static HWY_INLINE void Enc(DF df, const float* HWY_RESTRICT in_f, size_t num, SfpStream* HWY_RESTRICT out_packed) { @@ -317,148 +314,112 @@ class SfpCodec { } } - // Decodes `num` values from `in_packed` to `out_bf`. 
+ template >> + static HWY_INLINE void Dec2(DBF16 dbf16, V8 packed, hn::Vec& raw0, + hn::Vec& raw1) { + Dec2B(dbf16, packed, raw0, raw1); + } + + template >>> + static HWY_INLINE void Dec2(DF df, V8 packed, hn::Vec& raw0, + hn::Vec& raw1) { + const hn::Rebind dbf; // half-vector + using VBF = hn::Vec; + VBF bf0, bf1; + Dec2B(dbf, packed, bf0, bf1); + raw0 = hn::PromoteTo(df, bf0); + raw1 = hn::PromoteTo(df, bf1); + } + + // Decompresses to (arbitrary) `num` BF16 elements in `raw_bf`, then appends + // `[0, hn::Lanes(dbf))` zeroes as required to round `num` up to one vector, + // if it is not already. DBF argument is provided by nuq-inl.h. template - static HWY_INLINE void Dec(DBF dbf, const SfpStream* HWY_RESTRICT in_packed, - size_t num, hwy::bfloat16_t* HWY_RESTRICT out_bf) { + static HWY_INLINE void DecompressAndZeroPad( + DBF dbf, const PackedSpan& packed, size_t packed_ofs, + BF16* HWY_RESTRICT raw_bf, size_t num) { const hn::Repartition d8; using V8 = hn::Vec; using VBF = hn::Vec; const size_t N16 = hn::Lanes(dbf); + const uint8_t* HWY_RESTRICT base = &packed.ptr->byte + packed_ofs; + size_t i = 0; if (num >= 2 * N16) { HWY_UNROLL(1) for (; i <= num - 2 * N16; i += 2 * N16) { - const V8 packed = hn::LoadU(d8, &in_packed->byte + i); + const V8 packed = hn::LoadU(d8, base + i); VBF bf0, bf1; Dec2B(dbf, packed, bf0, bf1); - hn::StoreU(bf0, dbf, out_bf + i); - hn::StoreU(bf1, dbf, out_bf + i + N16); + hn::StoreU(bf0, dbf, raw_bf + i); + hn::StoreU(bf1, dbf, raw_bf + i + N16); } } const size_t remaining = num - i; HWY_DASSERT(remaining < 2 * N16); if (remaining != 0) { - const V8 packed = hn::LoadN(d8, &in_packed->byte + i, remaining); + const V8 packed = hn::LoadN(d8, base + i, remaining); VBF bf0, bf1; Dec2B(dbf, packed, bf0, bf1); - hn::StoreN(bf0, dbf, out_bf + i, remaining); - hn::StoreN(bf1, dbf, out_bf + i + N16, SubOr0(remaining, N16)); + // If at most one vector, the first store adds zero padding. Check before + // storing the second, because callers only pad to one vector. + hn::StoreU(bf0, dbf, raw_bf + i); + if (remaining > N16) hn::StoreU(bf1, dbf, raw_bf + i + N16); } } - // Decodes `num` values from `in_packed` to `out_f`. + // Decompresses to (arbitrary) `num` float elements in `raw_f`, then appends + // `[0, hn::Lanes(df))` zeroes as required to round `num` up to one vector, + // if it is not already. 
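+ // SFP stores one byte per value, so `packed_ofs` is also the byte offset
+ // into the stream.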
template - static HWY_INLINE void Dec(DF df, const SfpStream* HWY_RESTRICT in_packed, - size_t num, float* HWY_RESTRICT out_f) { + static HWY_INLINE void DecompressAndZeroPad( + DF df, const PackedSpan& packed, size_t packed_ofs, + float* HWY_RESTRICT raw_f, size_t num) { const hn::Repartition d8; using V8 = hn::Vec; using VF = hn::Vec; const size_t NF = hn::Lanes(df); + const uint8_t* HWY_RESTRICT base = &packed.ptr->byte + packed_ofs; + size_t i = 0; if (num >= 4 * NF) { HWY_UNROLL(1) for (; i <= num - 4 * NF; i += 4 * NF) { - const V8 packed = hn::LoadU(d8, &in_packed->byte + i); + const V8 packed = hn::LoadU(d8, base + i); VF f0, f1, f2, f3; Dec4F(df, packed, f0, f1, f2, f3); - hn::StoreU(f0, df, out_f + i + NF * 0); - hn::StoreU(f1, df, out_f + i + NF * 1); - hn::StoreU(f2, df, out_f + i + NF * 2); - hn::StoreU(f3, df, out_f + i + NF * 3); + hn::StoreU(f0, df, raw_f + i + NF * 0); + hn::StoreU(f1, df, raw_f + i + NF * 1); + hn::StoreU(f2, df, raw_f + i + NF * 2); + hn::StoreU(f3, df, raw_f + i + NF * 3); } } const size_t remaining = num - i; HWY_DASSERT(remaining < 4 * NF); - if (remaining != 0) { - const V8 packed = hn::LoadN(d8, &in_packed->byte + i, remaining); + if (HWY_UNLIKELY(remaining != 0)) { + const V8 packed = hn::LoadN(d8, base + i, remaining); VF f0, f1, f2, f3; Dec4F(df, packed, f0, f1, f2, f3); - hn::StoreN(f0, df, out_f + i + 0 * NF, remaining); - hn::StoreN(f1, df, out_f + i + 1 * NF, SubOr0(remaining, 1 * NF)); - hn::StoreN(f2, df, out_f + i + 2 * NF, SubOr0(remaining, 2 * NF)); - hn::StoreN(f3, df, out_f + i + 3 * NF, SubOr0(remaining, 3 * NF)); + // We are only guaranteed one vector of padding, so cannot unconditionally + // store four vectors. `StoreN` would work, at the cost of saturated + // subtraction and creating masks. Because we know that `raw_f` is padded + // to at least one vector, we can instead store entire vectors and only + // make the address conditional, which potentially avoids branches. + // Separate per-vector storage may avoid conflicts. + HWY_ALIGN float buf[4 * hn::MaxLanes(df)]; + hn::StoreU(f0, df, raw_f + i); + hn::StoreU(f1, df, (remaining > 1 * NF ? (raw_f + i) : buf) + 1 * NF); + hn::StoreU(f2, df, (remaining > 2 * NF ? (raw_f + i) : buf) + 2 * NF); + hn::StoreU(f3, df, (remaining > 3 * NF ? (raw_f + i) : buf) + 3 * NF); } } - // Fused decode and dot product with even-odd bf16 into four f32 accumulators. - template - static HWY_INLINE void DotEO(DF df, const SfpStream* HWY_RESTRICT in_packed, - size_t num, - const hwy::bfloat16_t* HWY_RESTRICT vec_aligned, - hn::Vec& sum0, hn::Vec& sum1, - hn::Vec& sum2, hn::Vec& sum3) { - const hn::Repartition d8; - const hn::Repartition dbf; - using V8 = hn::Vec; - using VBF = hn::Vec; - const size_t N16 = hn::Lanes(dbf); - HWY_DASSERT(num % (2 * N16) == 0); // whole SFP vector -> 2x bf16 - - HWY_UNROLL(1) - for (size_t i = 0; i < num; i += 2 * N16) { - const V8 packed = hn::LoadU(d8, &in_packed->byte + i); - const VBF ve = hn::LoadU(dbf, vec_aligned + i); - const VBF vo = hn::LoadU(dbf, vec_aligned + i + N16); - VBF be, bo; - DecEvenOdd(dbf, packed, be, bo); - sum0 = hn::ReorderWidenMulAccumulate(df, be, ve, sum0, sum1); - sum2 = hn::ReorderWidenMulAccumulate(df, bo, vo, sum2, sum3); - } - } - - // Fused decode and dot product with even-odd f32 into four f32 accumulators. 
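// A minimal usage sketch of the new `DecompressAndZeroPad` (hypothetical names
// `num` and `packed`; analogous to `TestOrder` in sfp_test.cc below). The
// destination is rounded up to a whole vector so the zero padding always fits:
//
//   const hn::ScalableTag<float> df;
//   const size_t padded = hwy::RoundUpTo(num, hn::Lanes(df));
//   auto raw = hwy::AllocateAligned<float>(padded);
//   SfpCodec::DecompressAndZeroPad(df, MakeConstSpan(packed, num),
//                                  /*packed_ofs=*/0, raw.get(), num);
//   // raw[num, padded) is now zero, so whole-vector loads need no masking.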
- template - static HWY_INLINE void DotEO(DF df, const SfpStream* HWY_RESTRICT in_packed, - size_t num, - const float* HWY_RESTRICT vec_aligned, - hn::Vec& sum0, hn::Vec& sum1, - hn::Vec& sum2, hn::Vec& sum3) { - const hn::Repartition d8; - using V8 = hn::Vec; - using VF = hn::Vec; - const size_t NF = hn::Lanes(df); - HWY_DASSERT(num % (4 * NF) == 0); // whole SFP vector -> 4x f32 - - HWY_UNROLL(1) - for (size_t i = 0; i < num; i += 4 * NF) { - const V8 packed = hn::LoadU(d8, &in_packed->byte + i); - const VF ve0 = hn::LoadU(df, vec_aligned + i + NF * 0); - const VF vo0 = hn::LoadU(df, vec_aligned + i + NF * 1); - const VF ve1 = hn::LoadU(df, vec_aligned + i + NF * 2); - const VF vo1 = hn::LoadU(df, vec_aligned + i + NF * 3); - VF fe0, fo0, fe1, fo1; - DecEvenOddF(df, packed, fe0, fo0, fe1, fo1); - sum0 = hn::MulAdd(fe0, ve0, sum0); - sum1 = hn::MulAdd(fo0, vo0, sum1); - sum2 = hn::MulAdd(fe1, ve1, sum2); - sum3 = hn::MulAdd(fo1, vo1, sum3); - } - } - - template >>> - static HWY_INLINE void Dec2(DF df, V8 packed, hn::Vec& f0, - hn::Vec& f1) { - const hn::Rebind dbf; - using VBF = hn::Vec; - VBF bf0, bf1; - Dec2B(dbf, packed, bf0, bf1); - f0 = hn::PromoteTo(df, bf0); - f1 = hn::PromoteTo(df, bf1); - } - - template >> - static HWY_INLINE void Dec2(DBF16 dbf16, V8 packed, hn::Vec& bf0, - hn::Vec& bf1) { - Dec2B(dbf16, packed, bf0, bf1); - } - private: // Wrappers to avoid code duplication across float/bf16 input types and // the main loop/remainder. @@ -479,7 +440,7 @@ class SfpCodec { template >> - static HWY_INLINE V8 Enc2B(DBF dbf, const hwy::bfloat16_t* HWY_RESTRICT in) { + static HWY_INLINE V8 Enc2B(DBF dbf, const BF16* HWY_RESTRICT in) { const hn::Repartition d16; const size_t N16 = hn::Lanes(d16); using V16 = hn::Vec; @@ -505,7 +466,7 @@ class SfpCodec { class V8 = hn::Vec>> static HWY_INLINE V8 Enc4F(DF df, const float* HWY_RESTRICT in) { const hn::Repartition d16; - const hn::Repartition dbf; + const hn::Repartition dbf; using VF = hn::Vec; using V16 = hn::Vec; const size_t NF = hn::Lanes(df); @@ -549,7 +510,7 @@ class SfpCodec { static HWY_INLINE void Dec4F(DF df, V8 packed, hn::Vec& f0, hn::Vec& f1, hn::Vec& f2, hn::Vec& f3) { - const hn::Repartition dbf; + const hn::Repartition dbf; using VBF = hn::Vec; VBF bf0, bf1; Dec2B(dbf, packed, bf0, bf1); @@ -559,6 +520,7 @@ class SfpCodec { f3 = hn::PromoteUpperTo(df, bf1); } + // TODO: currently unused, but keep for potential later MatMul packing. 
template >> static HWY_INLINE void DecEvenOdd(DBF dbf, V8 packed, hn::Vec& even, @@ -576,7 +538,7 @@ class SfpCodec { static HWY_INLINE void DecEvenOddF(DF df, V8 packed, hn::Vec& even0, hn::Vec& odd0, hn::Vec& even1, hn::Vec& odd1) { - const hn::Repartition dbf; + const hn::Repartition dbf; using VBF = hn::Vec; VBF even_bf, odd_bf; DecEvenOdd(dbf, packed, even_bf, odd_bf); diff --git a/compression/sfp_test.cc b/compression/sfp_test.cc index da4f220..f79e600 100644 --- a/compression/sfp_test.cc +++ b/compression/sfp_test.cc @@ -39,7 +39,6 @@ #include "hwy/highway.h" // After highway.h #include "compression/sfp-inl.h" -#include "ops/dot-inl.h" #include "hwy/tests/test_util-inl.h" HWY_BEFORE_NAMESPACE(); @@ -128,7 +127,7 @@ void TestAllFastDecode() { // Encode HWY_INLINE uint32_t SFP8FromF32(float f) { - HWY_ASSERT(-1.875f <= f && f <= 1.875f); + HWY_ASSERT(-SfpStream::kMax <= f && f <= SfpStream::kMax); constexpr uint32_t kMaskM = hwy::MantissaMask(); uint32_t binary32; @@ -182,7 +181,7 @@ struct TestDecEnc { template HWY_INLINE void operator()(T /*unused*/, D d) { const hn::RepartitionToWide d16; - const hn::Rebind dbf; + const hn::Rebind dbf; const hn::Repartition df; for (uint32_t encoded = 0; encoded < 256; ++encoded) { if (encoded == 0x80) continue; // -0 is reserved @@ -215,7 +214,7 @@ struct TestGolden { template HWY_INLINE void operator()(T /*unused*/, D d) { const hn::Repartition df; - const hn::Repartition dbf; + const hn::Repartition dbf; const hn::RebindToUnsigned d16; struct Golden { @@ -294,9 +293,53 @@ void TestAllGolden() { TestGolden()(uint8_t(), hn::ScalableTag()); } +// ------------------------------ Order + +// Store 8-bit iota, decode, encode, check iota == packed. This ensures +// Enc/Dec are preserving the order independent of vector length. +struct TestOrder { + template + HWY_INLINE void operator()(T /*unused*/, DBF dbf) { + const size_t N16 = hn::Lanes(dbf); + + for (size_t num = 1; num < 6 * N16; ++num) { + const size_t padded = hwy::RoundUpTo(num, N16); + + auto iota = hwy::AllocateAligned(num); + auto packed = hwy::AllocateAligned(num); + auto bf = hwy::AllocateAligned(padded); + HWY_ASSERT(iota && packed && bf); + for (size_t i = 0; i < num; ++i) { + // Clear sign bit so we can also check that bf is in ascending order. + iota[i].byte = i & 127; + } + + SfpCodec::DecompressAndZeroPad(dbf, MakeConstSpan(iota.get(), num), 0, + bf.get(), num); + for (size_t i = num; i < padded; ++i) { + if (hwy::ConvertScalarTo(bf[i]) != 0.0f) { + HWY_ABORT("num %zu padded %zu i %zu: not padded", num, padded, i); + } + } + + SfpCodec::Enc(dbf, bf.get(), num, packed.get()); + + for (size_t i = 0; i < num; ++i) { + if (iota[i].byte != packed[i].byte) { + HWY_ABORT("@%zu: %d %d\n", i, iota[i].byte, packed[i].byte); + } + } + } + } +}; + +void TestAllOrder() { hn::ForGEVectors<32, TestOrder>()(BF16()); } + // ------------------------------ Foreach bf16 input -// Generate all values, encode, decode back. +// Checks the distortion from an encode and decode round trip. Unlike +// `TestShortLengthsT` in compress_test, this covers large `num` and +// prints the enc/dec throughput. 
struct TestEncDec { template HWY_INLINE void operator()(T /*unused*/, DBF dbf) { @@ -309,14 +352,14 @@ struct TestEncDec { auto in = hwy::AllocateAligned(max); auto packed = hwy::AllocateAligned(max); - auto dec = hwy::AllocateAligned(max); + auto dec = hwy::AllocateAligned(max); // already padded HWY_ASSERT(in && packed && dec); size_t num = 0; for (size_t i = 0; i < max; ++i) { const uint16_t bits = i * kStep; const float f = hwy::F32FromBF16(hwy::BitCastScalar(bits)); // Keep if within range - if (hwy::ScalarIsFinite(f) && f <= 1.875f) { + if (hwy::ScalarIsFinite(f) && f <= SfpStream::kMax) { in[num] = hwy::BF16FromF32(f); in[num + 1] = hwy::BF16FromF32(-f); num += 2; @@ -329,7 +372,8 @@ struct TestEncDec { const double t0 = hwy::platform::Now(); SfpCodec::Enc(dbf, in.get(), num, packed.get()); const double t1 = hwy::platform::Now(); - SfpCodec::Dec(dbf, packed.get(), num, dec.get()); + SfpCodec::DecompressAndZeroPad(dbf, MakeConstSpan(packed.get(), num), 0, + dec.get(), num); const double t2 = hwy::platform::Now(); enc_elapsed = HWY_MIN(enc_elapsed, t1 - t0); dec_elapsed = HWY_MIN(dec_elapsed, t2 - t1); @@ -358,9 +402,10 @@ struct TestEncDec { stats.SumL1Rounded(), snr, wl1); } HWY_ASSERT(stats.Original().Count() == stats.L1().Count()); - // Inputs are in [-1.875, 1.875], symmetric, and heavy-tailed. - HWY_ASSERT(stats.Original().Min() == -1.875f); - HWY_ASSERT(stats.Original().Max() == 1.875f); + // Inputs are in [-SfpStream::kMax, SfpStream::kMax], symmetric, and + // heavy-tailed. + HWY_ASSERT(stats.Original().Min() == -SfpStream::kMax); + HWY_ASSERT(stats.Original().Max() == SfpStream::kMax); HWY_ASSERT(gcpp::IsInside(-1E-6, 1E-6, stats.Original().Mean())); HWY_ASSERT(gcpp::IsInside(-1E-6, 1E-6, stats.Original().Skewness())); HWY_ASSERT(gcpp::IsInside(80.0, 100.0, stats.Original().Kurtosis())); @@ -382,179 +427,7 @@ struct TestEncDec { } }; -void TestAllEncDec() { hn::ForGEVectors<32, TestEncDec>()(hwy::bfloat16_t()); } - -// ------------------------------ Order - -// Store 8-bit iota, decode, encode, check iota == packed. This ensures -// Enc/Dec are preserving the order independent of vector length. -struct TestOrder { - template - HWY_INLINE void operator()(T /*unused*/, DBF dbf) { - const hn::Repartition du8; - - const size_t num = 10 * hn::Lanes(du8) / 3; - - auto iota = hwy::AllocateAligned(num); - auto packed = hwy::AllocateAligned(num); - auto bf = hwy::AllocateAligned(num); - HWY_ASSERT(iota && packed && bf); - for (size_t i = 0; i < num; ++i) { - // Clear sign bit so we can also check that bf is in ascending order. - iota[i].byte = i & 127; - } - - SfpCodec::Dec(dbf, iota.get(), num, bf.get()); - SfpCodec::Enc(dbf, bf.get(), num, packed.get()); - - for (size_t i = 0; i < num; ++i) { - if (iota[i].byte != packed[i].byte) { - HWY_ABORT("@%zu: %d %d\n", i, iota[i].byte, packed[i].byte); - } - } - } -}; - -void TestAllOrder() { hn::ForGEVectors<32, TestOrder>()(hwy::bfloat16_t()); } - -// ------------------------------ Dot - -struct TestDot { - template - HWY_INLINE void operator()(T /*unused*/, D d) { - const hn::Repartition df; - const size_t num = 1024; // not too many for GeometricMean overflow. - const size_t N = hn::Lanes(d); - auto in = hwy::AllocateAligned(num); - auto dec = hwy::AllocateAligned(num); - auto vec = hwy::AllocateAligned(num); - auto vec_eo = hwy::AllocateAligned(num); - auto sfp = hwy::AllocateAligned(num); - HWY_ASSERT(in && dec && vec && vec_eo && sfp); - - // Generate inputs and verify their distribution. 
- hwy::RandomState rng; - hwy::Stats in_stats; - for (size_t i = 0; i < num; ++i) { - const float r = static_cast(RandomGaussian(rng)); - in_stats.Notify(r); - in[i] = hwy::ConvertScalarTo(r); - } - for (size_t i = 0; i < num; ++i) { - const float r = static_cast(RandomGaussian(rng)); - in_stats.Notify(r); - vec[i] = hwy::ConvertScalarTo(r); - } - VerifyGaussian(in_stats); - - // Convert vec to even/odd for DotEO - for (size_t i = 0; i < num; i += 2 * N) { - hn::Vec ve, vo; - hn::LoadInterleaved2(d, vec.get() + i, ve, vo); - hn::Store(ve, d, vec_eo.get() + i + 0); - hn::Store(vo, d, vec_eo.get() + i + N); - } - - SfpCodec::Enc(d, in.get(), num, sfp.get()); - - // Compute dot product without decompression. - float actual = 0.0f; - float actual_eo = 0.0f; - double elapsed = hwy::HighestValue(); - double elapsed_eo = hwy::HighestValue(); - for (size_t rep = 0; rep < 200; ++rep) { - { - const double t0 = hwy::platform::Now(); - actual = SimpleDot(df, sfp.get(), 0, vec.get(), num); - const double t1 = hwy::platform::Now(); - elapsed = HWY_MIN(elapsed, t1 - t0); - } - { - hn::Vec sum0 = hn::Zero(df); - hn::Vec sum1 = hn::Zero(df); - hn::Vec sum2 = hn::Zero(df); - hn::Vec sum3 = hn::Zero(df); - const double t0 = hwy::platform::Now(); - SfpCodec::DotEO(df, sfp.get(), num, vec_eo.get(), sum0, sum1, sum2, - sum3); - const double t1 = hwy::platform::Now(); - elapsed_eo = HWY_MIN(elapsed_eo, t1 - t0); - sum0 = hn::Add(hn::Add(sum0, sum1), hn::Add(sum2, sum3)); - actual_eo = hn::ReduceSum(df, sum0); - } - } - - SfpCodec::Dec(d, sfp.get(), num, dec.get()); - fprintf(stderr, "Vec %zu Dot %zu-bit %.2f ; %.2f MB/s\n", - Lanes(d) * sizeof(T), sizeof(T) * 8, - num * sizeof(T) * 1E-6 / elapsed, - num * sizeof(T) * 1E-6 / elapsed_eo); - - // Exact and decompressed dot products for comparison. - float exact = 0.0f; // using original input - float expected = 0.0f; // using decoded SFP - DistortionStats dec_stats; - hwy::Stats ratios; - for (size_t i = 0; i < num; ++i) { - const float in1 = hwy::ConvertScalarTo(in[i]); - const float dec1 = hwy::ConvertScalarTo(dec[i]); - const float vec1 = hwy::ConvertScalarTo(vec[i]); - dec_stats.Notify(in1, dec1); - - exact += in1 * vec1; - expected += dec1 * vec1; - if (expected != 0.0f) { - ratios.Notify(exact / expected); - } - } - const bool isBF = sizeof(T) == 2; - const double dec_snr = dec_stats.GeomeanValueDivL1(); - const double dec_wl1 = dec_stats.WeightedAverageL1(); - const double dot_snr = 1.0 / hwy::ScalarAbs(1.0 - ratios.GeometricMean()); - // exact and actual fluctuate due to the combination of SFP imprecision, - // and whether vec[i] is negative or positive, so this is quite loose. - const float final_ratio = HWY_MIN(exact / actual, actual / exact); - if (HWY_ONCE) { - fprintf(stderr, "ratios %s\n", ratios.ToString().c_str()); - fprintf(stderr, - "exact %.3f e2 %.4f actual %.4f final_ratio %.3f dec_snr %.2f " - "dot_snr %.2f dec_wl1 %.5f\n", - exact, expected, actual, final_ratio, dec_snr, dot_snr, dec_wl1); - } - // Final values are not too far apart. - HWY_ASSERT(gcpp::IsInside(0.87f, 1.0f, final_ratio)); - // Decompressed and uncompressed dot should match exactly. - HWY_ASSERT(gcpp::IsNear(expected, actual, 1E-4f)); - // Even/odd dot should also match - HWY_ASSERT(gcpp::IsNear(actual, actual_eo, 1E-4f)); - // Geomean of ratios for each i should be very close to one. - HWY_ASSERT(dot_snr >= (isBF ? 70.0 : 1000.0)); - - // dec[] is close to in[]. We also check that in TestEncDec, but for much - // smaller input magnitudes. - HWY_ASSERT(gcpp::IsNear(isBF ? 
51.0 : 64.0, dec_snr, 1.0)); - HWY_ASSERT(gcpp::IsNear(isBF ? 0.013 : 0.012, dec_wl1, 0.001)); - HWY_ASSERT(gcpp::IsNear(isBF ? 6.2 : 6.3, dec_stats.SumL1(), 0.1)); - HWY_ASSERT_EQ(0, dec_stats.NumSignFlip()); - HWY_ASSERT_EQ(0, dec_stats.NumRoundedToZero()); - HWY_ASSERT_EQ(0.0, dec_stats.SumL1Rounded()); - // Absolute decode errors are in [0, 5E-2], and somewhat right-tailed. - HWY_ASSERT(gcpp::IsInside(0.0f, 2E-6f, dec_stats.L1().Min())); - HWY_ASSERT(gcpp::IsInside(3E-2f, 5E-2f, dec_stats.L1().Max())); - HWY_ASSERT(gcpp::IsInside(4E-3, 7E-3, dec_stats.L1().Mean())); - HWY_ASSERT(gcpp::IsInside(1.8, 1.9, dec_stats.L1().Skewness())); - HWY_ASSERT(gcpp::IsInside(6.0, 7.0, dec_stats.L1().Kurtosis())); - } -}; - -void TestAllDotF32() { - const hn::ForGEVectors<128, TestDot> test; - test(float()); -} -void TestAllDotBF16() { - const hn::ForGEVectors<128, TestDot> test; - test(hwy::bfloat16_t()); -} +void TestAllEncDec() { hn::ForGEVectors<32, TestEncDec>()(BF16()); } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE @@ -562,7 +435,6 @@ void TestAllDotBF16() { HWY_AFTER_NAMESPACE(); #if HWY_ONCE - namespace gcpp { HWY_BEFORE_TEST(SfpTest); HWY_EXPORT_AND_TEST_P(SfpTest, PrintTables); @@ -570,13 +442,8 @@ HWY_EXPORT_AND_TEST_P(SfpTest, TestAllUnique); HWY_EXPORT_AND_TEST_P(SfpTest, TestAllFastDecode); HWY_EXPORT_AND_TEST_P(SfpTest, TestAllDecEnc); HWY_EXPORT_AND_TEST_P(SfpTest, TestAllGolden); -HWY_EXPORT_AND_TEST_P(SfpTest, TestAllEncDec); HWY_EXPORT_AND_TEST_P(SfpTest, TestAllOrder); -HWY_EXPORT_AND_TEST_P(SfpTest, TestAllDotF32); -HWY_EXPORT_AND_TEST_P(SfpTest, TestAllDotBF16); -#ifdef HWY_AFTER_TEST +HWY_EXPORT_AND_TEST_P(SfpTest, TestAllEncDec); HWY_AFTER_TEST(); -#endif } // namespace gcpp - -#endif +#endif // HWY_ONCE diff --git a/compression/shared.h b/compression/shared.h index 5f8b173..166cd29 100644 --- a/compression/shared.h +++ b/compression/shared.h @@ -20,8 +20,12 @@ #define THIRD_PARTY_GEMMA_CPP_COMPRESSION_SHARED_H_ #include +#include -#include "hwy/base.h" // hwy::bfloat16_t +#include + +#include "hwy/aligned_allocator.h" +#include "hwy/base.h" // HWY_INLINE namespace gcpp { @@ -35,25 +39,172 @@ using BF16 = hwy::bfloat16_t; // - 24-bit dynamic range, with max exponent 2^0. // - 3 bit mantissa for values >= 2^-7, otherwise 2. // -// A pointer to this is the *start* of an SFP stream. Values are stored -// in-order to enable vector-length agnostic seeking, because streams may be -// written to disk for loading on other CPUs. +// A pointer to this is the *start* of an SFP stream. Aligning the allocation +// (see aligned_allocator.h) may speed up decoding but is not required. +// +// Layout: Values are stored in-order to enable vector-length agnostic seeking, +// because streams may be written to disk for loading on other CPUs. // // This is faster to decode than a straightforward implementation of eXmY, in // part because SFP does not require subnormals. Unlike OCP MX, it also does not // require side information (shared exponents). // // Although the representation could probably be shrunk to 6-7 bits, more -// savings can be had by non-uniform clustering - see nuq.h. +// savings can be had by non-uniform clustering - see NuqStream. #pragma pack(push, 1) struct SfpStream { + // Largest possible input magnitude: 1.111 * 2^0. This could be increased by + // shifting the value range (exponent bias). + static constexpr float kMax = 1.875f; + uint8_t byte; }; #pragma pack(pop) -// Largest possible input magnitude: 1.111 * 2^0. 
This could be increased by -// shifting the value range (exponent bias). -constexpr float kMaxSFP = 1.875f; +// Returns 1.0f if all magnitudes are <= SfpStream::kMax, otherwise scales them +// such that the largest magnitude is SfpStream::kMax, and returns the +// multiplier with which to restore the original values. This is only necessary +// before compressing to SfpStream. +// TODO: vectorize +static inline float ScaleWeights(float* HWY_RESTRICT raw, size_t num) { + float maxabs = 0.0; + for (size_t i = 0; i < num; ++i) { + maxabs = HWY_MAX(maxabs, hwy::ScalarAbs(raw[i])); + } + if (maxabs <= SfpStream::kMax) { + return 1.0f; + } + const float scale = maxabs / SfpStream::kMax; + const float inv_scale = static_cast(1.0 / static_cast(scale)); + for (size_t i = 0; i < num; ++i) { + // Clamp because kMax may still be exceeded. + const float magn = + HWY_MIN(SfpStream::kMax, hwy::ScalarAbs(raw[i] * inv_scale)); + raw[i] = hwy::ScalarCopySign(magn, raw[i]); + } + return scale; +} + +// Non-uniform quantization: a compressed representation of f32 inputs that +// supports seeking at a granularity of 1 (for `DecompressAndZeroPad`) or +// two vectors (for `Decompress2`), and decoding to bf16/f32. +// +// A pointer to this is the *start* of a NUQ stream. Aligning the allocation +// (see aligned_allocator.h) may speed up decoding but is not required. +// +// Layout: first one table of kClusters entries per group, in ascending order +// of group index, then two packed indices per byte. Indices are stored +// in-order to enable vector-length agnostic decode, because streams may be +// persisted to disk and used by other CPUs. +// +// To enable parallel encoding and decoding, Enc/Dec have `offset` parameters +// which refer to the stream, NOT the raw from/to pointers, which point directly +// to the source/destination. Offsets are in units of values, NOT compressed +// bytes within the stream. +#pragma pack(push, 1) +struct NuqStream { + // 4-bit indices are a sweet spot in terms of quality per size. + static constexpr size_t kClusters = 16; + + // Number of weights that share a table. Larger = slower encode, higher error, + // smaller size (table amortized over more weights). + static constexpr size_t kGroupSize = 256; + + // Storage for dynamic programming. There are two matrices; we use separate + // allocations to avoid type punning. + template + class AlignedMatrix { + public: + AlignedMatrix() : mem_(hwy::AllocateAligned(kClusters * kGroupSize)) {} + + HWY_INLINE const T& operator()(size_t row, size_t col) const { + return mem_[row * kGroupSize + col]; + } + + HWY_INLINE T& operator()(size_t row, size_t col) { + return mem_[row * kGroupSize + col]; + } + + private: + hwy::AlignedFreeUniquePtr mem_; + }; + + // Reuse memory across calls to Enc to avoid per-call allocations. + struct ClusterBuf { + // Move-only (stored inside vector in CompressWorkingSet). + ClusterBuf() = default; + ClusterBuf(const ClusterBuf&) = delete; + ClusterBuf& operator=(const ClusterBuf&) = delete; + ClusterBuf(ClusterBuf&&) = default; + ClusterBuf& operator=(ClusterBuf&&) = default; + + void Resize(size_t new_num_groups) { + if (new_num_groups < num_groups) return; + + num_groups = new_num_groups; + centers = hwy::AllocateAligned(num_groups * kClusters); + idx = hwy::AllocateAligned(num_groups * kGroupSize); + } + + // Independent of num_groups. 
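// A worked size example, assuming the `PackedStart`/`PackedEnd` helpers
// declared below: for capacity = 4096 weights there are 4096 / kGroupSize = 16
// groups, so the per-group tables occupy 16 * kClusters = 256 bytes (already a
// multiple of 64) and the packed 4-bit indices occupy 4096 / 2 = 2048 bytes,
// i.e. about 4.5 bits per weight.
//
//   static_assert(NuqStream::PackedStart(4096) == 256, "table bytes");
//   static_assert(NuqStream::PackedEnd(4096) == 2304, "total stream bytes");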
+ AlignedMatrix costs; + AlignedMatrix argmin; + + size_t num_groups = 0; + hwy::AlignedFreeUniquePtr centers; + hwy::AlignedFreeUniquePtr idx; + }; + + // Returns offset of packed indices from the start of the stream. This matches + // the (padded) total table size because table entries are bytes. + static constexpr size_t PackedStart(size_t capacity) { + // Round up to avoid cache-line splits when loading indices. No effect on + // size as long as capacity / kGroupSize is a multiple of 4. + return hwy::RoundUpTo(hwy::DivCeil(capacity, kGroupSize) * kClusters, 64); + } + + // Returns number of NuqStream to allocate for the stream, which matches its + // size in bytes. + static constexpr size_t PackedEnd(size_t capacity) { + return PackedStart(capacity) + hwy::DivCeil(capacity, 2); // 2x 4-bit/byte + } + + uint8_t byte; +}; +#pragma pack(pop) + +template +const char* TypeName() { + using Packed = hwy::RemoveCvRef; + if constexpr (hwy::IsSame()) { + return "f32"; + } else if constexpr (hwy::IsSame()) { + return "b16"; + } else if constexpr (hwy::IsSame()) { + return "sfp"; + } else if constexpr (hwy::IsSame()) { + return "nuq"; + } else { + HWY_DASSERT(false); + return "unknown"; + } +} + +template +constexpr bool IsCompressed() { + return hwy::IsSameEither, SfpStream, NuqStream>(); +} + +// Returns the number of `MatT` elements required to store `capacity` values, +// which must not be zero. +template +constexpr size_t CompressedArrayElements(size_t capacity) { + if constexpr (hwy::IsSame, NuqStream>()) { + return NuqStream::PackedEnd(capacity); + } else { + return capacity; + } +} // Non-owning view of packed elements. Shortens argument lists. // @@ -63,13 +214,19 @@ constexpr float kMaxSFP = 1.875f; // reusing `hwy::Span`. template struct PackedSpan { - void BoundsCheck(size_t packed_ofs, size_t num) const { - HWY_DASSERT(packed_ofs + num <= size); - (void)size; + // Ensures callers can read or write `num_accessible` elements starting at + // `packed_ofs`. + void BoundsCheck(size_t packed_ofs, size_t num_accessible) const { + // For NUQ, there can be fewer Packed than the number of elements, hence + // check the compressed count and ensure we have that many. + const size_t required = + CompressedArrayElements(packed_ofs + num_accessible); + HWY_DASSERT(num >= required); + (void)required; } Packed* HWY_RESTRICT ptr; - size_t size; // for BoundsCheck and nuq-inl.h HWY_ASSERT. + size_t num; // for BoundsCheck and nuq-inl.h HWY_ASSERT. }; // Avoids spelling out the template parameter in every call. @@ -87,7 +244,7 @@ HWY_INLINE PackedSpan MakeConstSpan(Packed* ptr, size_t size) { // `RMSNormInplace` and compression tests. template HWY_INLINE PackedSpan MakeConst(PackedSpan packed) { - return {packed.ptr, packed.size}; + return {packed.ptr, packed.num}; } } // namespace gcpp diff --git a/compression/test_util-inl.h b/compression/test_util-inl.h new file mode 100644 index 0000000..1f591ff --- /dev/null +++ b/compression/test_util-inl.h @@ -0,0 +1,68 @@ +// Copyright 2024 Google LLC +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Include guard for headers. +#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_INL_H_ +#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_INL_H_ + +// IWYU pragma: begin_exports +#include "compression/compress.h" +#include "compression/distortion.h" +// IWYU pragma: end_exports + +#endif // THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_INL_H_ + +// Include guard for (potentially) SIMD code. +#if defined(THIRD_PARTY_GEMMA_CPP_COMPRESS_TEST_UTIL_TOGGLE) == \ + defined(HWY_TARGET_TOGGLE) // NOLINT +#ifdef THIRD_PARTY_GEMMA_CPP_COMPRESS_TEST_UTIL_TOGGLE +#undef THIRD_PARTY_GEMMA_CPP_COMPRESS_TEST_UTIL_TOGGLE +#else +#define THIRD_PARTY_GEMMA_CPP_COMPRESS_TEST_UTIL_TOGGLE +#endif + +#include "hwy/highway.h" +// After highway.h +#include "compression/compress-inl.h" +#include "hwy/tests/test_util-inl.h" // IWYU pragma: export + +HWY_BEFORE_NAMESPACE(); +namespace gcpp { +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; + +// `Packed` is the type passed to `TestT`. +template class TestT> +void ForeachRawType() { + const hn::ForGEVectors<128, TestT> test; + // The argument selects the type to decode to: BF16 or float. + test(BF16()); + test(float()); +} + +template