diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7cc586b..c0c749b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,12 +22,12 @@ set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f)
+FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway.git GIT_TAG da250571a45826b21eebbddc1e50d0c1137dee5f EXCLUDE_FROM_ALL)
 FetchContent_MakeAvailable(highway)
 
 ## Note: absl needs to be installed by sentencepiece. This will only happen if
 ## cmake is invoked with -DSPM_ENABLE_SHARED=OFF and -DSPM_ABSL_PROVIDER=module
-FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c)
+FetchContent_Declare(sentencepiece GIT_REPOSITORY https://github.com/google/sentencepiece GIT_TAG 53de76561cfc149d3c01037f0595669ad32a5e7c EXCLUDE_FROM_ALL)
 FetchContent_MakeAvailable(sentencepiece)
 
 set(SOURCES
@@ -91,15 +91,6 @@ set(GEMMA_TEST_FILES
   ops_test.cc
 )
 
-include(FetchContent)
-FetchContent_Declare(
-  googletest
-  URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip
-)
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-FetchContent_MakeAvailable(googletest)
-include(GoogleTest)
-
 foreach (TESTFILE IN LISTS GEMMA_TEST_FILES)
   # The TESTNAME is the name without the extension or directory.
   get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
diff --git a/ops.h b/ops.h
index 0959c02..263e091 100644
--- a/ops.h
+++ b/ops.h
@@ -88,56 +88,91 @@ template <size_t kOuter>
 HWY_INLINE constexpr size_t RowsPerStrip() {
   // Aim for 128 work items to reduce pool overhead. Must be at least one
   // vector; prefer a power of two for faster division.
+  constexpr size_t kLanes = hn::ScalableTag<float>().MaxLanes();
   constexpr size_t kRowsPerStrip =
-      HWY_MAX(hn::ScalableTag<float>().MaxLanes(),
-              1ULL << hwy::FloorLog2(kOuter / 128));
+      kOuter < 128 ? kLanes
+                   : HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(kOuter / 128));
   return kRowsPerStrip;
 }
 
 // Simple version without tiling nor threading.
-template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
-          typename VecT>
-HWY_INLINE void MatVecLoop(const CompressedArray<MatT, kCapacity>& mat,
-                           const size_t mat_ofs,
-                           const VecT* HWY_RESTRICT vec_aligned,
-                           float* HWY_RESTRICT out) {
-  PROFILER_ZONE("MatVecLoop");
+template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
+          typename VecT, typename AddT>
+HWY_INLINE void MatVecAddLoop(const ArrayT& mat, const size_t mat_ofs,
+                              const VecT* HWY_RESTRICT vec_aligned,
+                              const AddT* HWY_RESTRICT add,
+                              float* HWY_RESTRICT out) {
+  PROFILER_ZONE("MatVecAddLoop");
   const hn::ScalableTag<float> df;
 
   for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
     const size_t row_ofs = mat_ofs + idx_row * kInner;
-    out[idx_row] = Dot(df, mat, row_ofs, vec_aligned, kInner);
+    if constexpr (kAdd) {
+      out[idx_row] = hwy::ConvertScalarTo<float>(add[idx_row]) +
+                     Dot(df, mat, row_ofs, vec_aligned, kInner);
+    } else {
+      out[idx_row] = Dot(df, mat, row_ofs, vec_aligned, kInner);
+    }
   }
 }
 
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
+HWY_INLINE void MatVecLoop(const ArrayT& mat, const size_t mat_ofs,
+                           const VecT* HWY_RESTRICT vec_aligned,
+                           float* HWY_RESTRICT out) {
+  MatVecAddLoop</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+      mat, mat_ofs, vec_aligned, /*add=*/nullptr, out);
+}
+
 // Simple version without tiling nor threading, but two offsets/outputs.
-template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
-          typename VecT>
-HWY_INLINE void TwoOfsMatVecLoop(const CompressedArray<MatT, kCapacity>& mat,
-                                 const size_t mat_ofs0, const size_t mat_ofs1,
-                                 const VecT* HWY_RESTRICT vec_aligned,
-                                 float* HWY_RESTRICT out0,
-                                 float* HWY_RESTRICT out1) {
+template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
+          typename VecT, typename AddT>
+HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0,
+                                    const size_t mat_ofs1,
+                                    const VecT* HWY_RESTRICT vec_aligned,
+                                    const AddT* HWY_RESTRICT add0,
+                                    const AddT* HWY_RESTRICT add1,
+                                    float* HWY_RESTRICT out0,
+                                    float* HWY_RESTRICT out1) {
   PROFILER_ZONE("MatVecLoop");
   const hn::ScalableTag<float> df;
 
   for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
     const size_t row_ofs0 = mat_ofs0 + (idx_row)*kInner;
     const size_t row_ofs1 = mat_ofs1 + (idx_row)*kInner;
-    out0[idx_row] = Dot(df, mat, row_ofs0, vec_aligned, kInner);
-    out1[idx_row] = Dot(df, mat, row_ofs1, vec_aligned, kInner);
+    if constexpr (kAdd) {
+      out0[idx_row] = hwy::ConvertScalarTo<float>(add0[idx_row]) +
+                      Dot(df, mat, row_ofs0, vec_aligned, kInner);
+      out1[idx_row] = hwy::ConvertScalarTo<float>(add1[idx_row]) +
+                      Dot(df, mat, row_ofs1, vec_aligned, kInner);
+    } else {
+      out0[idx_row] = Dot(df, mat, row_ofs0, vec_aligned, kInner);
+      out1[idx_row] = Dot(df, mat, row_ofs1, vec_aligned, kInner);
+    }
   }
 }
 
+// Simple version without tiling nor threading, but two offsets/outputs.
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
+HWY_INLINE void TwoOfsMatVecLoop(const ArrayT& mat, const size_t mat_ofs0,
+                                 const size_t mat_ofs1,
+                                 const VecT* HWY_RESTRICT vec_aligned,
+                                 float* HWY_RESTRICT out0,
+                                 float* HWY_RESTRICT out1) {
+  TwoOfsMatVecAddLoop</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+      mat, mat_ofs0, mat_ofs1, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
+      out0, out1);
+}
+
 namespace detail {
 
 // For each i = [0, num_rows), compute partial (length `num_cols`) dot product
 // of row i with `vec_aligned` and add into `out[i]`. The upper-left coordinate
 // of the tile is r0, c0.
-template <class DF, typename MatT, size_t kCapacity, typename VecT>
+template <class DF, typename ArrayT, typename VecT>
 HWY_INLINE void AccumulatePartialDotProducts(
-    DF df, const CompressedArray<MatT, kCapacity>& mat, size_t mat_ofs,
-    size_t mat_stride, size_t r0, size_t c0, size_t num_rows, size_t num_cols,
+    DF df, const ArrayT& mat, size_t mat_ofs, size_t mat_stride, size_t r0,
+    size_t c0, size_t num_rows, size_t num_cols,
     const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
   for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
     const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
@@ -145,37 +180,49 @@ HWY_INLINE void AccumulatePartialDotProducts(
   }
 }
 
-// Same as above, but sets out[i] to the first partial dot product, which
-// avoids having to zero-initialize and accumulate.
-template <class DF, typename MatT, size_t kCapacity, typename VecT>
-HWY_INLINE void SetFirstPartialDotProducts(
-    DF df, const CompressedArray<MatT, kCapacity>& mat, size_t mat_ofs,
-    size_t mat_stride, size_t r0, size_t c0, size_t num_rows, size_t num_cols,
-    const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
+// Same as above, but sets out[i] to the first partial dot product +
+// init (if kInit), which avoids having to zero-initialize and accumulate.
+template <bool kInit, class DF, typename ArrayT, typename VecT,
+          typename InitT>
+HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat,
+                                           size_t mat_ofs, size_t mat_stride,
+                                           size_t r0, size_t c0,
+                                           size_t num_rows, size_t num_cols,
+                                           const VecT* HWY_RESTRICT vec_aligned,
+                                           const InitT* HWY_RESTRICT init,
+                                           float* HWY_RESTRICT out) {
   for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
     const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
-    out[idx_row] = Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
+    if constexpr (kInit) {
+      out[idx_row] = hwy::ConvertScalarTo<float>(init[idx_row + r0]) +
+                     Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
+    } else {
+      out[idx_row] = Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
+    }
   }
 }
 
 // Adds together partial dot products for all tiles with the same r0 (a
 // horizontal strip of the entire matrix); the result is the full dot product
-// for rows r in [r0, r0 + num_rows), which we store into in out[r - r0].
-template <class DF, typename MatT, size_t kCapacity, typename VecT>
-HWY_INLINE void FullDotProductsForStrip(
-    DF df, const CompressedArray<MatT, kCapacity>& mat, size_t mat_ofs,
-    size_t mat_stride, size_t r0, size_t num_rows,
-    const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
+// for rows r in [r0, r0 + num_rows) + optionally the add vector, which we
+// store into in out[r - r0].
+template <bool kAdd, class DF, typename ArrayT, typename VecT, typename AddT>
+HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat,
+                                        size_t mat_ofs, size_t mat_stride,
+                                        size_t r0, size_t num_rows,
+                                        const VecT* HWY_RESTRICT vec_aligned,
+                                        const AddT* HWY_RESTRICT add,
+                                        float* HWY_RESTRICT out) {
   // Tall and skinny: set `out` to the single dot product.
   if (mat_stride < MaxCols()) {
-    SetFirstPartialDotProducts(df, mat, mat_ofs, mat_stride, r0, 0, num_rows,
-                               mat_stride, vec_aligned, out);
+    SetFirstPartialDotProducts<kAdd>(df, mat, mat_ofs, mat_stride, r0, 0,
+                                     num_rows, mat_stride, vec_aligned, add,
+                                     out);
     return;
   }
 
   // We have at least MaxCols, so start by setting `out` to that:
-  SetFirstPartialDotProducts(df, mat, mat_ofs, mat_stride, r0, 0, num_rows,
-                             MaxCols(), vec_aligned, out);
+  SetFirstPartialDotProducts<kAdd>(df, mat, mat_ofs, mat_stride, r0, 0,
+                                   num_rows, MaxCols(), vec_aligned, add, out);
 
   // For further multiples of MaxCols, accumulate. Remainders handled below.
   size_t c0 = MaxCols();
   for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) {
@@ -191,15 +238,16 @@ HWY_INLINE void FullDotProductsForStrip(
   }
 }
 
 }  // namespace detail
 
-// Stores dot products of rows with `vec_aligned` to a buffer, then stores them
-// to `out`.
-template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
-          typename VecT>
-HWY_INLINE void MatVec(const CompressedArray<MatT, kCapacity>& mat,
-                       const size_t mat_ofs,
-                       const VecT* HWY_RESTRICT const vec_aligned,
-                       float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
-  PROFILER_ZONE("MatVec");
+// Stores dot products of rows with `vec_aligned` + add the values from `add`
+// (if kAdd), then stores them to `out`.
+//
+template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
+          typename VecT, typename AddT>
+HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
+                          const VecT* HWY_RESTRICT const vec_aligned,
+                          const AddT* HWY_RESTRICT const add,
+                          float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
+  PROFILER_ZONE("MatVecAdd");
   const hn::ScalableTag<float> df;
   constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
@@ -209,8 +257,9 @@ HWY_INLINE void MatVec(const CompressedArray<MatT, kCapacity>& mat,
   pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
     PROFILER_ZONE("MatVec.lambda");
     const size_t r0 = strip * kRowsPerStrip;
-    detail::FullDotProductsForStrip(df, mat, mat_ofs, kInner, r0, kRowsPerStrip,
-                                    vec_aligned, out + r0);
+    detail::FullDotProductsForStrip<kAdd>(df, mat, mat_ofs, kInner, r0,
+                                          kRowsPerStrip, vec_aligned, add,
+                                          out + r0);
   });
 
   // Remaining rows
@@ -218,11 +267,19 @@ HWY_INLINE void MatVec(const CompressedArray<MatT, kCapacity>& mat,
   if (r0 < kOuter) {
     PROFILER_ZONE("MatVec remainder");
     const size_t num_rows = kOuter - r0;
-    detail::FullDotProductsForStrip(df, mat, mat_ofs, kInner, r0, num_rows,
-                                    vec_aligned, out + r0);
+    detail::FullDotProductsForStrip<kAdd>(df, mat, mat_ofs, kInner, r0,
+                                          num_rows, vec_aligned, add, out + r0);
   }
 }
 
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
+HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
+                       const VecT* HWY_RESTRICT const vec_aligned,
+                       float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
+  MatVecAdd</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+      mat, mat_ofs, vec_aligned, /*add=*/nullptr, out, pool);
+}
+
 template <class D, HWY_IF_F32_D(D)>
 static HWY_INLINE hn::Vec<D> Gelu(D d, hn::Vec<D> v) {
   const hn::Vec<D> kMul = hn::Set(d, 0.044715f);
@@ -277,69 +334,105 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void GeluMulToBF16(
   }
 }
 
+template <class D, HWY_IF_F32_D(D)>
+static HWY_INLINE hn::Vec<D> Sigmoid(D d, hn::Vec<D> v) {
+  using VF = hn::Vec<D>;
+  // Chebyshev polynomial coefficients for rational approximation
+  const VF c0 = hn::Set(d, 0.00949107017368078f);
+  const VF c1 = hn::Set(d, 0.0654858946800232f);
+  const VF c2 = hn::Set(d, 0.231547489762306f - 0.00949107017368078f);
+  const VF c3 = hn::Set(d, 0.530778527259827f);
+  const VF c4 = hn::Set(d, 0.855334937572479f);
+  const VF c5 = hn::Set(d, 0.500000894069672f);
+
+  const VF d0 = hn::Set(d, 0.130970627069473f);
+  const VF d1 = hn::Set(d, 3.99615288415589e-07f);
+  const VF d2 = hn::Set(d, 1.06155431270599f - 0.130970627069473f);
+  const VF d3 = hn::Set(d, 1.35144250634767e-06f);
+  const VF d4 = hn::Set(d, 1);
+
+  // The approximation works in range -12..12, but the input value is clamped
+  // in -11.5..11.5 since the approximation slightly overshoots after that.
+  // The function is nearly 0 for input values below -11.5 and nearly 1 for
+  // input values above 11.5.
+  const VF invtwelve = hn::Set(d, 1.0f / 12.0f);
+  const VF lo = hn::Set(d, -11.5f);
+  const VF hi = hn::Set(d, 11.5f);
+
+  VF f = hn::Clamp(v, lo, hi);
+  f = hn::Mul(f, invtwelve);
+  VF f2 = hn::Add(f, f);
+
+  VF a1 = hn::MulAdd(f2, c0, c1);
+  VF a2 = hn::MulAdd(f2, a1, c2);
+  VF a3 = hn::Sub(hn::MulAdd(f2, a2, c3), a1);
+  VF a4 = hn::Sub(hn::MulAdd(f2, a3, c4), a2);
+  VF f0 = hn::Sub(hn::MulAdd(f, a4, c5), a3);
+
+  VF b1 = hn::MulAdd(f2, d0, d1);
+  VF b2 = hn::MulAdd(f2, b1, d2);
+  VF b3 = hn::Sub(hn::MulAdd(f2, b2, d3), b1);
+  VF f1 = hn::Sub(hn::MulAdd(f, b3, d4), b2);
+
+  return Div(f0, f1);
+}
+
+// Sigmoid using the logistic function 1 / (1 + exp(-x[i]))
+static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(float* HWY_RESTRICT x,
+                                                  size_t size) {
+  namespace hn = hwy::HWY_NAMESPACE;
+  using D = hn::ScalableTag<float>;
+  hn::Transform(D(), x, size,
+                [](D d, hn::Vec<D> v) HWY_ATTR { return Sigmoid(d, v); });
+}
+
 // Two matrices, same vector
-// TODO(janwas): apply optimizations from MatVec/replace with above overload
-template <size_t kOuter, size_t kInner, typename MatT, size_t kCapacity,
-          typename VecT>
-HWY_NOINLINE void TwoMatVec(const CompressedArray<MatT, kCapacity>& mat0,
-                            const CompressedArray<MatT, kCapacity>& mat1,
+template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
+          typename VecT, typename AddT>
+HWY_NOINLINE void TwoMatVecAdd(
+    const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs,
+    const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
+    const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
+    float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
+  PROFILER_ZONE("TwoMatVecAdd");
+
+  const hn::ScalableTag<float> df;
+  constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
+  constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
+
+  // For each entire strip.
+  pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
+    PROFILER_ZONE("TwoMatVec.lambda");
+    const size_t r0 = strip * kRowsPerStrip;
+    detail::FullDotProductsForStrip<kAdd>(df, mat0, mat_ofs, kInner, r0,
+                                          kRowsPerStrip, vec_aligned, add0,
+                                          out0 + r0);
+    detail::FullDotProductsForStrip<kAdd>(df, mat1, mat_ofs, kInner, r0,
+                                          kRowsPerStrip, vec_aligned, add1,
+                                          out1 + r0);
+  });
+
+  // Remaining rows
+  const size_t r0 = kNumStrips * kRowsPerStrip;
+  if (r0 < kOuter) {
+    PROFILER_ZONE("TwoMatVec remainder");
+    const size_t num_rows = kOuter - r0;
+    detail::FullDotProductsForStrip<kAdd>(
+        df, mat0, mat_ofs, kInner, r0, num_rows, vec_aligned, add0, out0 + r0);
+    detail::FullDotProductsForStrip<kAdd>(
+        df, mat1, mat_ofs, kInner, r0, num_rows, vec_aligned, add1, out1 + r0);
+  }
+}
+
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
+HWY_NOINLINE void TwoMatVec(const ArrayT& mat0, const ArrayT& mat1,
                             const size_t mat_ofs,
                             const VecT* HWY_RESTRICT vec_aligned,
                             float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
                             hwy::ThreadPool& pool) {
-  const hn::ScalableTag<float> df;
-  const size_t NF = hn::Lanes(df);
-
-  // Process multiple rows at a time so that we write multiples of a cache line
-  // to avoid false sharing (>= 64).
-  constexpr size_t kRowsPerStrip = 128 / sizeof(float);
-  const uint32_t num_strips = kOuter / kRowsPerStrip;
-
-  // No remainder handling after ThreadPool.
-  static_assert(kOuter % kRowsPerStrip == 0, "Add remainder handling");
-
-  // Required for Stream loop, otherwise we might have partial vectors.
-  HWY_DASSERT(kRowsPerStrip >= NF);
-  pool.Run(0, num_strips,
-           [&](const uint32_t strip, size_t /*thread*/) HWY_ATTR {
-             // MSVC workaround: duplicate to ensure constexpr.
-             constexpr size_t kRowsPerStrip = 128 / sizeof(float);
-             // Software write-combining to avoid cache pollution from out.
-             // Although `out` may be used later, keeping it out of the cache
-             // now and avoiding RFOs is a consistent 5% overall win.
-             HWY_ALIGN float buf0[kRowsPerStrip];
-             HWY_ALIGN float buf1[kRowsPerStrip];
-
-             // Only handle entire strips here because the Stream is not masked.
-             const size_t begin = strip * kRowsPerStrip;
-             for (size_t idx_row = 0; idx_row < kRowsPerStrip; ++idx_row) {
-               const size_t row_ofs = mat_ofs + (begin + idx_row) * kInner;
-               buf0[idx_row] = Dot(df, mat0, row_ofs, vec_aligned, kInner);
-               buf1[idx_row] = Dot(df, mat1, row_ofs, vec_aligned, kInner);
-             }
-
-             HWY_UNROLL(4)
-             for (size_t i = 0; i != kRowsPerStrip; i += NF) {
-               hn::Stream(hn::Load(df, buf0 + i), df, out0 + begin + i);
-             }
-             HWY_UNROLL(4)
-             for (size_t i = 0; i != kRowsPerStrip; i += NF) {
-               hn::Stream(hn::Load(df, buf1 + i), df, out1 + begin + i);
-             }
-           });
-  hwy::FlushStream();
-}
-
-// Baseline Naive MatMul
-template <size_t kBatchSize, size_t kOuter, size_t kInner, typename MatT,
-          size_t kCapacity, typename VecT>
-HWY_NOINLINE void MatMul(const CompressedArray<MatT, kCapacity>& mat,
-                         const size_t mat_ofs, const VecT* HWY_RESTRICT vec,
-                         float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
-  for (size_t i = 0; i < kBatchSize; ++i) {
-    MatVec<kOuter, kInner>(
-        mat, mat_ofs, vec + i * kInner, out + i * kOuter, pool);
-  }
+  TwoMatVecAdd</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+      mat0, mat1, mat_ofs, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
+      out0, out1, pool);
 }
 
 static HWY_NOINLINE HWY_MAYBE_UNUSED float Dot(const float* HWY_RESTRICT a,
diff --git a/ops_test.cc b/ops_test.cc
index f908c39..c711946 100644
--- a/ops_test.cc
+++ b/ops_test.cc
@@ -356,6 +356,155 @@ void TestAllCreateDistribution() {
   TestCreateDistribution<5000>();
 }
 
+template <typename MatT, size_t kOuter, size_t kInner>
+CompressedArray<MatT, kOuter * kInner> GenerateMat(size_t offset) {
+  hwy::ThreadPool pool(0);
+  gcpp::CompressWorkingSet ws;
+  CompressedArray<MatT, kOuter * kInner> mat;
+  std::array<MatT, kOuter * kInner> content;
+  const float scale = 1.0f / kInner;
+  for (size_t i = 0; i < kOuter; i++) {
+    for (size_t j = 0; j < kInner; j++) {
+      content[i * kInner + j] = static_cast<MatT>((i + j + offset) * scale);
+    }
+  }
+  Compress(content, ws, mat, pool);
+  return mat;
+}
+
+template <size_t length>
+hwy::AlignedFreeUniquePtr<float[]> GenerateVec(size_t offset) {
+  hwy::AlignedFreeUniquePtr<float[]> vec = hwy::AllocateAligned<float>(length);
+  for (size_t idx = 0; idx < length; idx++) {
+    vec[idx] = static_cast<float>(idx + offset);
+  }
+  return vec;
+}
+
+template <typename MatT, size_t kOuter, size_t kInner>
+hwy::AlignedFreeUniquePtr<float[]> SimpleMatVecAdd(
+    const CompressedArray<MatT, kOuter * kInner>& mat,
+    const hwy::AlignedFreeUniquePtr<float[]>& vec,
+    const hwy::AlignedFreeUniquePtr<float[]>& add) {
+  hwy::AlignedFreeUniquePtr<float[]> uncompressed_mat =
+      hwy::AllocateAligned<float>(kOuter * kInner);
+  Decompress(mat, 0, uncompressed_mat.get(), kOuter * kInner);
+  hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(kOuter);
+  for (size_t idx_row = 0; idx_row < kOuter; idx_row++) {
+    out[idx_row] = add[idx_row];
+    for (size_t idx_col = 0; idx_col < kInner; idx_col++) {
+      out[idx_row] +=
+          uncompressed_mat[kInner * idx_row + idx_col] * vec[idx_col];
+    }
+  }
+  return out;
+}
+
+template <size_t length>
+void AssertClose(const hwy::AlignedFreeUniquePtr<float[]>& a,
+                 const hwy::AlignedFreeUniquePtr<float[]>& b) {
+  for (size_t idx = 0; idx < length; idx++) {
+    const float rel_abs_delta = std::abs(a[idx] - b[idx]) /
+                                std::max(std::abs(a[idx]), std::abs(b[idx]));
+    EXPECT_LT(rel_abs_delta, 2e-6)
+        << "a[" << idx << "]=" << a[idx] << ", b[" << idx << "]=" << b[idx];
+  }
+}
+
+void TestMatVecAdd() {
+  hwy::ThreadPool pool(0);
+  constexpr size_t kOuter = 128 * 3;
+  constexpr size_t kInner = 128 * 5;
+  CompressedArray<float, kOuter * kInner> mat =
+      GenerateMat<float, kOuter, kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat, vec, add);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out =
+      hwy::AllocateAligned<float>(kOuter);
+  MatVecAdd</*kAdd=*/true, kOuter, kInner>(mat, 0, vec.get(),
+                                           add.get(), actual_out.get(), pool);
+  AssertClose<kOuter>(actual_out, expected_out);
+}
+
+void TestMatVecAddLoop() {
+  constexpr size_t kOuter = 128 * 3;
+  constexpr size_t kInner = 128 * 5;
+  CompressedArray<float, kOuter * kInner> mat =
+      GenerateMat<float, kOuter, kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat, vec, add);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out =
+      hwy::AllocateAligned<float>(kOuter);
+  MatVecAddLoop</*kAdd=*/true, kOuter, kInner>(mat, 0, vec.get(), add.get(),
+                                               actual_out.get());
+  AssertClose<kOuter>(actual_out, expected_out);
+}
+
+void TestTwoMatVecAdd() {
+  hwy::ThreadPool pool(0);
+  constexpr size_t kOuter = 128 * 3;
+  constexpr size_t kInner = 128 * 5;
+  CompressedArray<float, kOuter * kInner> mat0 =
+      GenerateMat<float, kOuter, kInner>(0);
+  CompressedArray<float, kOuter * kInner> mat1 =
+      GenerateMat<float, kOuter, kInner>(1);
+  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add0 = GenerateVec<kOuter>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add1 = GenerateVec<kOuter>(1);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out0 =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat0, vec, add0);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out1 =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat1, vec, add1);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out0 =
+      hwy::AllocateAligned<float>(kOuter);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out1 =
+      hwy::AllocateAligned<float>(kOuter);
+  TwoMatVecAdd</*kAdd=*/true, kOuter, kInner>(mat0, mat1, 0, vec.get(),
+                                              add0.get(), add1.get(),
+                                              actual_out0.get(),
+                                              actual_out1.get(), pool);
+  AssertClose<kOuter>(actual_out0, expected_out0);
+  AssertClose<kOuter>(actual_out1, expected_out1);
+}
+
+void TestTwoOfsMatVecAddLoop() {
+  constexpr size_t kOuter = 128 * 3;
+  constexpr size_t kInner = 128 * 5;
+  CompressedArray<float, kOuter * kInner> mat =
+      GenerateMat<float, kOuter, kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add0 = GenerateVec<kOuter>(0);
+  hwy::AlignedFreeUniquePtr<float[]> add1 = GenerateVec<kOuter>(1);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out0 =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat, vec, add0);
+  hwy::AlignedFreeUniquePtr<float[]> expected_out1 =
+      SimpleMatVecAdd<float, kOuter, kInner>(mat, vec, add1);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out0 =
+      hwy::AllocateAligned<float>(kOuter);
+  hwy::AlignedFreeUniquePtr<float[]> actual_out1 =
+      hwy::AllocateAligned<float>(kOuter);
+  TwoOfsMatVecAddLoop</*kAdd=*/true, kOuter, kInner>(mat, 0, 0, vec.get(),
+                                                     add0.get(), add1.get(),
+                                                     actual_out0.get(),
+                                                     actual_out1.get());
+  AssertClose<kOuter>(actual_out0, expected_out0);
+  AssertClose<kOuter>(actual_out1, expected_out1);
+}
+
+void TestSigmoid() {
+  std::vector<float> values;
+  for (int i = -150; i <= 150; ++i) {
+    values.push_back(.1f * i);
+  }
+  std::vector<float> result = values;
+  Sigmoid(result.data(), result.size());
+
+  for (size_t i = 0; i < values.size(); i++) {
+    const float max_error = 0.00007;
+    float value = values[i];
+    float approx = result[i];
+    float expected = (1 / (1 + std::exp(-values[i])));
+    EXPECT_NEAR(approx, expected, max_error) << "Input: " << value;
+  }
+}
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace gcpp
@@ -371,9 +520,15 @@ HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConst);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstAndAdd);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSoftmax);
 HWY_EXPORT_AND_TEST_P(OpsTest, TestAllCreateDistribution);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestMatVecAdd);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestMatVecAddLoop);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestTwoMatVecAdd);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestTwoOfsMatVecAddLoop);
+HWY_EXPORT_AND_TEST_P(OpsTest, TestSigmoid);
 
 #ifdef HWY_AFTER_TEST
 HWY_AFTER_TEST();
 #endif
+
 }  // namespace gcpp
 #endif
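Reviewer note, not part of the diff: the new MatVecAdd/MatVecAddLoop paths compute a fused matrix-vector product plus a per-row bias, out[r] = add[r] + dot(row r of mat, vec), which is exactly what SimpleMatVecAdd in ops_test.cc checks against. Below is a minimal scalar sketch in plain C++ for readers who want the semantics without the Highway/CompressedArray machinery; ReferenceMatVecAdd and its row-major float layout are illustrative assumptions, not part of the gemma.cpp API.

// Scalar reference for the fused op: out[r] = add[r] + dot(mat row r, vec).
#include <cstddef>
#include <vector>

// mat is outer x inner, row-major; vec has inner entries; add has outer.
std::vector<float> ReferenceMatVecAdd(const std::vector<float>& mat,
                                      const std::vector<float>& vec,
                                      const std::vector<float>& add,
                                      size_t outer, size_t inner) {
  std::vector<float> out(outer);
  for (size_t r = 0; r < outer; ++r) {
    float sum = add[r];  // start from the bias instead of zero-initializing
    for (size_t c = 0; c < inner; ++c) {
      sum += mat[r * inner + c] * vec[c];
    }
    out[r] = sum;
  }
  return out;
}

Starting the accumulator from the bias rather than zero is the same idea the kInit/kAdd template flags enable inside SetFirstPartialDotProducts, so no separate pass over `out` is needed.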