Remove no longer used MatVec

PiperOrigin-RevId: 809059409
Jan Wassenberg 2025-09-19 09:02:44 -07:00 committed by Copybara-Service
parent b603425bf3
commit 501fdf000e
5 changed files with 0 additions and 520 deletions

BUILD.bazel

@@ -366,7 +366,6 @@ cc_library(
"ops/dot-inl.h",
"ops/sum-inl.h",
"ops/fp_arith-inl.h",
"ops/matvec-inl.h",
"ops/ops-inl.h",
],
deps = [
@@ -381,7 +380,6 @@ cc_library(
"@highway//:bit_set",
"@highway//:hwy",
"@highway//:math",
"@highway//:matvec",
"@highway//:profiler",
"@highway//:thread_pool",
"@highway//hwy/contrib/sort:vqsort",
@@ -442,27 +440,6 @@ cc_test(
],
)
cc_test(
name = "gemma_matvec_test",
size = "small",
timeout = "long",
srcs = ["ops/gemma_matvec_test.cc"],
linkstatic = True,
local_defines = ["HWY_IS_TEST"],
# for test_suite.
tags = ["ops_tests"],
deps = [
":mat",
":ops",
":threading_context",
"@googletest//:gtest_main", # buildcleaner: keep
"//compression:compress",
"@highway//:hwy",
"@highway//:hwy_test_util",
"@highway//:thread_pool",
],
)
cc_test(
name = "matmul_test",
size = "small",

CMakeLists.txt

@@ -112,7 +112,6 @@ set(SOURCES
ops/matmul-inl.h
ops/matmul.cc
ops/matmul.h
ops/matvec-inl.h
ops/ops-inl.h
ops/ops.h
ops/sum-inl.h
@@ -224,7 +223,6 @@ set(GEMMA_TEST_FILES
io/fields_test.cc
ops/bench_matmul.cc
ops/dot_test.cc
ops/gemma_matvec_test.cc
ops/matmul_test.cc
ops/ops_test.cc
paligemma/image_test.cc

View File

@@ -51,7 +51,6 @@
#include "gemma/attention.h"
#include "gemma/configs.h"
#include "gemma/flash_attention.h"
#include "ops/matvec-inl.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();

ops/gemma_matvec_test.cc

@@ -1,192 +0,0 @@
// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "compression/types.h"
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
#endif // HWY_DISABLED_TARGETS
#include <stddef.h>
#include <stdio.h>
#include <algorithm> // std::max
#include <cmath> // std::abs
#include <memory>
#include "util/mat.h"
#include "util/threading_context.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "ops/gemma_matvec_test.cc" // NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
// After highway.h
#include "compression/compress-inl.h"
#include "ops/matvec-inl.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace gcpp {
namespace HWY_NAMESPACE {
using FloatPtr = hwy::AlignedFreeUniquePtr<float[]>;
FloatPtr SimpleMatVecAdd(const MatStorageT<float>& mat, const FloatPtr& vec,
const FloatPtr& add) {
const size_t num = mat.Rows() * mat.Cols();
FloatPtr raw_mat = hwy::AllocateAligned<float>(num);
FloatPtr out = hwy::AllocateAligned<float>(mat.Rows());
HWY_ASSERT(raw_mat && out);
const hn::ScalableTag<float> df;
DecompressAndZeroPad(df, mat.Span(), 0, raw_mat.get(), num);
for (size_t idx_row = 0; idx_row < mat.Rows(); idx_row++) {
out[idx_row] = 0.0f;
for (size_t idx_col = 0; idx_col < mat.Cols(); idx_col++) {
out[idx_row] += raw_mat[mat.Cols() * idx_row + idx_col] * vec[idx_col];
}
out[idx_row] *= mat.Scale();
out[idx_row] += add[idx_row];
}
return out;
}
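// In formula form, the scalar reference above computes, for each row r:
//   out[r] = mat.Scale() * (sum over c of mat[r, c] * vec[c]) + add[r];
// the vectorized MatVecAdd under test must match this to within the ~2e-6
// relative tolerance checked by AssertClose below.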
template <typename MatT, size_t kOuter, size_t kInner>
std::unique_ptr<MatStorageT<float>> GenerateMat(size_t offset,
const Allocator& allocator,
hwy::ThreadPool& pool) {
gcpp::CompressWorkingSet ws;
const Extents2D extents(kOuter, kInner);
auto mat = std::make_unique<MatStorageT<float>>("TestMat", extents, allocator,
MatPadding::kPacked);
FloatPtr raw_mat = hwy::AllocateAligned<float>(extents.Area());
HWY_ASSERT(raw_mat);
const float scale = 1.0f / kInner;
pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) {
for (size_t j = 0; j < kInner; j++) {
raw_mat[i * kInner + j] =
static_cast<float>((i * kInner + j + offset) * scale);
}
});
Compress(raw_mat.get(), extents.Area(), ws, mat->Span(), 0, pool);
mat->SetScale(1.9f); // Arbitrary value, different from 1.
return mat;
}
template <size_t length>
FloatPtr GenerateVec(size_t offset) {
FloatPtr vec = hwy::AllocateAligned<float>(length);
HWY_ASSERT(vec);
for (size_t idx = 0; idx < length; idx++) {
vec[idx] = static_cast<float>(idx + offset);
}
return vec;
}
template <size_t length>
void AssertClose(const FloatPtr& a, const FloatPtr& b) {
for (size_t idx = 0; idx < length; idx++) {
const float rel_abs_delta = std::abs(a[idx] - b[idx]) /
std::max(std::abs(a[idx]), std::abs(b[idx]));
EXPECT_LT(rel_abs_delta, 2e-6)
<< "a[" << idx << "]=" << a[idx] << ", b[" << idx << "]=" << b[idx];
}
}
void TestMatVecAdd() {
ThreadingArgs threading_args;
ThreadingContext ctx(threading_args);
hwy::ThreadPool& pool = ctx.pools.Pool();
constexpr size_t kOuter = 128 * 3;
constexpr size_t kInner = 128 * 5;
auto mat = GenerateMat<float, kOuter, kInner>(0, ctx.allocator, pool);
FloatPtr vec = GenerateVec<kInner>(0);
FloatPtr add = GenerateVec<kOuter>(0);
FloatPtr expected_out = SimpleMatVecAdd(*mat, vec, add);
FloatPtr actual_out = hwy::AllocateAligned<float>(kOuter);
HWY_ASSERT(vec && add && expected_out && actual_out);
MatVecAdd(*mat, 0, kOuter, kInner, vec.get(), add.get(), actual_out.get(),
pool);
AssertClose<kOuter>(actual_out, expected_out);
}
void TestTwoMatVecAdd() {
ThreadingArgs threading_args;
ThreadingContext ctx(threading_args);
hwy::ThreadPool& pool = ctx.pools.Pool();
constexpr size_t kOuter = 128 * 3;
constexpr size_t kInner = 128 * 5;
auto mat0 = GenerateMat<float, kOuter, kInner>(0, ctx.allocator, pool);
auto mat1 = GenerateMat<float, kOuter, kInner>(1, ctx.allocator, pool);
FloatPtr vec = GenerateVec<kInner>(0);
FloatPtr add0 = GenerateVec<kOuter>(0);
FloatPtr add1 = GenerateVec<kOuter>(1);
FloatPtr expected_out0 = SimpleMatVecAdd(*mat0, vec, add0);
FloatPtr expected_out1 = SimpleMatVecAdd(*mat1, vec, add1);
FloatPtr actual_out0 = hwy::AllocateAligned<float>(kOuter);
FloatPtr actual_out1 = hwy::AllocateAligned<float>(kOuter);
HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
expected_out1 && actual_out1);
TwoMatVecAdd(*mat0, *mat1, 0, kOuter, kInner, vec.get(), add0.get(),
add1.get(), actual_out0.get(), actual_out1.get(), pool);
AssertClose<kOuter>(actual_out0, expected_out0);
AssertClose<kOuter>(actual_out1, expected_out1);
}
void TestTwoOfsMatVecAddLoop() {
ThreadingArgs threading_args;
ThreadingContext ctx(threading_args);
hwy::ThreadPool& pool = ctx.pools.Pool();
constexpr size_t kOuter = 128 * 3;
constexpr size_t kInner = 128 * 5;
auto mat = GenerateMat<float, kOuter, kInner>(0, ctx.allocator, pool);
FloatPtr vec = GenerateVec<kInner>(0);
FloatPtr add0 = GenerateVec<kOuter>(0);
FloatPtr add1 = GenerateVec<kOuter>(1);
FloatPtr expected_out0 = SimpleMatVecAdd(*mat, vec, add0);
FloatPtr expected_out1 = SimpleMatVecAdd(*mat, vec, add1);
FloatPtr actual_out0 = hwy::AllocateAligned<float>(kOuter);
FloatPtr actual_out1 = hwy::AllocateAligned<float>(kOuter);
HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
expected_out1 && actual_out1);
TwoOfsMatVecAddLoop(*mat, 0, 0, kOuter, kInner, vec.get(), add0.get(),
add1.get(), actual_out0.get(), actual_out1.get());
AssertClose<kOuter>(actual_out0, expected_out0);
AssertClose<kOuter>(actual_out1, expected_out1);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace gcpp
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace gcpp {
HWY_BEFORE_TEST(MatVecTest);
HWY_EXPORT_AND_TEST_P(MatVecTest, TestMatVecAdd);
HWY_EXPORT_AND_TEST_P(MatVecTest, TestTwoMatVecAdd);
HWY_EXPORT_AND_TEST_P(MatVecTest, TestTwoOfsMatVecAddLoop);
HWY_AFTER_TEST();
} // namespace gcpp
#endif

ops/matvec-inl.h

@@ -1,302 +0,0 @@
// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Include guard for non-SIMD code.
#ifndef THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_
#define THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/profiler.h"
#endif // THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_
// Include guard for (potentially) SIMD code.
#if defined(THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE) == defined(HWY_TARGET_TOGGLE)
#ifdef THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE
#undef THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE
#else
#define THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE
#endif
#include "compression/compress-inl.h"
#include "ops/dot-inl.h"
#include "util/mat.h" // MatPtrT
#include "hwy/contrib/math/math-inl.h"
#include "hwy/contrib/matvec/matvec-inl.h"
HWY_BEFORE_NAMESPACE();
namespace gcpp {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;
// For callers that pass `MatPtrT`, which is not necessarily packed; such
// callers should use Stride() to compute `w_ofs`.
template <typename WT, typename VT>
HWY_INLINE float Dot(const MatPtrT<WT>& w, size_t w_ofs, const VT* vec_aligned,
size_t num) {
const hn::ScalableTag<VT> d;
return w.Scale() * Dot(d, w.PaddedSpan(), w_ofs, vec_aligned, num);
}
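// Illustrative caller (hypothetical names r, x, y): rows may be padded, so
// the row offset is derived from Stride() rather than Cols():
//   const size_t w_ofs = r * w.Stride();         // start of row r
//   const float y = Dot(w, w_ofs, x, w.Cols());  // scaled dot with vector x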
// ArrayT is MatPtrT.
// Simple version with neither tiling nor threading, but with two
// offsets/outputs and always with addition.
template <typename ArrayT, typename VecT, typename AddT>
HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0,
const size_t mat_ofs1, const size_t outer,
const size_t inner,
const VecT* HWY_RESTRICT vec_aligned,
const AddT* HWY_RESTRICT add0,
const AddT* HWY_RESTRICT add1,
float* HWY_RESTRICT out0,
float* HWY_RESTRICT out1) {
PROFILER_ZONE("TwoOfsMatVecAddLoop");
for (size_t idx_row = 0; idx_row < outer; ++idx_row) {
const size_t row_ofs0 = mat_ofs0 + idx_row * mat.Stride();
const size_t row_ofs1 = mat_ofs1 + idx_row * mat.Stride();
out0[idx_row] = hwy::ConvertScalarTo<float>(add0[idx_row]) +
Dot(mat, row_ofs0, vec_aligned, inner);
out1[idx_row] = hwy::ConvertScalarTo<float>(add1[idx_row]) +
Dot(mat, row_ofs1, vec_aligned, inner);
}
}
HWY_INLINE constexpr size_t MaxCols() {
// Vec + mat rows should fit into 32 KiB L1.
return 2048;
}
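// Illustrative arithmetic for the 2048 bound: a 2048-element f32 vector slice
// occupies 8 KiB, and each f32 row slice of the same width is another 8 KiB,
// so the vector plus a couple of rows fit within a typical 32 KiB L1 cache.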
template <size_t kOuter>
HWY_INLINE constexpr size_t RowsPerStrip() {
// Aim for 128 work items to reduce pool overhead. Must be at least one
// vector; prefer a power of two for faster division.
constexpr size_t kLanes = hn::ScalableTag<float>().MaxLanes();
constexpr size_t kRowsPerStrip =
kOuter < 128 ? kLanes
: HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(kOuter / 128));
return kRowsPerStrip;
}
HWY_INLINE size_t RowsPerStrip(const size_t outer) {
// Aim for 128 work items to reduce pool overhead. Must be at least one
// vector; prefer a power of two for faster division.
constexpr size_t kLanes = hn::ScalableTag<float>().MaxLanes();
return outer < 128 ? kLanes
: HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(outer / 128));
}
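// Worked example (assuming 8 f32 lanes): for outer = 3072,
// 1ULL << hwy::FloorLog2(3072 / 128) = 1ULL << hwy::FloorLog2(24) = 16, so
// rows_per_strip = HWY_MAX(8, 16) = 16, giving 3072 / 16 = 192 strips.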
namespace detail {
// For each i in [0, num_rows), compute the partial (length `num_cols`) dot
// of row i with `vec_aligned` and add into `out[i]`. The upper-left
// coordinate of the tile is r0, c0.
template <class DF, typename ArrayT, typename VecT>
HWY_INLINE void AccumulatePartialDotProducts(
DF df, const ArrayT& mat, size_t mat_ofs, size_t r0, size_t c0,
size_t num_rows, size_t num_cols, const VecT* HWY_RESTRICT vec_aligned,
float* HWY_RESTRICT out) {
for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat.Stride();
out[idx_row] += Dot(mat, row_ofs + c0, vec_aligned + c0, num_cols);
}
}
// Same as AccumulatePartialDotProducts, but sets out[i] to the first partial
// dot product + init (if kInit), which avoids having to zero-initialize and
// accumulate.
template <bool kInit, class DF, typename ArrayT, typename VecT, typename InitT>
HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat,
size_t mat_ofs, size_t r0, size_t c0,
size_t num_rows, size_t num_cols,
const VecT* HWY_RESTRICT vec_aligned,
const InitT* HWY_RESTRICT init,
float* HWY_RESTRICT out) {
for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat.Stride();
if constexpr (kInit) {
out[idx_row] = hwy::ConvertScalarTo<float>(init[idx_row + r0]) +
Dot(mat, row_ofs + c0, vec_aligned + c0, num_cols);
} else {
out[idx_row] = Dot(mat, row_ofs + c0, vec_aligned + c0, num_cols);
}
}
}
// Adds together partial dot products for all tiles with the same r0 (a
// horizontal strip of the entire matrix); the result is the full dot product
// for rows r in [r0, r0 + num_rows), plus optionally the add vector, which we
// store in out[r - r0].
template <bool kAdd, class DF, typename ArrayT, typename VecT, typename AddT>
HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat,
size_t mat_ofs, size_t r0,
size_t num_rows, size_t num_cols,
const VecT* HWY_RESTRICT vec_aligned,
const AddT* HWY_RESTRICT add,
float* HWY_RESTRICT out) {
HWY_DASSERT(num_cols <= mat.Cols());
// Tall and skinny: set `out` to the single dot product.
if (num_cols < MaxCols()) {
SetFirstPartialDotProducts<kAdd>(df, mat, mat_ofs, r0, 0, num_rows,
num_cols, vec_aligned, add, out);
return;
}
// We have at least MaxCols() columns, so start by setting `out` to that tile:
SetFirstPartialDotProducts<kAdd>(df, mat, mat_ofs, r0, 0, num_rows, MaxCols(),
vec_aligned, add, out);
// For further multiples of MaxCols, accumulate. Remainders handled below.
size_t c0 = MaxCols();
for (; c0 <= num_cols - MaxCols(); c0 += MaxCols()) {
AccumulatePartialDotProducts(df, mat, mat_ofs, r0, c0, num_rows, MaxCols(),
vec_aligned, out);
}
if (c0 < num_cols) { // Final cols
AccumulatePartialDotProducts(df, mat, mat_ofs, r0, c0, num_rows,
num_cols - c0, vec_aligned, out);
}
}
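// Tiling example: for num_cols = 5120 and MaxCols() = 2048, a strip is
// processed as: set columns [0, 2048), accumulate [2048, 4096), then the
// final `c0 < num_cols` branch accumulates the remaining 1024 columns.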
} // namespace detail
// Computes the dot product of each row with `vec_aligned`, adds the values
// from `add` (if kAdd), and stores the results to `out`.
template <bool kAdd, typename ArrayT, typename VecT, typename AddT>
HWY_INLINE void MatVecT(const ArrayT& mat, const size_t mat_ofs,
const size_t outer, const size_t inner,
const VecT* HWY_RESTRICT const vec_aligned,
const AddT* HWY_RESTRICT const add,
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
PROFILER_ZONE("MatVecAdd");
const hn::ScalableTag<float> df;
const size_t rows_per_strip = RowsPerStrip(outer);
const size_t num_strips = outer / rows_per_strip;
// For each entire strip.
pool.Run(0, num_strips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
PROFILER_ZONE("MatVec.lambda");
const size_t r0 = strip * rows_per_strip;
detail::FullDotProductsForStrip<kAdd>(df, mat, mat_ofs, r0, rows_per_strip,
inner, vec_aligned, add, out + r0);
});
// Remaining rows
const size_t r0 = num_strips * rows_per_strip;
if (r0 < outer) {
PROFILER_ZONE("MatVec remainder");
const size_t num_rows = outer - r0;
detail::FullDotProductsForStrip<kAdd>(df, mat, mat_ofs, r0, num_rows, inner,
vec_aligned, add, out + r0);
}
}
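// Partition example: with rows_per_strip = 16 and outer = 1003, pool.Run
// covers num_strips = 62 strips, i.e. rows [0, 992), and the remainder block
// above handles the final 11 rows serially.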
// With addition
template <typename ArrayT, typename VecT, typename AddT>
HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
const size_t outer, const size_t inner,
const VecT* HWY_RESTRICT const vec_aligned,
const AddT* HWY_RESTRICT const add,
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
return MatVecT</*kAdd=*/true>(mat, mat_ofs, outer, inner, vec_aligned, add,
out, pool);
}
// Without addition
template <typename ArrayT, typename VecT>
HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
const size_t outer, const size_t inner,
const VecT* HWY_RESTRICT const vec_aligned,
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
MatVecT</*kAdd=*/false>(mat, mat_ofs, outer, inner, vec_aligned,
/*add=*/static_cast<VecT*>(nullptr), out, pool);
}
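// Usage sketch (hypothetical buffers x and out): computes the scaled product
// out = mat.Scale() * mat * x over all rows, without a bias term:
//   MatVec(mat, /*mat_ofs=*/0, /*outer=*/mat.Rows(), /*inner=*/mat.Cols(),
//          x, out, pool);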
// Two matrices, same vector
template <bool kAdd, typename ArrayT1, typename ArrayT2, typename VecT,
typename AddT>
HWY_NOINLINE void TwoMatVecT(const ArrayT1& mat0, const ArrayT2& mat1,
const size_t mat_ofs, size_t outer, size_t inner,
const VecT* HWY_RESTRICT vec_aligned,
const AddT* HWY_RESTRICT add0,
const AddT* HWY_RESTRICT add1,
float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
hwy::ThreadPool& pool) {
PROFILER_ZONE("TwoMatVecAdd");
const hn::ScalableTag<float> df;
const size_t rows_per_strip = RowsPerStrip(outer);
const size_t num_strips = outer / rows_per_strip;
// For each entire strip.
pool.Run(0, num_strips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
PROFILER_ZONE("TwoMatVec.lambda");
const size_t r0 = strip * rows_per_strip;
detail::FullDotProductsForStrip<kAdd>(df, mat0, mat_ofs, r0, rows_per_strip,
inner, vec_aligned, add0, out0 + r0);
detail::FullDotProductsForStrip<kAdd>(df, mat1, mat_ofs, r0, rows_per_strip,
inner, vec_aligned, add1, out1 + r0);
});
// Remaining rows
const size_t r0 = num_strips * rows_per_strip;
if (r0 < outer) {
PROFILER_ZONE("TwoMatVec remainder");
const size_t num_rows = outer - r0;
detail::FullDotProductsForStrip<kAdd>(df, mat0, mat_ofs, r0, num_rows,
inner, vec_aligned, add0, out0 + r0);
detail::FullDotProductsForStrip<kAdd>(df, mat1, mat_ofs, r0, num_rows,
inner, vec_aligned, add1, out1 + r0);
}
}
// With addition
template <typename ArrayT1, typename ArrayT2, typename VecT, typename AddT>
HWY_NOINLINE void TwoMatVecAdd(
const ArrayT1& mat0, const ArrayT2& mat1, const size_t mat_ofs,
const size_t outer, const size_t inner,
const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
return TwoMatVecT</*kAdd=*/true>(mat0, mat1, mat_ofs, outer, inner,
vec_aligned, add0, add1, out0, out1, pool);
}
// Without addition
template <typename ArrayT1, typename ArrayT2, typename VecT>
HWY_NOINLINE void TwoMatVec(const ArrayT1& mat0, const ArrayT2& mat1,
const size_t mat_ofs, const size_t outer,
const size_t inner,
const VecT* HWY_RESTRICT vec_aligned,
float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
hwy::ThreadPool& pool) {
TwoMatVecT</*kAdd=*/false, ArrayT1, ArrayT2, VecT, VecT>(
mat0, mat1, mat_ofs, outer, inner, vec_aligned, /*add0=*/nullptr,
/*add1=*/nullptr, out0, out1, pool);
}
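// Processing two matrices per strip likely amortizes loads of `vec_aligned`:
// each slice of the vector is read once while hot in cache and reused for
// both mat0 and mat1.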
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace gcpp
HWY_AFTER_NAMESPACE();
#endif // NOLINT