From 501fdf000ea77544a0ee9b4db27767fcc10ebc02 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Fri, 19 Sep 2025 09:02:44 -0700 Subject: [PATCH] Remove no longer used MatVec PiperOrigin-RevId: 809059409 --- BUILD.bazel | 23 --- CMakeLists.txt | 2 - gemma/flash_attention_test.cc | 1 - ops/gemma_matvec_test.cc | 192 --------------------- ops/matvec-inl.h | 302 ---------------------------------- 5 files changed, 520 deletions(-) delete mode 100644 ops/gemma_matvec_test.cc delete mode 100644 ops/matvec-inl.h diff --git a/BUILD.bazel b/BUILD.bazel index 02c54bd..d5bac73 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -366,7 +366,6 @@ cc_library( "ops/dot-inl.h", "ops/sum-inl.h", "ops/fp_arith-inl.h", - "ops/matvec-inl.h", "ops/ops-inl.h", ], deps = [ @@ -381,7 +380,6 @@ cc_library( "@highway//:bit_set", "@highway//:hwy", "@highway//:math", - "@highway//:matvec", "@highway//:profiler", "@highway//:thread_pool", "@highway//hwy/contrib/sort:vqsort", @@ -442,27 +440,6 @@ cc_test( ], ) -cc_test( - name = "gemma_matvec_test", - size = "small", - timeout = "long", - srcs = ["ops/gemma_matvec_test.cc"], - linkstatic = True, - local_defines = ["HWY_IS_TEST"], - # for test_suite. - tags = ["ops_tests"], - deps = [ - ":mat", - ":ops", - ":threading_context", - "@googletest//:gtest_main", # buildcleaner: keep - "//compression:compress", - "@highway//:hwy", - "@highway//:hwy_test_util", - "@highway//:thread_pool", - ], -) - cc_test( name = "matmul_test", size = "small", diff --git a/CMakeLists.txt b/CMakeLists.txt index cb2911f..46242f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,7 +112,6 @@ set(SOURCES ops/matmul-inl.h ops/matmul.cc ops/matmul.h - ops/matvec-inl.h ops/ops-inl.h ops/ops.h ops/sum-inl.h @@ -224,7 +223,6 @@ set(GEMMA_TEST_FILES io/fields_test.cc ops/bench_matmul.cc ops/dot_test.cc - ops/gemma_matvec_test.cc ops/matmul_test.cc ops/ops_test.cc paligemma/image_test.cc diff --git a/gemma/flash_attention_test.cc b/gemma/flash_attention_test.cc index efb210e..d4d6380 100644 --- a/gemma/flash_attention_test.cc +++ b/gemma/flash_attention_test.cc @@ -51,7 +51,6 @@ #include "gemma/attention.h" #include "gemma/configs.h" #include "gemma/flash_attention.h" -#include "ops/matvec-inl.h" #include "hwy/tests/test_util-inl.h" HWY_BEFORE_NAMESPACE(); diff --git a/ops/gemma_matvec_test.cc b/ops/gemma_matvec_test.cc deleted file mode 100644 index e55539d..0000000 --- a/ops/gemma_matvec_test.cc +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2023 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "compression/types.h" -#ifndef HWY_DISABLED_TARGETS -#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS -#endif // HWY_DISABLED_TARGETS - -#include -#include - -#include // std::max -#include // std::abs -#include - -#include "util/mat.h" -#include "util/threading_context.h" -#include "hwy/aligned_allocator.h" -#include "hwy/base.h" -#include "hwy/contrib/thread_pool/thread_pool.h" - -// clang-format off -#undef HWY_TARGET_INCLUDE -#define HWY_TARGET_INCLUDE "ops/gemma_matvec_test.cc" // NOLINT -// clang-format on -#include "hwy/foreach_target.h" // IWYU pragma: keep -#include "hwy/highway.h" -// After highway.h -#include "compression/compress-inl.h" -#include "ops/matvec-inl.h" -#include "hwy/tests/test_util-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace gcpp { -namespace HWY_NAMESPACE { - -using FloatPtr = hwy::AlignedFreeUniquePtr; - -FloatPtr SimpleMatVecAdd(const MatStorageT& mat, const FloatPtr& vec, - const FloatPtr& add) { - const size_t num = mat.Rows() * mat.Cols(); - FloatPtr raw_mat = hwy::AllocateAligned(num); - FloatPtr out = hwy::AllocateAligned(mat.Rows()); - HWY_ASSERT(raw_mat && out); - const hn::ScalableTag df; - DecompressAndZeroPad(df, mat.Span(), 0, raw_mat.get(), num); - for (size_t idx_row = 0; idx_row < mat.Rows(); idx_row++) { - out[idx_row] = 0.0f; - for (size_t idx_col = 0; idx_col < mat.Cols(); idx_col++) { - out[idx_row] += raw_mat[mat.Cols() * idx_row + idx_col] * vec[idx_col]; - } - out[idx_row] *= mat.Scale(); - out[idx_row] += add[idx_row]; - } - return out; -} - -template -std::unique_ptr> GenerateMat(size_t offset, - const Allocator& allocator, - hwy::ThreadPool& pool) { - gcpp::CompressWorkingSet ws; - const Extents2D extents(kOuter, kInner); - auto mat = std::make_unique>("TestMat", extents, allocator, - MatPadding::kPacked); - FloatPtr raw_mat = hwy::AllocateAligned(extents.Area()); - HWY_ASSERT(raw_mat); - const float scale = 1.0f / kInner; - pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) { - for (size_t j = 0; j < kInner; j++) { - raw_mat[i * kInner + j] = - static_cast((i * kInner + j + offset) * scale); - } - }); - - Compress(raw_mat.get(), extents.Area(), ws, mat->Span(), 0, pool); - mat->SetScale(1.9f); // Arbitrary value, different from 1. - return mat; -} - -template -FloatPtr GenerateVec(size_t offset) { - FloatPtr vec = hwy::AllocateAligned(length); - HWY_ASSERT(vec); - for (size_t idx = 0; idx < length; idx++) { - vec[idx] = static_cast(idx + offset); - } - return vec; -} - -template -void AssertClose(const FloatPtr& a, const FloatPtr& b) { - for (size_t idx = 0; idx < length; idx++) { - const float rel_abs_delta = std::abs(a[idx] - b[idx]) / - std::max(std::abs(a[idx]), std::abs(b[idx])); - EXPECT_LT(rel_abs_delta, 2e-6) - << "a[" << idx << "]=" << a[idx] << ", b[" << idx << "]=" << b[idx]; - } -} - -void TestMatVecAdd() { - ThreadingArgs threading_args; - ThreadingContext ctx(threading_args); - hwy::ThreadPool& pool = ctx.pools.Pool(); - constexpr size_t kOuter = 128 * 3; - constexpr size_t kInner = 128 * 5; - auto mat = GenerateMat(0, ctx.allocator, pool); - FloatPtr vec = GenerateVec(0); - FloatPtr add = GenerateVec(0); - FloatPtr expected_out = SimpleMatVecAdd(*mat, vec, add); - FloatPtr actual_out = hwy::AllocateAligned(kOuter); - HWY_ASSERT(vec && add && expected_out && actual_out); - MatVecAdd(*mat, 0, kOuter, kInner, vec.get(), add.get(), actual_out.get(), - pool); - AssertClose(actual_out, expected_out); -} - -void TestTwoMatVecAdd() { - ThreadingArgs threading_args; - ThreadingContext ctx(threading_args); - hwy::ThreadPool& pool = ctx.pools.Pool(); - constexpr size_t kOuter = 128 * 3; - constexpr size_t kInner = 128 * 5; - auto mat0 = GenerateMat(0, ctx.allocator, pool); - auto mat1 = GenerateMat(1, ctx.allocator, pool); - FloatPtr vec = GenerateVec(0); - FloatPtr add0 = GenerateVec(0); - FloatPtr add1 = GenerateVec(1); - FloatPtr expected_out0 = SimpleMatVecAdd(*mat0, vec, add0); - FloatPtr expected_out1 = SimpleMatVecAdd(*mat1, vec, add1); - FloatPtr actual_out0 = hwy::AllocateAligned(kOuter); - FloatPtr actual_out1 = hwy::AllocateAligned(kOuter); - HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 && - expected_out1 && actual_out1); - TwoMatVecAdd(*mat0, *mat1, 0, kOuter, kInner, vec.get(), add0.get(), - add1.get(), actual_out0.get(), actual_out1.get(), pool); - AssertClose(actual_out0, expected_out0); - AssertClose(actual_out1, expected_out1); -} - -void TestTwoOfsMatVecAddLoop() { - ThreadingArgs threading_args; - ThreadingContext ctx(threading_args); - hwy::ThreadPool& pool = ctx.pools.Pool(); - - constexpr size_t kOuter = 128 * 3; - constexpr size_t kInner = 128 * 5; - auto mat = GenerateMat(0, ctx.allocator, pool); - FloatPtr vec = GenerateVec(0); - FloatPtr add0 = GenerateVec(0); - FloatPtr add1 = GenerateVec(1); - FloatPtr expected_out0 = SimpleMatVecAdd(*mat, vec, add0); - FloatPtr expected_out1 = SimpleMatVecAdd(*mat, vec, add1); - FloatPtr actual_out0 = hwy::AllocateAligned(kOuter); - FloatPtr actual_out1 = hwy::AllocateAligned(kOuter); - HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 && - expected_out1 && actual_out1); - TwoOfsMatVecAddLoop(*mat, 0, 0, kOuter, kInner, vec.get(), add0.get(), - add1.get(), actual_out0.get(), actual_out1.get()); - AssertClose(actual_out0, expected_out0); - AssertClose(actual_out1, expected_out1); -} - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace gcpp -HWY_AFTER_NAMESPACE(); - -#if HWY_ONCE - -namespace gcpp { -HWY_BEFORE_TEST(MatVecTest); -HWY_EXPORT_AND_TEST_P(MatVecTest, TestMatVecAdd); -HWY_EXPORT_AND_TEST_P(MatVecTest, TestTwoMatVecAdd); -HWY_EXPORT_AND_TEST_P(MatVecTest, TestTwoOfsMatVecAddLoop); -HWY_AFTER_TEST(); - -} // namespace gcpp - -#endif diff --git a/ops/matvec-inl.h b/ops/matvec-inl.h deleted file mode 100644 index c8feda9..0000000 --- a/ops/matvec-inl.h +++ /dev/null @@ -1,302 +0,0 @@ -// Copyright 2024 Google LLC -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Include guard for non-SIMD code. -#ifndef THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_ -#define THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_ - -#include -#include -#include - -#include "hwy/base.h" -#include "hwy/contrib/thread_pool/thread_pool.h" -#include "hwy/profiler.h" - -#endif // THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_ - -// Include guard for (potentially) SIMD code. -#if defined(THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE) == defined(HWY_TARGET_TOGGLE) -#ifdef THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE -#undef THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE -#else -#define THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE -#endif - -#include "compression/compress-inl.h" -#include "ops/dot-inl.h" -#include "util/mat.h" // MatPtrT -#include "hwy/contrib/math/math-inl.h" -#include "hwy/contrib/matvec/matvec-inl.h" - -HWY_BEFORE_NAMESPACE(); -namespace gcpp { -namespace HWY_NAMESPACE { -namespace hn = hwy::HWY_NAMESPACE; - -// For callers that pass `MatPtrT`, which is not necessarily packed - callers -// should use Stride() to compute `w_ofs`. -template -HWY_INLINE float Dot(const MatPtrT& w, size_t w_ofs, const VT* vec_aligned, - size_t num) { - const hn::ScalableTag d; - return w.Scale() * Dot(d, w.PaddedSpan(), w_ofs, vec_aligned, num); -} - -// ArrayT is MatPtrT. - -// Simple version without tiling nor threading, but two offsets/outputs and -// always with addition. -template -HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0, - const size_t mat_ofs1, const size_t outer, - const size_t inner, - const VecT* HWY_RESTRICT vec_aligned, - const AddT* HWY_RESTRICT add0, - const AddT* HWY_RESTRICT add1, - float* HWY_RESTRICT out0, - float* HWY_RESTRICT out1) { - PROFILER_ZONE("TwoOfsMatVecAddLoop"); - - for (size_t idx_row = 0; idx_row < outer; ++idx_row) { - const size_t row_ofs0 = mat_ofs0 + idx_row * mat.Stride(); - const size_t row_ofs1 = mat_ofs1 + idx_row * mat.Stride(); - out0[idx_row] = hwy::ConvertScalarTo(add0[idx_row]) + - Dot(mat, row_ofs0, vec_aligned, inner); - out1[idx_row] = hwy::ConvertScalarTo(add1[idx_row]) + - Dot(mat, row_ofs1, vec_aligned, inner); - } -} - -HWY_INLINE constexpr size_t MaxCols() { - // Vec + mat rows should fit into 32 KiB L1. - return 2048; -} - -template -HWY_INLINE constexpr size_t RowsPerStrip() { - // Aim for 128 work items to reduce pool overhead. Must be at least one - // vector; prefer a power of two for faster division. - constexpr size_t kLanes = hn::ScalableTag().MaxLanes(); - constexpr size_t kRowsPerStrip = - kOuter < 128 ? kLanes - : HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(kOuter / 128)); - return kRowsPerStrip; -} - -HWY_INLINE size_t RowsPerStrip(const size_t outer) { - // Aim for 128 work items to reduce pool overhead. Must be at least one - // vector; prefer a power of two for faster division. - constexpr size_t kLanes = hn::ScalableTag().MaxLanes(); - return outer < 128 ? kLanes - : HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(outer / 128)); -} - -namespace detail { - -// For each i = [0, num_rows), compute partial (length `num_cols`) dot product -// of row i with `vec_aligned` and add into `out[i]`. The upper-left -// coordinate of the tile is r0, c0. -template -HWY_INLINE void AccumulatePartialDotProducts( - DF df, const ArrayT& mat, size_t mat_ofs, size_t r0, size_t c0, - size_t num_rows, size_t num_cols, const VecT* HWY_RESTRICT vec_aligned, - float* HWY_RESTRICT out) { - for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) { - const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat.Stride(); - out[idx_row] += Dot(mat, row_ofs + c0, vec_aligned + c0, num_cols); - } -} - -// Same as AccumulatePartialDotProducts, but sets out[i] to the first partial -// dot product + init (if kInit), which avoids having to zero-initialize and -// accumulate. -template -HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat, - size_t mat_ofs, size_t r0, size_t c0, - size_t num_rows, size_t num_cols, - const VecT* HWY_RESTRICT vec_aligned, - const InitT* HWY_RESTRICT init, - float* HWY_RESTRICT out) { - for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) { - const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat.Stride(); - if constexpr (kInit) { - out[idx_row] = hwy::ConvertScalarTo(init[idx_row + r0]) + - Dot(mat, row_ofs + c0, vec_aligned + c0, num_cols); - } else { - out[idx_row] = Dot(mat, row_ofs + c0, vec_aligned + c0, num_cols); - } - } -} - -// Adds together partial dot products for all tiles with the same r0 (a -// horizontal strip of the entire matrix); the result is the full dot product -// for rows r in [r0, r0 + num_rows) + optionally the add vector, which we -// store into in out[r - r0]. -template -HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat, - size_t mat_ofs, size_t r0, - size_t num_rows, size_t num_cols, - const VecT* HWY_RESTRICT vec_aligned, - const AddT* HWY_RESTRICT add, - float* HWY_RESTRICT out) { - HWY_DASSERT(num_cols <= mat.Cols()); - // Tall and skinny: set `out` to the single dot product. - if (num_cols < MaxCols()) { - SetFirstPartialDotProducts(df, mat, mat_ofs, r0, 0, num_rows, - num_cols, vec_aligned, add, out); - return; - } - - // We have at least MaxCols, so start by setting `out` to that: - SetFirstPartialDotProducts(df, mat, mat_ofs, r0, 0, num_rows, MaxCols(), - vec_aligned, add, out); - // For further multiples of MaxCols, accumulate. Remainders handled below. - size_t c0 = MaxCols(); - for (; c0 <= num_cols - MaxCols(); c0 += MaxCols()) { - AccumulatePartialDotProducts(df, mat, mat_ofs, r0, c0, num_rows, MaxCols(), - vec_aligned, out); - } - - if (c0 < num_cols) { // Final cols - AccumulatePartialDotProducts(df, mat, mat_ofs, r0, c0, num_rows, - num_cols - c0, vec_aligned, out); - } -} - -} // namespace detail - -// Stores dot products of rows with `vec_aligned` + add the values from `add` -// (if kAdd), then stores them to `out`. -template -HWY_INLINE void MatVecT(const ArrayT& mat, const size_t mat_ofs, - const size_t outer, const size_t inner, - const VecT* HWY_RESTRICT const vec_aligned, - const AddT* HWY_RESTRICT const add, - float* HWY_RESTRICT out, hwy::ThreadPool& pool) { - PROFILER_ZONE("MatVecAdd"); - - const hn::ScalableTag df; - const size_t rows_per_strip = RowsPerStrip(outer); - const size_t num_strips = outer / rows_per_strip; - - // For each entire strip. - pool.Run(0, num_strips, [&](const uint64_t strip, size_t thread) HWY_ATTR { - PROFILER_ZONE("MatVec.lambda"); - const size_t r0 = strip * rows_per_strip; - detail::FullDotProductsForStrip(df, mat, mat_ofs, r0, rows_per_strip, - inner, vec_aligned, add, out + r0); - }); - - // Remaining rows - const size_t r0 = num_strips * rows_per_strip; - if (r0 < outer) { - PROFILER_ZONE("MatVec remainder"); - const size_t num_rows = outer - r0; - detail::FullDotProductsForStrip(df, mat, mat_ofs, r0, num_rows, inner, - vec_aligned, add, out + r0); - } -} - -// With addition -template -HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs, - const size_t outer, const size_t inner, - const VecT* HWY_RESTRICT const vec_aligned, - const AddT* HWY_RESTRICT const add, - float* HWY_RESTRICT out, hwy::ThreadPool& pool) { - return MatVecT(mat, mat_ofs, outer, inner, vec_aligned, add, - out, pool); -} - -// Without addition -template -HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs, - const size_t outer, const size_t inner, - const VecT* HWY_RESTRICT const vec_aligned, - float* HWY_RESTRICT out, hwy::ThreadPool& pool) { - MatVecT(mat, mat_ofs, outer, inner, vec_aligned, - /*add=*/static_cast(nullptr), out, pool); -} - -// Two matrices, same vector -template -HWY_NOINLINE void TwoMatVecT(const ArrayT1& mat0, const ArrayT2& mat1, - const size_t mat_ofs, size_t outer, size_t inner, - const VecT* HWY_RESTRICT vec_aligned, - const AddT* HWY_RESTRICT add0, - const AddT* HWY_RESTRICT add1, - float* HWY_RESTRICT out0, float* HWY_RESTRICT out1, - hwy::ThreadPool& pool) { - PROFILER_ZONE("TwoMatVecAdd"); - - const hn::ScalableTag df; - const size_t rows_per_strip = RowsPerStrip(outer); - const size_t num_strips = outer / rows_per_strip; - - // For each entire strip. - pool.Run(0, num_strips, [&](const uint64_t strip, size_t thread) HWY_ATTR { - PROFILER_ZONE("TwoMatVec.lambda"); - const size_t r0 = strip * rows_per_strip; - detail::FullDotProductsForStrip(df, mat0, mat_ofs, r0, rows_per_strip, - inner, vec_aligned, add0, out0 + r0); - detail::FullDotProductsForStrip(df, mat1, mat_ofs, r0, rows_per_strip, - inner, vec_aligned, add1, out1 + r0); - }); - - // Remaining rows - const size_t r0 = num_strips * rows_per_strip; - if (r0 < outer) { - PROFILER_ZONE("TwoMatVec remainder"); - const size_t num_rows = outer - r0; - detail::FullDotProductsForStrip(df, mat0, mat_ofs, r0, num_rows, - inner, vec_aligned, add0, out0 + r0); - detail::FullDotProductsForStrip(df, mat1, mat_ofs, r0, num_rows, - inner, vec_aligned, add1, out1 + r0); - } -} - -// With addition -template -HWY_NOINLINE void TwoMatVecAdd( - const ArrayT1& mat0, const ArrayT2& mat1, const size_t mat_ofs, - const size_t outer, const size_t inner, - const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0, - const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0, - float* HWY_RESTRICT out1, hwy::ThreadPool& pool) { - return TwoMatVecT(mat0, mat1, mat_ofs, outer, inner, - vec_aligned, add0, add1, out0, out1, pool); -} - -// Without addition -template -HWY_NOINLINE void TwoMatVec(const ArrayT1& mat0, const ArrayT2& mat1, - const size_t mat_ofs, const size_t outer, - const size_t inner, - const VecT* HWY_RESTRICT vec_aligned, - float* HWY_RESTRICT out0, float* HWY_RESTRICT out1, - hwy::ThreadPool& pool) { - TwoMatVecT( - mat0, mat1, mat_ofs, outer, inner, vec_aligned, /*add0=*/nullptr, - /*add1=*/nullptr, out0, out1, pool); -} - -// NOLINTNEXTLINE(google-readability-namespace-comments) -} // namespace HWY_NAMESPACE -} // namespace gcpp -HWY_AFTER_NAMESPACE(); - -#endif // NOLINT