Remove no longer used MatVec

PiperOrigin-RevId: 809059409
Jan Wassenberg 2025-09-19 09:02:44 -07:00 committed by Copybara-Service
parent b603425bf3
commit 501fdf000e
5 changed files with 0 additions and 520 deletions

BUILD.bazel

@@ -366,7 +366,6 @@ cc_library(
"ops/dot-inl.h",
"ops/sum-inl.h",
"ops/fp_arith-inl.h",
"ops/matvec-inl.h",
"ops/ops-inl.h",
],
deps = [
@@ -381,7 +380,6 @@ cc_library(
"@highway//:bit_set",
"@highway//:hwy",
"@highway//:math",
"@highway//:matvec",
"@highway//:profiler",
"@highway//:thread_pool",
"@highway//hwy/contrib/sort:vqsort",
@@ -442,27 +440,6 @@ cc_test(
],
)
cc_test(
name = "gemma_matvec_test",
size = "small",
timeout = "long",
srcs = ["ops/gemma_matvec_test.cc"],
linkstatic = True,
local_defines = ["HWY_IS_TEST"],
# for test_suite.
tags = ["ops_tests"],
deps = [
":mat",
":ops",
":threading_context",
"@googletest//:gtest_main", # buildcleaner: keep
"//compression:compress",
"@highway//:hwy",
"@highway//:hwy_test_util",
"@highway//:thread_pool",
],
)
cc_test(
name = "matmul_test",
size = "small",

CMakeLists.txt

@@ -112,7 +112,6 @@ set(SOURCES
ops/matmul-inl.h
ops/matmul.cc
ops/matmul.h
ops/matvec-inl.h
ops/ops-inl.h
ops/ops.h
ops/sum-inl.h
@@ -224,7 +223,6 @@ set(GEMMA_TEST_FILES
io/fields_test.cc
ops/bench_matmul.cc
ops/dot_test.cc
ops/gemma_matvec_test.cc
ops/matmul_test.cc
ops/ops_test.cc
paligemma/image_test.cc

View File

@@ -51,7 +51,6 @@
#include "gemma/attention.h"
#include "gemma/configs.h"
#include "gemma/flash_attention.h"
#include "ops/matvec-inl.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();

ops/gemma_matvec_test.cc

@@ -1,192 +0,0 @@
// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "compression/types.h"
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
#endif // HWY_DISABLED_TARGETS
#include <stddef.h>
#include <stdio.h>
#include <algorithm> // std::max
#include <cmath> // std::abs
#include <memory>
#include "util/mat.h"
#include "util/threading_context.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "ops/gemma_matvec_test.cc" // NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
// After highway.h
#include "compression/compress-inl.h"
#include "ops/matvec-inl.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace gcpp {
namespace HWY_NAMESPACE {
using FloatPtr = hwy::AlignedFreeUniquePtr<float[]>;
FloatPtr SimpleMatVecAdd(const MatStorageT<float>& mat, const FloatPtr& vec,
const FloatPtr& add) {
const size_t num = mat.Rows() * mat.Cols();
FloatPtr raw_mat = hwy::AllocateAligned<float>(num);
FloatPtr out = hwy::AllocateAligned<float>(mat.Rows());
HWY_ASSERT(raw_mat && out);
const hn::ScalableTag<float> df;
DecompressAndZeroPad(df, mat.Span(), 0, raw_mat.get(), num);
for (size_t idx_row = 0; idx_row < mat.Rows(); idx_row++) {
out[idx_row] = 0.0f;
for (size_t idx_col = 0; idx_col < mat.Cols(); idx_col++) {
out[idx_row] += raw_mat[mat.Cols() * idx_row + idx_col] * vec[idx_col];
}
out[idx_row] *= mat.Scale();
out[idx_row] += add[idx_row];
}
return out;
}
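// In formula form, the scalar reference above computes, for each row r:
//   out[r] = mat.Scale() * (sum over c of mat[r, c] * vec[c]) + add[r];
// the vectorized MatVecAdd under test must match this to within the ~2e-6
// relative tolerance checked by AssertClose below.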
template <typename MatT, size_t kOuter, size_t kInner>
std::unique_ptr<MatStorageT<float>> GenerateMat(size_t offset,
const Allocator& allocator,
hwy::ThreadPool& pool) {
gcpp::CompressWorkingSet ws;
const Extents2D extents(kOuter, kInner);
auto mat = std::make_unique<MatStorageT<float>>("TestMat", extents, allocator,
MatPadding::kPacked);
FloatPtr raw_mat = hwy::AllocateAligned<float>(extents.Area());
HWY_ASSERT(raw_mat);
const float scale = 1.0f / kInner;
pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) {
for (size_t j = 0; j < kInner; j++) {
raw_mat[i * kInner + j] =
static_cast<float>((i * kInner + j + offset) * scale);
}
});
Compress(raw_mat.get(), extents.Area(), ws, mat->Span(), 0, pool);
mat->SetScale(1.9f); // Arbitrary value, different from 1.
return mat;
}
template <size_t length>
FloatPtr GenerateVec(size_t offset) {
FloatPtr vec = hwy::AllocateAligned<float>(length);
HWY_ASSERT(vec);
for (size_t idx = 0; idx < length; idx++) {
vec[idx] = static_cast<float>(idx + offset);
}
return vec;
}
template <size_t length>
void AssertClose(const FloatPtr& a, const FloatPtr& b) {
for (size_t idx = 0; idx < length; idx++) {
const float rel_abs_delta = std::abs(a[idx] - b[idx]) /
std::max(std::abs(a[idx]), std::abs(b[idx]));
EXPECT_LT(rel_abs_delta, 2e-6)
<< "a[" << idx << "]=" << a[idx] << ", b[" << idx << "]=" << b[idx];
}
}
void TestMatVecAdd() {
ThreadingArgs threading_args;
ThreadingContext ctx(threading_args);
hwy::ThreadPool& pool = ctx.pools.Pool();
constexpr size_t kOuter = 128 * 3;
constexpr size_t kInner = 128 * 5;
auto mat = GenerateMat<float, kOuter, kInner>(0, ctx.allocator, pool);
FloatPtr vec = GenerateVec<kInner>(0);
FloatPtr add = GenerateVec<kOuter>(0);
FloatPtr expected_out = SimpleMatVecAdd(*mat, vec, add);
FloatPtr actual_out = hwy::AllocateAligned<float>(kOuter);
HWY_ASSERT(vec && add && expected_out && actual_out);
MatVecAdd(*mat, 0, kOuter, kInner, vec.get(), add.get(), actual_out.get(),
pool);
AssertClose<kOuter>(actual_out, expected_out);
}
void TestTwoMatVecAdd() {
ThreadingArgs threading_args;
ThreadingContext ctx(threading_args);
hwy::ThreadPool& pool = ctx.pools.Pool();
constexpr size_t kOuter = 128 * 3;
constexpr size_t kInner = 128 * 5;
auto mat0 = GenerateMat<float, kOuter, kInner>(0, ctx.allocator, pool);
auto mat1 = GenerateMat<float, kOuter, kInner>(1, ctx.allocator, pool);
FloatPtr vec = GenerateVec<kInner>(0);
FloatPtr add0 = GenerateVec<kOuter>(0);
FloatPtr add1 = GenerateVec<kOuter>(1);
FloatPtr expected_out0 = SimpleMatVecAdd(*mat0, vec, add0);
FloatPtr expected_out1 = SimpleMatVecAdd(*mat1, vec, add1);
FloatPtr actual_out0 = hwy::AllocateAligned<float>(kOuter);
FloatPtr actual_out1 = hwy::AllocateAligned<float>(kOuter);
HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
expected_out1 && actual_out1);
TwoMatVecAdd(*mat0, *mat1, 0, kOuter, kInner, vec.get(), add0.get(),
add1.get(), actual_out0.get(), actual_out1.get(), pool);
AssertClose<kOuter>(actual_out0, expected_out0);
AssertClose<kOuter>(actual_out1, expected_out1);
}
void TestTwoOfsMatVecAddLoop() {
ThreadingArgs threading_args;
ThreadingContext ctx(threading_args);
hwy::ThreadPool& pool = ctx.pools.Pool();
constexpr size_t kOuter = 128 * 3;
constexpr size_t kInner = 128 * 5;
auto mat = GenerateMat<float, kOuter, kInner>(0, ctx.allocator, pool);
FloatPtr vec = GenerateVec<kInner>(0);
FloatPtr add0 = GenerateVec<kOuter>(0);
FloatPtr add1 = GenerateVec<kOuter>(1);
FloatPtr expected_out0 = SimpleMatVecAdd(*mat, vec, add0);
FloatPtr expected_out1 = SimpleMatVecAdd(*mat, vec, add1);
FloatPtr actual_out0 = hwy::AllocateAligned<float>(kOuter);
FloatPtr actual_out1 = hwy::AllocateAligned<float>(kOuter);
HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
expected_out1 && actual_out1);
TwoOfsMatVecAddLoop(*mat, 0, 0, kOuter, kInner, vec.get(), add0.get(),
add1.get(), actual_out0.get(), actual_out1.get());
AssertClose<kOuter>(actual_out0, expected_out0);
AssertClose<kOuter>(actual_out1, expected_out1);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace gcpp
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace gcpp {
HWY_BEFORE_TEST(MatVecTest);
HWY_EXPORT_AND_TEST_P(MatVecTest, TestMatVecAdd);
HWY_EXPORT_AND_TEST_P(MatVecTest, TestTwoMatVecAdd);
HWY_EXPORT_AND_TEST_P(MatVecTest, TestTwoOfsMatVecAddLoop);
HWY_AFTER_TEST();
} // namespace gcpp
#endif

ops/matvec-inl.h

@@ -1,302 +0,0 @@
// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Include guard for non-SIMD code.
#ifndef THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_
#define THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/profiler.h"
#endif // THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_
// Include guard for (potentially) SIMD code.
#if defined(THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE) == defined(HWY_TARGET_TOGGLE)
#ifdef THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE
#undef THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE
#else
#define THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE
#endif
#include "compression/compress-inl.h"
#include "ops/dot-inl.h"
#include "util/mat.h" // MatPtrT
#include "hwy/contrib/math/math-inl.h"
#include "hwy/contrib/matvec/matvec-inl.h"
HWY_BEFORE_NAMESPACE();
namespace gcpp {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;
// For callers that pass `MatPtrT`, which is not necessarily packed; such
// callers should use Stride() to compute `w_ofs`.
template <typename WT, typename VT>
HWY_INLINE float Dot(const MatPtrT<WT>& w, size_t w_ofs, const VT* vec_aligned,
size_t num) {
const hn::ScalableTag<VT> d;
return w.Scale() * Dot(d, w.PaddedSpan(), w_ofs, vec_aligned, num);
}
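// Illustrative caller (hypothetical names r, x, y): rows may be padded, so
// the row offset is derived from Stride() rather than Cols():
//   const size_t w_ofs = r * w.Stride();         // start of row r
//   const float y = Dot(w, w_ofs, x, w.Cols());  // scaled dot with vector x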
// ArrayT is MatPtrT.
// Simple version with neither tiling nor threading, but with two
// offsets/outputs and always with addition.
template <typename ArrayT, typename VecT, typename AddT>
HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0,
const size_t mat_ofs1, const size_t outer,
const size_t inner,
const VecT* HWY_RESTRICT vec_aligned,
const AddT* HWY_RESTRICT add0,
const AddT* HWY_RESTRICT add1,
float* HWY_RESTRICT out0,
float* HWY_RESTRICT out1) {
PROFILER_ZONE("TwoOfsMatVecAddLoop");
for (size_t idx_row = 0; idx_row < outer; ++idx_row) {
const size_t row_ofs0 = mat_ofs0 + idx_row * mat.Stride();
const size_t row_ofs1 = mat_ofs1 + idx_row * mat.Stride();
out0[idx_row] = hwy::ConvertScalarTo<float>(add0[idx_row]) +
Dot(mat, row_ofs0, vec_aligned, inner);
out1[idx_row] = hwy::ConvertScalarTo<float>(add1[idx_row]) +
Dot(mat, row_ofs1, vec_aligned, inner);
}
}
HWY_INLINE constexpr size_t MaxCols() {
// Vec + mat rows should fit into 32 KiB L1.
return 2048;
}
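// Illustrative arithmetic for the 2048 bound: a 2048-element f32 vector slice
// occupies 8 KiB, and each f32 row slice of the same width is another 8 KiB,
// so the vector plus a couple of rows fit within a typical 32 KiB L1 cache.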
template <size_t kOuter>
HWY_INLINE constexpr size_t RowsPerStrip() {
// Aim for 128 work items to reduce pool overhead. Must be at least one
// vector; prefer a power of two for faster division.
constexpr size_t kLanes = hn::ScalableTag<float>().MaxLanes();
constexpr size_t kRowsPerStrip =
kOuter < 128 ? kLanes
: HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(kOuter / 128));
return kRowsPerStrip;
}
HWY_INLINE size_t RowsPerStrip(const size_t outer) {
// Aim for 128 work items to reduce pool overhead. Must be at least one
// vector; prefer a power of two for faster division.
constexpr size_t kLanes = hn::ScalableTag<float>().MaxLanes();
return outer < 128 ? kLanes
: HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(outer / 128));
}
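// Worked example (assuming 8 f32 lanes): for outer = 3072,
// 1ULL << hwy::FloorLog2(3072 / 128) = 1ULL << hwy::FloorLog2(24) = 16, so
// rows_per_strip = HWY_MAX(8, 16) = 16, giving 3072 / 16 = 192 strips.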
namespace detail {
// For each i in [0, num_rows), compute the partial (length `num_cols`) dot
// of row i with `vec_aligned` and add into `out[i]`. The upper-left
// coordinate of the tile is r0, c0.
template <class DF, typename ArrayT, typename VecT>
HWY_INLINE void AccumulatePartialDotProducts(
DF df, const ArrayT& mat, size_t mat_ofs, size_t r0, size_t c0,
size_t num_rows, size_t num_cols, const VecT* HWY_RESTRICT vec_aligned,
float* HWY_RESTRICT out) {
for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat.Stride();
out[idx_row] += Dot(mat, row_ofs + c0, vec_aligned + c0, num_cols);
}
}
// Same as AccumulatePartialDotProducts, but sets out[i] to the first partial
// dot product + init (if kInit), which avoids having to zero-initialize and
// accumulate.
template <bool kInit, class DF, typename ArrayT, typename VecT, typename InitT>
HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat,
size_t mat_ofs, size_t r0, size_t c0,
size_t num_rows, size_t num_cols,
const VecT* HWY_RESTRICT vec_aligned,
const InitT* HWY_RESTRICT init,
float* HWY_RESTRICT out) {
for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat.Stride();
if constexpr (kInit) {
out[idx_row] = hwy::ConvertScalarTo<float>(init[idx_row + r0]) +
Dot(mat, row_ofs + c0, vec_aligned + c0, num_cols);
} else {
out[idx_row] = Dot(mat, row_ofs + c0, vec_aligned + c0, num_cols);
}
}
}
// Adds together partial dot products for all tiles with the same r0 (a
// horizontal strip of the entire matrix); the result is the full dot product
// for rows r in [r0, r0 + num_rows), plus optionally the add vector, which we
// store in out[r - r0].
template <bool kAdd, class DF, typename ArrayT, typename VecT, typename AddT>
HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat,
size_t mat_ofs, size_t r0,
size_t num_rows, size_t num_cols,
const VecT* HWY_RESTRICT vec_aligned,
const AddT* HWY_RESTRICT add,
float* HWY_RESTRICT out) {
HWY_DASSERT(num_cols <= mat.Cols());
// Tall and skinny: set `out` to the single dot product.
if (num_cols < MaxCols()) {
SetFirstPartialDotProducts<kAdd>(df, mat, mat_ofs, r0, 0, num_rows,
num_cols, vec_aligned, add, out);
return;
}
// We have at least MaxCols() columns, so start by setting `out` to that tile:
SetFirstPartialDotProducts<kAdd>(df, mat, mat_ofs, r0, 0, num_rows, MaxCols(),
vec_aligned, add, out);
// For further multiples of MaxCols, accumulate. Remainders handled below.
size_t c0 = MaxCols();
for (; c0 <= num_cols - MaxCols(); c0 += MaxCols()) {
AccumulatePartialDotProducts(df, mat, mat_ofs, r0, c0, num_rows, MaxCols(),
vec_aligned, out);
}
if (c0 < num_cols) { // Final cols
AccumulatePartialDotProducts(df, mat, mat_ofs, r0, c0, num_rows,
num_cols - c0, vec_aligned, out);
}
}
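// Tiling example: for num_cols = 5120 and MaxCols() = 2048, a strip is
// processed as: set columns [0, 2048), accumulate [2048, 4096), then the
// final `c0 < num_cols` branch accumulates the remaining 1024 columns.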
} // namespace detail
// Computes the dot product of each row with `vec_aligned`, adds the values
// from `add` (if kAdd), and stores the results to `out`.
template <bool kAdd, typename ArrayT, typename VecT, typename AddT>
HWY_INLINE void MatVecT(const ArrayT& mat, const size_t mat_ofs,
const size_t outer, const size_t inner,
const VecT* HWY_RESTRICT const vec_aligned,
const AddT* HWY_RESTRICT const add,
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
PROFILER_ZONE("MatVecAdd");
const hn::ScalableTag<float> df;
const size_t rows_per_strip = RowsPerStrip(outer);
const size_t num_strips = outer / rows_per_strip;
// For each entire strip.
pool.Run(0, num_strips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
PROFILER_ZONE("MatVec.lambda");
const size_t r0 = strip * rows_per_strip;
detail::FullDotProductsForStrip<kAdd>(df, mat, mat_ofs, r0, rows_per_strip,
inner, vec_aligned, add, out + r0);
});
// Remaining rows
const size_t r0 = num_strips * rows_per_strip;
if (r0 < outer) {
PROFILER_ZONE("MatVec remainder");
const size_t num_rows = outer - r0;
detail::FullDotProductsForStrip<kAdd>(df, mat, mat_ofs, r0, num_rows, inner,
vec_aligned, add, out + r0);
}
}
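// Partition example: with rows_per_strip = 16 and outer = 1003, pool.Run
// covers num_strips = 62 strips, i.e. rows [0, 992), and the remainder block
// above handles the final 11 rows serially.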
// With addition
template <typename ArrayT, typename VecT, typename AddT>
HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
const size_t outer, const size_t inner,
const VecT* HWY_RESTRICT const vec_aligned,
const AddT* HWY_RESTRICT const add,
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
return MatVecT</*kAdd=*/true>(mat, mat_ofs, outer, inner, vec_aligned, add,
out, pool);
}
// Without addition
template <typename ArrayT, typename VecT>
HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
const size_t outer, const size_t inner,
const VecT* HWY_RESTRICT const vec_aligned,
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
MatVecT</*kAdd=*/false>(mat, mat_ofs, outer, inner, vec_aligned,
/*add=*/static_cast<VecT*>(nullptr), out, pool);
}
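// Usage sketch (hypothetical buffers x and out): computes the scaled product
// out = mat.Scale() * mat * x over all rows, without a bias term:
//   MatVec(mat, /*mat_ofs=*/0, /*outer=*/mat.Rows(), /*inner=*/mat.Cols(),
//          x, out, pool);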
// Two matrices, same vector
template <bool kAdd, typename ArrayT1, typename ArrayT2, typename VecT,
typename AddT>
HWY_NOINLINE void TwoMatVecT(const ArrayT1& mat0, const ArrayT2& mat1,
const size_t mat_ofs, size_t outer, size_t inner,
const VecT* HWY_RESTRICT vec_aligned,
const AddT* HWY_RESTRICT add0,
const AddT* HWY_RESTRICT add1,
float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
hwy::ThreadPool& pool) {
PROFILER_ZONE("TwoMatVecAdd");
const hn::ScalableTag<float> df;
const size_t rows_per_strip = RowsPerStrip(outer);
const size_t num_strips = outer / rows_per_strip;
// For each entire strip.
pool.Run(0, num_strips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
PROFILER_ZONE("TwoMatVec.lambda");
const size_t r0 = strip * rows_per_strip;
detail::FullDotProductsForStrip<kAdd>(df, mat0, mat_ofs, r0, rows_per_strip,
inner, vec_aligned, add0, out0 + r0);
detail::FullDotProductsForStrip<kAdd>(df, mat1, mat_ofs, r0, rows_per_strip,
inner, vec_aligned, add1, out1 + r0);
});
// Remaining rows
const size_t r0 = num_strips * rows_per_strip;
if (r0 < outer) {
PROFILER_ZONE("TwoMatVec remainder");
const size_t num_rows = outer - r0;
detail::FullDotProductsForStrip<kAdd>(df, mat0, mat_ofs, r0, num_rows,
inner, vec_aligned, add0, out0 + r0);
detail::FullDotProductsForStrip<kAdd>(df, mat1, mat_ofs, r0, num_rows,
inner, vec_aligned, add1, out1 + r0);
}
}
// With addition
template <typename ArrayT1, typename ArrayT2, typename VecT, typename AddT>
HWY_NOINLINE void TwoMatVecAdd(
const ArrayT1& mat0, const ArrayT2& mat1, const size_t mat_ofs,
const size_t outer, const size_t inner,
const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
return TwoMatVecT</*kAdd=*/true>(mat0, mat1, mat_ofs, outer, inner,
vec_aligned, add0, add1, out0, out1, pool);
}
// Without addition
template <typename ArrayT1, typename ArrayT2, typename VecT>
HWY_NOINLINE void TwoMatVec(const ArrayT1& mat0, const ArrayT2& mat1,
const size_t mat_ofs, const size_t outer,
const size_t inner,
const VecT* HWY_RESTRICT vec_aligned,
float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
hwy::ThreadPool& pool) {
TwoMatVecT</*kAdd=*/false, ArrayT1, ArrayT2, VecT, VecT>(
mat0, mat1, mat_ofs, outer, inner, vec_aligned, /*add0=*/nullptr,
/*add1=*/nullptr, out0, out1, pool);
}
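// Processing two matrices per strip likely amortizes loads of `vec_aligned`:
// each slice of the vector is read once while hot in cache and reused for
// both mat0 and mat1.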
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace gcpp
HWY_AFTER_NAMESPACE();
#endif // NOLINT