mirror of https://github.com/google/gemma.cpp.git

Split matmul into matvec; add large matrix benchmark

Rename var names to row/col for more clarity. Better estimate error tolerance
via max abs col sum.

PiperOrigin-RevId: 657601791

This commit is contained in:
parent d37c088e44
commit a24eda8d02

BUILD.bazel (20 changes)
@@ -25,6 +25,7 @@ cc_library(
     textual_hdrs = [
         "ops/ops-inl.h",
         "ops/matmul-inl.h",
+        "ops/matvec-inl.h",
     ],
     deps = [
         "//compression:compress",

@@ -56,6 +57,25 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "matvec_test",
+    size = "small",
+    timeout = "long",
+    srcs = ["ops/matvec_test.cc"],
+    local_defines = ["HWY_IS_TEST"],
+    # for test_suite.
+    tags = ["hwy_ops_test"],
+    deps = [
+        ":ops",
+        "@googletest//:gtest_main",  # buildcleaner: keep
+        "//compression:compress",
+        "@hwy//:hwy",
+        "@hwy//:hwy_test_util",
+        "@hwy//:nanobenchmark",
+        "@hwy//:thread_pool",
+    ],
+)
+
 cc_test(
     name = "matmul_test",
     size = "small",
@@ -98,6 +98,7 @@ set(SOURCES
   gemma/weights.cc
   gemma/weights.h
   ops/matmul-inl.h
+  ops/matvec-inl.h
   ops/ops-inl.h
   util/app.h
   util/args.h

@@ -148,7 +149,9 @@ set(GEMMA_TEST_FILES
   backprop/backward_test.cc
   backprop/backward_scalar_test.cc
   backprop/optimize_test.cc
-  gemma/ops_test.cc
+  ops/ops_test.cc
+  ops/matmul_test.cc
+  ops/matvec_test.cc
   evals/gemma_test.cc
 )
@@ -39,7 +39,7 @@
 #define THIRD_PARTY_GEMMA_CPP_FORWARD_TOGGLE
 #endif
 
-#include "ops/matmul-inl.h"
+#include "ops/matvec-inl.h"
 #include "ops/ops-inl.h"
 #include "hwy/highway.h"
@@ -40,11 +40,11 @@
 #include "gemma/weights.h"
 // Placeholder for internal test4, do not remove
 #include "ops/matmul-inl.h"
+#include "ops/matvec-inl.h"
 #include "ops/ops-inl.h"
 #include "hwy/aligned_allocator.h"
 #include "hwy/base.h"
 #include "hwy/bit_set.h"
-#include "hwy/contrib/matvec/matvec-inl.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/contrib/thread_pool/topology.h"
 #include "hwy/highway.h"

ops/matmul-inl.h (307 changes)
@@ -21,8 +21,6 @@
 #include <stdint.h>
 #include <stdio.h>
 
-#include "compression/compress.h"
-#include "compression/sfp.h"
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/profiler.h"

@@ -38,9 +36,7 @@
 #endif
 
 #include "compression/compress-inl.h"
-#include "hwy/contrib/dot/dot-inl.h"
 #include "hwy/contrib/math/math-inl.h"
-#include "hwy/contrib/matvec/matvec-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace gcpp {
@@ -443,309 +439,6 @@ HWY_NOINLINE void MatMul_4x4(const size_t batch_size, const Mat<MatTA>& A,
   });
 }
 
-//------------------------------------------------------------------------------
-HWY_INLINE void ToEvenOddF32(const hwy::bfloat16_t* HWY_RESTRICT vec_aligned,
-                             const size_t size, float* HWY_RESTRICT out) {
-  const hn::ScalableTag<float> df;
-  const hn::Repartition<hwy::bfloat16_t, decltype(df)> dbf16;
-
-  HWY_DASSERT(size % hn::Lanes(dbf16) == 0);
-  HWY_DASSERT(hn::IsAligned(df, vec_aligned));
-
-  for (size_t i = 0; i < size; i += hn::Lanes(dbf16)) {
-    const auto interleaved = hn::LoadU(dbf16, vec_aligned + i);
-    hn::Store(hn::PromoteEvenTo(df, interleaved), df, out + i);
-    hn::Store(hn::PromoteOddTo(df, interleaved), df, out + i + hn::Lanes(df));
-  }
-}
-
-HWY_INLINE void ToEvenOddF32(const float* HWY_RESTRICT vec_aligned,
-                             const size_t size, float* HWY_RESTRICT out) {
-  const hn::ScalableTag<float> df;
-  using VF = hn::Vec<decltype(df)>;
-
-  HWY_DASSERT(size % (hn::Lanes(df) * 2) == 0);
-  HWY_DASSERT(hn::IsAligned(df, vec_aligned));
-
-  VF vec0, vec1;
-  for (size_t i = 0; i < size; i += hn::Lanes(df) * 2) {
-    hn::LoadInterleaved2(df, vec_aligned + i, vec0, vec1);
-    hn::Store(vec0, df, out + i);
-    hn::Store(vec1, df, out + i + hn::Lanes(df));
-  }
-}
-
-// Simple version without tiling nor threading, but two offsets/outputs and
-// always with addition.
-template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT,
-          typename AddT>
-HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0,
-                                    const size_t mat_ofs1,
-                                    const VecT* HWY_RESTRICT vec_aligned,
-                                    const AddT* HWY_RESTRICT add0,
-                                    const AddT* HWY_RESTRICT add1,
-                                    float* HWY_RESTRICT out0,
-                                    float* HWY_RESTRICT out1) {
-  PROFILER_ZONE("TwoOfsMatVecAddLoop");
-  constexpr bool kVecEO = false;
-  const hn::ScalableTag<float> df;
-
-  for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
-    const size_t row_ofs0 = mat_ofs0 + (idx_row)*kInner;
-    const size_t row_ofs1 = mat_ofs1 + (idx_row)*kInner;
-    out0[idx_row] = hwy::ConvertScalarTo<float>(add0[idx_row]) +
-                    Dot<kVecEO>(df, mat, row_ofs0, vec_aligned, kInner);
-    out1[idx_row] = hwy::ConvertScalarTo<float>(add1[idx_row]) +
-                    Dot<kVecEO>(df, mat, row_ofs1, vec_aligned, kInner);
-  }
-}
-
-HWY_INLINE constexpr size_t MaxCols() {
-  // Vec + mat rows should fit into 32 KiB L1.
-  return 2048;
-}
-
-template <size_t kOuter>
-HWY_INLINE constexpr size_t RowsPerStrip() {
-  // Aim for 128 work items to reduce pool overhead. Must be at least one
-  // vector; prefer a power of two for faster division.
-  constexpr size_t kLanes = hn::ScalableTag<float>().MaxLanes();
-  constexpr size_t kRowsPerStrip =
-      kOuter < 128 ? kLanes
-                   : HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(kOuter / 128));
-  return kRowsPerStrip;
-}
-
-namespace detail {
-
-// For each i = [0, num_rows), compute partial (length `num_cols`) dot product
-// of row i with `vec_aligned` and add into `out[i]`. The upper-left
-// coordinate of the tile is r0, c0.
-template <bool kVecEO, class DF, typename ArrayT, typename VecT>
-HWY_INLINE void AccumulatePartialDotProducts(
-    DF df, const ArrayT& mat, size_t mat_ofs, size_t mat_stride, size_t r0,
-    size_t c0, size_t num_rows, size_t num_cols,
-    const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
-  for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
-    const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
-    out[idx_row] +=
-        Dot<kVecEO>(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
-  }
-}
-
-// Same as AccumulatePartialDotProducts, but sets out[i] to the first partial
-// dot product + init (if kInit), which avoids having to zero-initialize and
-// accumulate.
-template <bool kVecEO, bool kInit, class DF, typename ArrayT, typename VecT,
-          typename InitT>
-HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat,
-                                           size_t mat_ofs, size_t mat_stride,
-                                           size_t r0, size_t c0,
-                                           size_t num_rows, size_t num_cols,
-                                           const VecT* HWY_RESTRICT vec_aligned,
-                                           const InitT* HWY_RESTRICT init,
-                                           float* HWY_RESTRICT out) {
-  for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
-    const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
-    if constexpr (kInit) {
-      out[idx_row] =
-          hwy::ConvertScalarTo<float>(init[idx_row + r0]) +
-          Dot<kVecEO>(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
-    } else {
-      out[idx_row] =
-          Dot<kVecEO>(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
-    }
-  }
-}
-
-// Adds together partial dot products for all tiles with the same r0 (a
-// horizontal strip of the entire matrix); the result is the full dot product
-// for rows r in [r0, r0 + num_rows) + optionally the add vector, which we
-// store into in out[r - r0].
-template <bool kVecEO, bool kAdd, class DF, typename ArrayT, typename VecT,
-          typename AddT>
-HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat,
-                                        size_t mat_ofs, size_t mat_stride,
-                                        size_t r0, size_t num_rows,
-                                        const VecT* HWY_RESTRICT vec_aligned,
-                                        const AddT* HWY_RESTRICT add,
-                                        float* HWY_RESTRICT out) {
-  // Tall and skinny: set `out` to the single dot product.
-  if (mat_stride < MaxCols()) {
-    SetFirstPartialDotProducts<kVecEO, kAdd>(df, mat, mat_ofs, mat_stride, r0,
-                                             0, num_rows, mat_stride,
-                                             vec_aligned, add, out);
-    return;
-  }
-
-  // We have at least MaxCols, so start by setting `out` to that:
-  SetFirstPartialDotProducts<kVecEO, kAdd>(df, mat, mat_ofs, mat_stride, r0, 0,
-                                           num_rows, MaxCols(), vec_aligned,
-                                           add, out);
-  // For further multiples of MaxCols, accumulate. Remainders handled below.
-  size_t c0 = MaxCols();
-  for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) {
-    AccumulatePartialDotProducts<kVecEO>(df, mat, mat_ofs, mat_stride, r0, c0,
-                                         num_rows, MaxCols(), vec_aligned, out);
-  }
-
-  if (c0 < mat_stride) {  // Final cols
-    AccumulatePartialDotProducts<kVecEO>(df, mat, mat_ofs, mat_stride, r0, c0,
-                                         num_rows, mat_stride - c0, vec_aligned,
-                                         out);
-  }
-}
-
-template <bool kVecIsEvenOdd, bool kAdd, size_t kOuter, size_t kInner,
-          typename ArrayT, typename VecT, typename AddT>
-HWY_INLINE void MatVecAddInner(const ArrayT& mat, const size_t mat_ofs,
-                               const VecT* HWY_RESTRICT const vec_aligned,
-                               const AddT* HWY_RESTRICT const add,
-                               float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
-  const hn::ScalableTag<float> df;
-  constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
-  constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
-
-  // For each entire strip.
-  pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
-    PROFILER_ZONE("MatVec.lambda");
-    const size_t r0 = strip * kRowsPerStrip;
-    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
-        df, mat, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add,
-        out + r0);
-  });
-
-  // Remaining rows
-  const size_t r0 = kNumStrips * kRowsPerStrip;
-  if (r0 < kOuter) {
-    PROFILER_ZONE("MatVec remainder");
-    const size_t num_rows = kOuter - r0;
-    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
-        df, mat, mat_ofs, kInner, r0, num_rows, vec_aligned, add, out + r0);
-  }
-}
-
-}  // namespace detail
-
-// Stores dot products of rows with `vec_aligned` + add the values from `add`
-// (if kAdd), then stores them to `out`.
-template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
-          typename VecT, typename AddT>
-HWY_INLINE void MatVecT(const ArrayT& mat, const size_t mat_ofs,
-                        const VecT* HWY_RESTRICT const vec_aligned,
-                        const AddT* HWY_RESTRICT const add,
-                        float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
-                        hwy::ThreadPool& pool) {
-  PROFILER_ZONE("MatVecAdd");
-
-#if !defined(HWY_NATIVE_DOT_BF16) || !HWY_NATIVE_DOT_BF16
-  using MatT = typename ArrayT::value_type;
-  // Sfp -> float does not benefit enough to recoup the cost of ToEvenOddF32.
-  if constexpr (CompressTraits<MatT>::kSupportsEvenOdd &&
-                hwy::IsSameEither<VecT, float, hwy::bfloat16_t>() &&
-                !(hwy::IsSame<MatT, SfpStream>() &&
-                  hwy::IsSame<VecT, float>())) {
-    ToEvenOddF32(vec_aligned, kInner, even_odd);
-    detail::MatVecAddInner</*kVecIsEvenOdd=*/true, kAdd, kOuter, kInner>(
-        mat, mat_ofs, even_odd, add, out, pool);
-    return;
-  }
-#else
-  (void)even_odd;
-#endif
-
-  detail::MatVecAddInner</*kVecIsEvenOdd=*/false, kAdd, kOuter, kInner>(
-      mat, mat_ofs, vec_aligned, add, out, pool);
-}
-
-// With addition
-template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT,
-          typename AddT>
-HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
-                          const VecT* HWY_RESTRICT const vec_aligned,
-                          const AddT* HWY_RESTRICT const add,
-                          float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
-                          hwy::ThreadPool& pool) {
-  return MatVecT</*kAdd=*/true, kOuter, kInner>(mat, mat_ofs, vec_aligned, add,
-                                                even_odd, out, pool);
-}
-
-// Without addition
-template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
-HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
-                       const VecT* HWY_RESTRICT const vec_aligned,
-                       float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
-                       hwy::ThreadPool& pool) {
-  MatVecT</*kAdd=*/false, kOuter, kInner>(mat, mat_ofs, vec_aligned,
-                                          /*add=*/static_cast<VecT*>(nullptr),
-                                          even_odd, out, pool);
-}
-
-// Two matrices, same vector
-template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
-          typename VecT, typename AddT>
-HWY_NOINLINE void TwoMatVecT(const ArrayT& mat0, const ArrayT& mat1,
-                             const size_t mat_ofs,
-                             const VecT* HWY_RESTRICT vec_aligned,
-                             const AddT* HWY_RESTRICT add0,
-                             const AddT* HWY_RESTRICT add1,
-                             float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
-                             hwy::ThreadPool& pool) {
-  PROFILER_ZONE("TwoMatVecAdd");
-
-  const hn::ScalableTag<float> df;
-  constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
-  constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
-  constexpr bool kVecIsEvenOdd = false;
-
-  // For each entire strip.
-  pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
-    PROFILER_ZONE("TwoMatVec.lambda");
-    const size_t r0 = strip * kRowsPerStrip;
-    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
-        df, mat0, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add0,
-        out0 + r0);
-    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
-        df, mat1, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add1,
-        out1 + r0);
-  });
-
-  // Remaining rows
-  const size_t r0 = kNumStrips * kRowsPerStrip;
-  if (r0 < kOuter) {
-    PROFILER_ZONE("TwoMatVec remainder");
-    const size_t num_rows = kOuter - r0;
-    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
-        df, mat0, mat_ofs, kInner, r0, num_rows, vec_aligned, add0, out0 + r0);
-    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
-        df, mat1, mat_ofs, kInner, r0, num_rows, vec_aligned, add1, out1 + r0);
-  }
-}
-
-// With addition
-template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT,
-          typename AddT>
-HWY_NOINLINE void TwoMatVecAdd(
-    const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs,
-    const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
-    const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
-    float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
-  return TwoMatVecT</*kAdd=*/true, kOuter, kInner>(
-      mat0, mat1, mat_ofs, vec_aligned, add0, add1, out0, out1, pool);
-}
-
-// Without addition
-template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
-HWY_NOINLINE void TwoMatVec(const ArrayT& mat0, const ArrayT& mat1,
-                            const size_t mat_ofs,
-                            const VecT* HWY_RESTRICT vec_aligned,
-                            float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
-                            hwy::ThreadPool& pool) {
-  TwoMatVecT</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
-      mat0, mat1, mat_ofs, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
-      out0, out1, pool);
-}
-
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace gcpp
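Everything removed above reappears verbatim in the new ops/matvec-inl.h added at
the end of this page. As a usage sketch (not part of the commit), here is how
the moved MatVecAdd entry point is driven, patterned on the TestMatVecAdd test
that this commit also deletes further below; GenerateMat and GenerateVec are
that deleted test's helpers, and the even_odd scratch sizing follows the same
test:

#include "hwy/aligned_allocator.h"
#include "hwy/contrib/thread_pool/thread_pool.h"

// Sketch mirroring the deleted TestMatVecAdd; assumes the test helpers
// GenerateMat/GenerateVec visible in the deleted block further below.
void ExampleMatVecAdd() {
  hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads());
  constexpr size_t kOuter = 128 * 3;  // rows of mat; length of add and out
  constexpr size_t kInner = 128 * 5;  // cols of mat; length of vec
  CompressedArray<float, kOuter * kInner> mat =
      GenerateMat<float, kOuter, kInner>(0, pool);
  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
  hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
  // Scratch for the even/odd-deinterleaved copy of vec, one slice per worker.
  hwy::AlignedFreeUniquePtr<float[]> even_odd =
      hwy::AllocateAligned<float>(kInner * pool.NumWorkers());
  hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(kOuter);
  // out[r] = add[r] + dot(row r of mat, vec), parallelized over row strips.
  MatVecAdd<kOuter, kInner>(mat, /*mat_ofs=*/0, vec.get(), add.get(),
                            even_odd.get(), out.get(), pool);
}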
@@ -21,9 +21,6 @@
 #include <stddef.h>
 #include <stdio.h>
 
-#include <algorithm>
-#include <array>
-#include <cmath>
 #include <memory>
 
 #include "compression/compress.h"

@@ -41,7 +38,6 @@
 #include "hwy/tests/test_util-inl.h"
 // After highway.h
 #include "ops/matmul-inl.h"
-#include "ops/ops-inl.h"  // MulByConst
 
 HWY_BEFORE_NAMESPACE();
 namespace gcpp {
@@ -49,102 +45,115 @@ namespace HWY_NAMESPACE {
 
 namespace hn = hwy::HWY_NAMESPACE;
 
-template <typename MatT, size_t kOuter, size_t kInner>
-std::unique_ptr<CompressedArray<MatT, kOuter * kInner>> GenerateMatHeap(
+// Generates inputs: deterministic, within max SfpStream range.
+template <typename MatT, size_t kRows, size_t kCols>
+std::unique_ptr<CompressedArray<MatT, kRows * kCols>> GenerateMatHeap(
     size_t offset, hwy::ThreadPool& pool) {
   gcpp::CompressWorkingSet ws;
   hwy::AlignedFreeUniquePtr<float[]> content =
-      hwy::AllocateAligned<float>(kOuter * kInner);
-  const float scale = 1.875f / (kInner * kOuter + offset);
-  pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) {
-    for (size_t j = 0; j < kInner; j++) {
-      content[i * kInner + j] =
-          static_cast<float>((i * kInner + j + offset) * scale);
+      hwy::AllocateAligned<float>(kRows * kCols);
+  const float scale = 1.875f / (kCols * kRows + offset);
+  pool.Run(0, kRows, [&](const size_t i, size_t /*thread*/) {
+    for (size_t j = 0; j < kCols; j++) {
+      content[i * kCols + j] =
+          static_cast<float>((i * kCols + j + offset) * scale);
     }
   });
 
-  std::unique_ptr<CompressedArray<MatT, kOuter * kInner>> mat =
-      std::make_unique<CompressedArray<MatT, kOuter * kInner>>();
-  Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0,
+  std::unique_ptr<CompressedArray<MatT, kRows * kCols>> mat =
+      std::make_unique<CompressedArray<MatT, kRows * kCols>>();
+  Compress(content.get(), kRows * kCols, ws, kRows * kCols, mat->data(), 0,
            pool);
   mat->set_scale(0.6f);  // Arbitrary value, different from 1.
   return mat;
 }
 
-template <typename MatT, size_t kOuter, size_t kInner>
-std::unique_ptr<CompressedArray<MatT, kOuter * kInner>>
-GenerateTransposeMatHeap(size_t offset, hwy::ThreadPool& pool) {
+template <typename MatT, size_t kRows, size_t kCols>
+std::unique_ptr<CompressedArray<MatT, kRows * kCols>> GenerateTransposeMatHeap(
+    size_t offset, hwy::ThreadPool& pool) {
   gcpp::CompressWorkingSet ws;
   hwy::AlignedFreeUniquePtr<float[]> content =
-      hwy::AllocateAligned<float>(kOuter * kInner);
-  const float scale = 1.875f / (kInner * kOuter + offset);
-  pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) {
-    for (size_t j = 0; j < kInner; j++) {
-      content[j * kOuter + i] =
-          static_cast<float>((i * kInner + j + offset) * scale);
+      hwy::AllocateAligned<float>(kRows * kCols);
+  const float scale = 1.875f / (kCols * kRows + offset);
+  pool.Run(0, kRows, [&](const size_t i, size_t /*thread*/) {
+    for (size_t j = 0; j < kCols; j++) {
+      content[j * kRows + i] =
+          static_cast<float>((i * kCols + j + offset) * scale);
     }
   });
 
-  std::unique_ptr<CompressedArray<MatT, kOuter * kInner>> mat =
-      std::make_unique<CompressedArray<MatT, kOuter * kInner>>();
-  Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0,
+  std::unique_ptr<CompressedArray<MatT, kRows * kCols>> mat =
+      std::make_unique<CompressedArray<MatT, kRows * kCols>>();
+  Compress(content.get(), kRows * kCols, ws, kRows * kCols, mat->data(), 0,
           pool);
   // Arbitrary value, different from 1, must match GenerateMatHeap.
   mat->set_scale(0.6f);
   return mat;
 }
 
-template <typename MatT, size_t kOuter, size_t kInner>
-std::unique_ptr<CompressedArray<MatT, kOuter * kInner>> GenerateZeroMatHeap(
+template <typename MatT, size_t kRows, size_t kCols>
+std::unique_ptr<CompressedArray<MatT, kRows * kCols>> GenerateZeroMatHeap(
     hwy::ThreadPool& pool) {
   gcpp::CompressWorkingSet ws;
   hwy::AlignedFreeUniquePtr<float[]> content =
-      hwy::AllocateAligned<float>(kOuter * kInner);
+      hwy::AllocateAligned<float>(kRows * kCols);
 
-  pool.Run(0, kOuter, [&](const size_t i, size_t thread) {
-    hwy::ZeroBytes(&content[i * kInner], kInner * sizeof(content[0]));
+  pool.Run(0, kRows, [&](const size_t i, size_t thread) {
+    hwy::ZeroBytes(&content[i * kCols], kCols * sizeof(content[0]));
   });
 
-  std::unique_ptr<CompressedArray<MatT, kOuter * kInner>> mat =
-      std::make_unique<CompressedArray<MatT, kOuter * kInner>>();
-  Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0,
+  std::unique_ptr<CompressedArray<MatT, kRows * kCols>> mat =
+      std::make_unique<CompressedArray<MatT, kRows * kCols>>();
+  Compress(content.get(), kRows * kCols, ws, kRows * kCols, mat->data(), 0,
           pool);
   mat->set_scale(1.2f);  // Arbitrary value, different from 1.
   return mat;
 }
 
-// A simple matrix multiplication. No optimization / tiling.
-template <size_t kM, size_t kN, size_t kK>
-hwy::AlignedFreeUniquePtr<float[]> SimpleMatMul(
-    const hwy::AlignedFreeUniquePtr<float[]>& a,
-    const hwy::AlignedFreeUniquePtr<float[]>& b) {
-  hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(kM * kK);
-  hwy::ZeroBytes(out.get(), kM * kK * sizeof(float));
-
-  int i, j, k;
-  for (i = 0; i < kM; ++i) {
-    for (j = 0; j < kK; ++j) {
-      for (k = 0; k < kN; ++k) {
-        out[i * kK + j] += a[i * kN + k] * b[k * kK + j];
-      }
-    }
-  }
-
-  return out;
+template <typename MatT>
+void Decompress(const MatT* compressed, size_t num, float* out) {
+  const hn::ScalableTag<float> d;
+  hwy::AlignedFreeUniquePtr<float[]> b = hwy::AllocateAligned<float>(num);
+  CompressTraits<MatT>::Decompress(d, /*in_capacity=*/0, compressed, 0, out,
+                                   num);
 }
 
-template <typename MatT>
-void AssertClose(const MatT* HWY_RESTRICT expected,
-                 const MatT* HWY_RESTRICT actual, size_t num) {
-  for (size_t idx = 0; idx < num; idx++) {
-    const double expected_value = hwy::ConvertScalarTo<double>(expected[idx]);
-    const double actual_value = hwy::ConvertScalarTo<double>(actual[idx]);
-
-    const double magnitude = std::abs(expected_value);
-    const double tolerance =
-        256.0 * hwy::ConvertScalarTo<double>(hwy::Epsilon<MatT>()) *
-        HWY_MAX(magnitude, 1.0);
+// Returns 1-norm, used for estimating tolerable numerical differences.
+double MaxColAbsSum(const float* HWY_RESTRICT a, size_t rows, size_t cols) {
+  double max_col_abs_sum = 0.0;
+  for (size_t c = 0; c < cols; c++) {
+    double col_abs_sum = 0.0;
+    for (size_t r = 0; r < rows; r++) {
+      col_abs_sum += hwy::ScalarAbs(a[r * cols + c]);
+    }
+    max_col_abs_sum = HWY_MAX(max_col_abs_sum, col_abs_sum);
+  }
+  return max_col_abs_sum;
+}
+
+template <typename MatTA, typename MatTB>
+void AssertClose(size_t rows_ac, size_t cols_ab, size_t cols_c_rows_b,
+                 const MatTA* HWY_RESTRICT a_compr,
+                 const MatTB* HWY_RESTRICT b_trans_compr,
+                 const float* HWY_RESTRICT expected_c,
+                 const float* HWY_RESTRICT actual_c) {
+  const size_t num_a = rows_ac * cols_ab;
+  const size_t num_b = cols_c_rows_b * cols_ab;
+  const size_t num_c = rows_ac * cols_c_rows_b;
+  hwy::AlignedFreeUniquePtr<float[]> a = hwy::AllocateAligned<float>(num_a);
+  hwy::AlignedFreeUniquePtr<float[]> b_trans =
+      hwy::AllocateAligned<float>(num_b);
+  Decompress(a_compr, num_a, a.get());
+  Decompress(b_trans_compr, num_b, b_trans.get());
+
+  const double norm = MaxColAbsSum(a.get(), rows_ac, cols_ab) *
+                      MaxColAbsSum(b_trans.get(), cols_c_rows_b, cols_ab);
+  const double epsilon = hwy::ConvertScalarTo<double>(hwy::Epsilon<float>());
+  const double tolerance = 50.0 * norm * epsilon;
+
+  for (size_t idx = 0; idx < num_c; idx++) {
+    const double expected_value = expected_c[idx];
+    const double actual_value = actual_c[idx];
 
     if (!(expected_value - tolerance <= actual_value &&
           actual_value <= expected_value + tolerance)) {
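On the new tolerance (a gloss on the commit message's "max abs col sum", not
text from the commit): to first order in the rounding unit eps, a length-n
float dot product satisfies the standard bound below, and every entry of
C = A * B is such a dot product. MaxColAbsSum computes the induced 1-norm
(maximum absolute column sum); applying it to A and to the stored B-transpose
gives a convenient upper scale for those |a|^T |b| terms, with the small
constant 50 standing in for the pessimistic dimension factor n:

    |\mathrm{fl}(a^\top b) - a^\top b| \le n\,\varepsilon\,|a|^\top|b|
        \quad\text{(to first order in }\varepsilon\text{)}
    \mathrm{tolerance} = 50\,\varepsilon\,\|A\|_1\,\|B^\top\|_1,
        \qquad \|M\|_1 = \max_j \textstyle\sum_i |M_{ij}|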
@@ -154,35 +163,23 @@ void AssertClose(const MatT* HWY_RESTRICT expected,
     }
   }
 }
 
-template <size_t length>
-void AssertClose(const hwy::AlignedFreeUniquePtr<float[]>& a,
-                 const hwy::AlignedFreeUniquePtr<float[]>& b) {
-  for (size_t idx = 0; idx < length; idx++) {
-    const float rel_abs_delta = std::abs(a[idx] - b[idx]) /
-                                std::max(std::abs(a[idx]), std::abs(b[idx]));
-    EXPECT_LT(rel_abs_delta, 2e-6)
-        << "a[" << idx << "]=" << a[idx] << ", b[" << idx << "]=" << b[idx];
-  }
-}
-
 // Largely unoptimized; reordered innermost loops nets ~5-10X speedup.
-template <size_t kN, size_t kK, typename MatTA, typename MatTB,
-          HWY_IF_T_SIZE_GT(MatTB, 1)>
-HWY_INLINE void MatMulSlowBatch(size_t batch_size, const MatTA* HWY_RESTRICT a,
-                                const MatTB* HWY_RESTRICT b, const float scale,
-                                const float* add, float* HWY_RESTRICT out) {
-  for (size_t i = 0; i < batch_size; ++i) {
-    for (size_t k = 0; k < kN; ++k) {
-      for (size_t j = 0; j < kK; ++j) {
-        const float a1 = hwy::ConvertScalarTo<float>(a[i * kN + k]);
-        const float b1 = hwy::ConvertScalarTo<float>(b[k * kK + j]);
-        out[i * kK + j] += scale * a1 * b1;
+template <typename MatTA, typename MatTB, HWY_IF_T_SIZE_GT(MatTB, 1)>
+HWY_INLINE void MatMulSlow(size_t rows_ac, size_t cols_a_rows_b, size_t cols_bc,
+                           const MatTA* HWY_RESTRICT a,
+                           const MatTB* HWY_RESTRICT b, const float scale,
+                           const float* add, float* HWY_RESTRICT out) {
+  for (size_t i = 0; i < rows_ac; ++i) {
+    for (size_t k = 0; k < cols_a_rows_b; ++k) {
+      for (size_t j = 0; j < cols_bc; ++j) {
+        const float a1 = hwy::ConvertScalarTo<float>(a[i * cols_a_rows_b + k]);
+        const float b1 = hwy::ConvertScalarTo<float>(b[k * cols_bc + j]);
+        out[i * cols_bc + j] += scale * a1 * b1;
       }
     }
     if (add != nullptr) {
-      for (size_t j = 0; j < kK; ++j) {
-        out[i * kK + j] += add[j];
+      for (size_t j = 0; j < cols_bc; ++j) {
+        out[i * cols_bc + j] += add[j];
       }
     }
   }
 }
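Why the i-k-j nesting in MatMulSlow above earns the "~5-10X" comment (an
illustrative aside, not from the commit): with k in the middle, the innermost
j-loop walks a row of b and a row of out with unit stride, which prefetches
and auto-vectorizes well, whereas the textbook i-j-k order strides through b
by a full row on every step. A self-contained sketch of the two orders:

#include <stddef.h>

// Textbook order: the inner k-loop strides through b by N floats per step.
void MatMulIJK(size_t M, size_t K, size_t N, const float* a, const float* b,
               float* out) {
  for (size_t i = 0; i < M; ++i) {
    for (size_t j = 0; j < N; ++j) {
      float sum = 0.0f;
      for (size_t k = 0; k < K; ++k) sum += a[i * K + k] * b[k * N + j];
      out[i * N + j] = sum;
    }
  }
}

// Reordered as in MatMulSlow: unit-stride b and out in the inner loop.
// out must be zero-initialized, as GenerateZeroMatHeap does in the test.
void MatMulIKJ(size_t M, size_t K, size_t N, const float* a, const float* b,
               float* out) {
  for (size_t i = 0; i < M; ++i) {
    for (size_t k = 0; k < K; ++k) {
      const float a_ik = a[i * K + k];  // invariant across j
      for (size_t j = 0; j < N; ++j) {
        out[i * N + j] += a_ik * b[k * N + j];
      }
    }
  }
}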
@@ -190,74 +187,78 @@ HWY_INLINE void MatMulSlowBatch(size_t batch_size, const MatTA* HWY_RESTRICT a,
 
 // The above overload can handle combinations of f32 and bf16, but this one
 // is required for MatTB = {SFP, NUQ}.
-template <size_t kN, size_t kK, typename MatTA, typename MatTB,
-          HWY_IF_T_SIZE(MatTB, 1)>
-HWY_INLINE void MatMulSlowBatch(size_t batch_size, const MatTA* HWY_RESTRICT a,
-                                const MatTB* HWY_RESTRICT b_compr,
-                                const float scale, const float* add,
-                                float* HWY_RESTRICT out) {
+template <typename MatTA, typename MatTB, HWY_IF_T_SIZE(MatTB, 1)>
+HWY_INLINE void MatMulSlow(size_t rows_ac, size_t cols_a_rows_b, size_t cols_bc,
+                           const MatTA* HWY_RESTRICT a,
+                           const MatTB* HWY_RESTRICT b_compr, const float scale,
+                           const float* add, float* HWY_RESTRICT out) {
   const hn::ScalableTag<float> d;
-  hwy::AlignedFreeUniquePtr<float[]> b = hwy::AllocateAligned<float>(kK * kN);
+  hwy::AlignedFreeUniquePtr<float[]> b =
+      hwy::AllocateAligned<float>(cols_a_rows_b * cols_bc);
   CompressTraits<MatTB>::Decompress(d, /*in_capacity=*/0, b_compr, 0, b.get(),
-                                    kK * kN);
-  MatMulSlowBatch<kN, kK>(batch_size, a, b.get(), scale, add, out);
+                                    cols_a_rows_b * cols_bc);
+  MatMulSlow(rows_ac, cols_a_rows_b, cols_bc, a, b.get(), scale, add, out);
 }
 
-void PrintSpeed(const char* algo, size_t M, size_t N, size_t K,
-                double elapsed) {
-  // * 2 because of FMA.
+void PrintSpeed(const char* algo, size_t rows_ac, size_t cols_a_rows_b,
+                size_t cols_bc, double elapsed) {
+  // 2 because of FMA.
   fprintf(stderr, "%s: %f seconds, %f GFLOPS.\n", algo, elapsed,
-          2E-9 * M * N * K / elapsed);
+          2E-9 * rows_ac * cols_a_rows_b * cols_bc / elapsed);
 }
 
-template <size_t kM, size_t kN, size_t kK, bool kAdd, typename MatTA,
-          typename MatTB = MatTA>
+template <size_t kRowsAC, size_t kColsARowsB, size_t kColsBC, bool kAdd,
+          typename MatTA, typename MatTB = MatTA>
 void TestMatMul(hwy::ThreadPool& pool) {
   using TraitsA = CompressTraits<MatTA>;
   using TraitsB = CompressTraits<MatTB>;
-  fprintf(stderr, "TestMatMul %lu, %lu, %lu, add=%d, MatTA=%s, MatTB=%s\n", kM,
-          kN, kK, kAdd, TraitsA::Name(), TraitsB::Name());
+  const bool want_bench = kColsBC > 2000;  // avoid spam for small matrices
+  fprintf(stderr, "TestMatMul %lu, %lu, %lu, add=%d, MatTA=%s, MatTB=%s\n",
+          kRowsAC, kColsARowsB, kColsBC, kAdd, TraitsA::Name(),
+          TraitsB::Name());
 
-  std::unique_ptr<CompressedArray<MatTA, kM * kN>> a =
-      GenerateMatHeap<MatTA, kM, kN>(0, pool);
-  std::unique_ptr<CompressedArray<MatTB, kN * kK>> b_trans =
-      GenerateTransposeMatHeap<MatTB, kN, kK>(0, pool);
-  hwy::AlignedFreeUniquePtr<float[]> c = hwy::AllocateAligned<float>(kM * kK);
+  std::unique_ptr<CompressedArray<MatTA, kRowsAC * kColsARowsB>> a =
+      GenerateMatHeap<MatTA, kRowsAC, kColsARowsB>(0, pool);
+  std::unique_ptr<CompressedArray<MatTB, kColsARowsB * kColsBC>> b_trans =
+      GenerateTransposeMatHeap<MatTB, kColsARowsB, kColsBC>(0, pool);
+  hwy::AlignedFreeUniquePtr<float[]> c =
+      hwy::AllocateAligned<float>(kRowsAC * kColsBC);
 
   const float scale = a->scale() * b_trans->scale();
-  std::unique_ptr<CompressedArray<float, kK>> add;
+  std::unique_ptr<CompressedArray<float, kColsBC>> add;
   if (kAdd) {
-    add = GenerateMatHeap<float, 1, kK>(0, pool);
+    add = GenerateMatHeap<float, 1, kColsBC>(0, pool);
     add->set_scale(1.0f);
   }
 
-  std::unique_ptr<CompressedArray<float, kM * kK>> c_slow;
-  const bool compare_slow = kN < 2048;
-  if (compare_slow) {
-    std::unique_ptr<CompressedArray<MatTB, kN * kK>> b =
-        GenerateMatHeap<MatTB, kN, kK>(0, pool);
-    HWY_ASSERT_EQ(scale, a->scale() * b->scale());
-    c_slow = GenerateZeroMatHeap<float, kM, kK>(pool);
-    const double start_slow = hwy::platform::Now();
-    MatMulSlowBatch<kN, kK>(kM, a->data(), b->data(), scale,
-                            kAdd ? add->data() : nullptr, c_slow->data());
-    PrintSpeed("MatMulSlowBatch", kM, kN, kK,
-               hwy::platform::Now() - start_slow);
-  }
+  std::unique_ptr<CompressedArray<MatTB, kColsARowsB * kColsBC>> b =
+      GenerateMatHeap<MatTB, kColsARowsB, kColsBC>(0, pool);
+  HWY_ASSERT_EQ(scale, a->scale() * b->scale());
+  std::unique_ptr<CompressedArray<float, kRowsAC * kColsBC>> c_slow =
+      GenerateZeroMatHeap<float, kRowsAC, kColsBC>(pool);
+  const double start_slow = hwy::platform::Now();
+  MatMulSlow(kRowsAC, kColsARowsB, kColsBC, a->data(), b->data(), scale,
+             kAdd ? add->data() : nullptr, c_slow->data());
+  if (want_bench) {
+    PrintSpeed("MatMulSlow", kRowsAC, kColsARowsB, kColsBC,
+               hwy::platform::Now() - start_slow);
+  }
 
   double min_elapsed = hwy::HighestValue<double>();
-  for (int rep = 0; rep < (compare_slow ? 1 : 3); ++rep) {
+  for (int rep = 0; rep < (want_bench ? 3 : 1); ++rep) {
     const double start_tiled = hwy::platform::Now();
-    MatMul_4x4<kAdd>(kM, MakeMat(a->data(), kN), MakeMat(b_trans->data(), kN),
-                     scale, kAdd ? add->data_scale1() : nullptr,
-                     MakeMat(c.get(), kK), pool);
+    MatMul_4x4<kAdd>(kRowsAC, MakeMat(a->data(), kColsARowsB),
+                     MakeMat(b_trans->data(), kColsARowsB), scale,
+                     kAdd ? add->data_scale1() : nullptr,
+                     MakeMat(c.get(), kColsBC), pool);
     min_elapsed = HWY_MIN(min_elapsed, hwy::platform::Now() - start_tiled);
   }
-  PrintSpeed("MatMul_4x4", kM, kN, kK, min_elapsed);
-
-  if (compare_slow) {
-    AssertClose(c_slow->data(), c.get(), kM * kK);
-  }
+  if (want_bench) {
+    PrintSpeed("MatMul_4x4", kRowsAC, kColsARowsB, kColsBC, min_elapsed);
+  }
+
+  AssertClose(kRowsAC, kColsARowsB, kColsBC, a->data(), b_trans->data(),
+              c_slow->data(), c.get());
 }
 
 void TestAllMatMul() {

@@ -274,6 +275,7 @@ void TestAllMatMul() {
 
   // large-scale test
   TestMatMul<64, 24576, 3072, /*kAdd=*/false, F32, SFP>(pool);
+  TestMatMul<64, 3072, 24576, /*kAdd=*/false, F32, SFP>(pool);
 
   // medium-sized square test
   TestMatMul<512, 512, 512, /*kAdd=*/false, F32>(pool);

@@ -283,7 +285,7 @@ void TestAllMatMul() {
 TestMatMul<512, 512, 512, /*kAdd=*/false, F32, SFP>(pool);
 TestMatMul<512, 512, 512, /*kAdd=*/true, BF16, SFP>(pool);
 
-  // minimal non-square test. kK must be at least 2 vectors.
+  // minimal non-square test. kColsARowsB must be at least 2 vectors.
   TestMatMul<35, 128, 32, /*kAdd=*/false, F32>(pool);
   TestMatMul<34, 128, 32, /*kAdd=*/true, BF16>(pool);
   TestMatMul<33, 128, 32, /*kAdd=*/false, F32, BF16>(pool);
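For scale, a worked FLOP count for the benchmark shape added above (an
editorial example; the formula matches PrintSpeed):

// TestMatMul<64, 3072, 24576, ...>: A is 64x3072, B is 3072x24576.
// Each of the 64 * 24576 outputs takes 3072 multiply-adds = 2 FLOPs each.
constexpr double kFlops = 2.0 * 64 * 3072 * 24576;  // ~9.66e9
// PrintSpeed reports 2E-9 * rows_ac * cols_a_rows_b * cols_bc / elapsed,
// so one pass finishing in 0.1 s would print ~96.6 GFLOPS.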
@@ -316,129 +318,6 @@
   TestMatMul<1, 128, 32, /*kAdd=*/true, BF16, SFP>(pool);
 }
 
-template <size_t kOuter, size_t kInner>
-hwy::AlignedFreeUniquePtr<float[]> SimpleMatVecAdd(
-    const CompressedArray<float, kOuter * kInner>& mat,
-    const hwy::AlignedFreeUniquePtr<float[]>& vec,
-    const hwy::AlignedFreeUniquePtr<float[]>& add) {
-  hwy::AlignedFreeUniquePtr<float[]> uncompressed_mat =
-      hwy::AllocateAligned<float>(kOuter * kInner);
-  hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(kOuter);
-  HWY_ASSERT(uncompressed_mat && out);
-  Decompress(mat, 0, uncompressed_mat.get(), kOuter * kInner);
-  MulByConst(mat.scale(), uncompressed_mat.get(), kOuter * kInner);
-  for (size_t idx_row = 0; idx_row < kOuter; idx_row++) {
-    out[idx_row] = add[idx_row];
-    for (size_t idx_col = 0; idx_col < kInner; idx_col++) {
-      out[idx_row] +=
-          uncompressed_mat[kInner * idx_row + idx_col] * vec[idx_col];
-    }
-  }
-  return out;
-}
-
-template <typename MatT, size_t kOuter, size_t kInner>
-CompressedArray<MatT, kOuter * kInner> GenerateMat(size_t offset,
-                                                   hwy::ThreadPool& pool) {
-  gcpp::CompressWorkingSet ws;
-  CompressedArray<MatT, kOuter * kInner> mat;
-  std::array<float, kOuter * kInner> content;
-  const float scale = 1.0f / kInner;
-  pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) {
-    for (size_t j = 0; j < kInner; j++) {
-      content[i * kInner + j] =
-          static_cast<float>((i * kInner + j + offset) * scale);
-    }
-  });
-
-  Compress(content, ws, mat, pool);
-  mat.set_scale(1.9f);  // Arbitrary value, different from 1.
-  return mat;
-}
-
-template <size_t length>
-hwy::AlignedFreeUniquePtr<float[]> GenerateVec(size_t offset) {
-  hwy::AlignedFreeUniquePtr<float[]> vec = hwy::AllocateAligned<float>(length);
-  HWY_ASSERT(vec);
-  for (size_t idx = 0; idx < length; idx++) {
-    vec[idx] = static_cast<float>(idx + offset);
-  }
-  return vec;
-}
-
-void TestMatVecAdd() {
-  hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads());
-  constexpr size_t kOuter = 128 * 3;
-  constexpr size_t kInner = 128 * 5;
-  CompressedArray<float, kOuter * kInner> mat =
-      GenerateMat<float, kOuter, kInner>(0, pool);
-  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
-  hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
-  hwy::AlignedFreeUniquePtr<float[]> even_odd =
-      hwy::AllocateAligned<float>(kInner * pool.NumWorkers());
-  hwy::AlignedFreeUniquePtr<float[]> expected_out =
-      SimpleMatVecAdd<kOuter, kInner>(mat, vec, add);
-  hwy::AlignedFreeUniquePtr<float[]> actual_out =
-      hwy::AllocateAligned<float>(kOuter);
-  HWY_ASSERT(vec && add && even_odd && expected_out && actual_out);
-  MatVecAdd<kOuter, kInner>(mat, 0, vec.get(), add.get(), even_odd.get(),
-                            actual_out.get(), pool);
-  AssertClose<kOuter>(actual_out, expected_out);
-}
-
-void TestTwoMatVecAdd() {
-  hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads());
-  constexpr size_t kOuter = 128 * 3;
-  constexpr size_t kInner = 128 * 5;
-  CompressedArray<float, kOuter * kInner> mat0 =
-      GenerateMat<float, kOuter, kInner>(0, pool);
-  CompressedArray<float, kOuter * kInner> mat1 =
-      GenerateMat<float, kOuter, kInner>(1, pool);
-  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
-  hwy::AlignedFreeUniquePtr<float[]> add0 = GenerateVec<kOuter>(0);
-  hwy::AlignedFreeUniquePtr<float[]> add1 = GenerateVec<kOuter>(1);
-  hwy::AlignedFreeUniquePtr<float[]> expected_out0 =
-      SimpleMatVecAdd<kOuter, kInner>(mat0, vec, add0);
-  hwy::AlignedFreeUniquePtr<float[]> expected_out1 =
-      SimpleMatVecAdd<kOuter, kInner>(mat1, vec, add1);
-  hwy::AlignedFreeUniquePtr<float[]> actual_out0 =
-      hwy::AllocateAligned<float>(kOuter);
-  hwy::AlignedFreeUniquePtr<float[]> actual_out1 =
-      hwy::AllocateAligned<float>(kOuter);
-  HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
-             expected_out1 && actual_out1);
-  TwoMatVecAdd<kOuter, kInner>(mat0, mat1, 0, vec.get(), add0.get(), add1.get(),
-                               actual_out0.get(), actual_out1.get(), pool);
-  AssertClose<kOuter>(actual_out0, expected_out0);
-  AssertClose<kOuter>(actual_out1, expected_out1);
-}
-
-void TestTwoOfsMatVecAddLoop() {
-  hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads());
-  constexpr size_t kOuter = 128 * 3;
-  constexpr size_t kInner = 128 * 5;
-  CompressedArray<float, kOuter * kInner> mat =
-      GenerateMat<float, kOuter, kInner>(0, pool);
-  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
-  hwy::AlignedFreeUniquePtr<float[]> add0 = GenerateVec<kOuter>(0);
-  hwy::AlignedFreeUniquePtr<float[]> add1 = GenerateVec<kOuter>(1);
-  hwy::AlignedFreeUniquePtr<float[]> expected_out0 =
-      SimpleMatVecAdd<kOuter, kInner>(mat, vec, add0);
-  hwy::AlignedFreeUniquePtr<float[]> expected_out1 =
-      SimpleMatVecAdd<kOuter, kInner>(mat, vec, add1);
-  hwy::AlignedFreeUniquePtr<float[]> actual_out0 =
-      hwy::AllocateAligned<float>(kOuter);
-  hwy::AlignedFreeUniquePtr<float[]> actual_out1 =
-      hwy::AllocateAligned<float>(kOuter);
-  HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
-             expected_out1 && actual_out1);
-  TwoOfsMatVecAddLoop<kOuter, kInner>(mat, 0, 0, vec.get(), add0.get(),
-                                      add1.get(), actual_out0.get(),
-                                      actual_out1.get());
-  AssertClose<kOuter>(actual_out0, expected_out0);
-  AssertClose<kOuter>(actual_out1, expected_out1);
-}
-
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace gcpp

@@ -449,9 +328,6 @@ HWY_AFTER_NAMESPACE();
 namespace gcpp {
 HWY_BEFORE_TEST(MatmulTest);
 HWY_EXPORT_AND_TEST_P(MatmulTest, TestAllMatMul);
-HWY_EXPORT_AND_TEST_P(MatmulTest, TestMatVecAdd);
-HWY_EXPORT_AND_TEST_P(MatmulTest, TestTwoMatVecAdd);
-HWY_EXPORT_AND_TEST_P(MatmulTest, TestTwoOfsMatVecAddLoop);
 HWY_AFTER_TEST();
 
 }  // namespace gcpp
@ -0,0 +1,357 @@
|
||||||
|
// Copyright 2024 Google LLC
|
||||||
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// https://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
// Include guard for non-SIMD code.
|
||||||
|
#ifndef THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_
|
||||||
|
#define THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "compression/compress.h"
|
||||||
|
#include "compression/sfp.h"
|
||||||
|
#include "hwy/base.h"
|
||||||
|
#include "hwy/contrib/thread_pool/thread_pool.h"
|
||||||
|
#include "hwy/profiler.h"
|
||||||
|
|
||||||
|
#endif // THIRD_PARTY_GEMMA_CPP_OPS_MATVEC_INL_H_
|
||||||
|
|
||||||
|
// Include guard for (potentially) SIMD code.
|
||||||
|
#if defined(THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE) == defined(HWY_TARGET_TOGGLE)
|
||||||
|
#ifdef THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE
|
||||||
|
#undef THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE
|
||||||
|
#else
|
||||||
|
#define THIRD_PARTY_GEMMA_CPP_MATVEC_TOGGLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "compression/compress-inl.h"
|
||||||
|
#include "hwy/contrib/dot/dot-inl.h"
|
||||||
|
#include "hwy/contrib/math/math-inl.h"
|
||||||
|
#include "hwy/contrib/matvec/matvec-inl.h"
|
||||||
|
|
||||||
|
HWY_BEFORE_NAMESPACE();
|
||||||
|
namespace gcpp {
|
||||||
|
namespace HWY_NAMESPACE {
|
||||||
|
namespace hn = hwy::HWY_NAMESPACE;
|
||||||
|
|
||||||
|
HWY_INLINE void ToEvenOddF32(const hwy::bfloat16_t* HWY_RESTRICT vec_aligned,
|
||||||
|
const size_t size, float* HWY_RESTRICT out) {
|
||||||
|
const hn::ScalableTag<float> df;
|
||||||
|
const hn::Repartition<hwy::bfloat16_t, decltype(df)> dbf16;
|
||||||
|
|
||||||
|
HWY_DASSERT(size % hn::Lanes(dbf16) == 0);
|
||||||
|
HWY_DASSERT(hn::IsAligned(df, vec_aligned));
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size; i += hn::Lanes(dbf16)) {
|
||||||
|
const auto interleaved = hn::LoadU(dbf16, vec_aligned + i);
|
||||||
|
hn::Store(hn::PromoteEvenTo(df, interleaved), df, out + i);
|
||||||
|
hn::Store(hn::PromoteOddTo(df, interleaved), df, out + i + hn::Lanes(df));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
HWY_INLINE void ToEvenOddF32(const float* HWY_RESTRICT vec_aligned,
|
||||||
|
const size_t size, float* HWY_RESTRICT out) {
|
||||||
|
const hn::ScalableTag<float> df;
|
||||||
|
using VF = hn::Vec<decltype(df)>;
|
||||||
|
|
||||||
|
HWY_DASSERT(size % (hn::Lanes(df) * 2) == 0);
|
||||||
|
HWY_DASSERT(hn::IsAligned(df, vec_aligned));
|
||||||
|
|
||||||
|
VF vec0, vec1;
|
||||||
|
for (size_t i = 0; i < size; i += hn::Lanes(df) * 2) {
|
||||||
|
hn::LoadInterleaved2(df, vec_aligned + i, vec0, vec1);
|
||||||
|
hn::Store(vec0, df, out + i);
|
||||||
|
hn::Store(vec1, df, out + i + hn::Lanes(df));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Simple version without tiling nor threading, but two offsets/outputs and
|
||||||
|
// always with addition.
|
||||||
|
template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT,
|
||||||
|
typename AddT>
|
||||||
|
HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0,
|
||||||
|
const size_t mat_ofs1,
|
||||||
|
const VecT* HWY_RESTRICT vec_aligned,
|
||||||
|
const AddT* HWY_RESTRICT add0,
|
||||||
|
const AddT* HWY_RESTRICT add1,
|
||||||
|
float* HWY_RESTRICT out0,
|
||||||
|
float* HWY_RESTRICT out1) {
|
||||||
|
PROFILER_ZONE("TwoOfsMatVecAddLoop");
|
||||||
|
constexpr bool kVecEO = false;
|
||||||
|
const hn::ScalableTag<float> df;
|
||||||
|
|
||||||
|
for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
|
||||||
|
const size_t row_ofs0 = mat_ofs0 + (idx_row)*kInner;
|
||||||
|
const size_t row_ofs1 = mat_ofs1 + (idx_row)*kInner;
|
||||||
|
out0[idx_row] = hwy::ConvertScalarTo<float>(add0[idx_row]) +
|
||||||
|
Dot<kVecEO>(df, mat, row_ofs0, vec_aligned, kInner);
|
||||||
|
out1[idx_row] = hwy::ConvertScalarTo<float>(add1[idx_row]) +
|
||||||
|
Dot<kVecEO>(df, mat, row_ofs1, vec_aligned, kInner);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
HWY_INLINE constexpr size_t MaxCols() {
|
||||||
|
// Vec + mat rows should fit into 32 KiB L1.
|
||||||
|
return 2048;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t kOuter>
|
||||||
|
HWY_INLINE constexpr size_t RowsPerStrip() {
|
||||||
|
// Aim for 128 work items to reduce pool overhead. Must be at least one
|
||||||
|
// vector; prefer a power of two for faster division.
|
||||||
|
constexpr size_t kLanes = hn::ScalableTag<float>().MaxLanes();
|
||||||
|
constexpr size_t kRowsPerStrip =
|
||||||
|
kOuter < 128 ? kLanes
|
||||||
|
: HWY_MAX(kLanes, 1ULL << hwy::FloorLog2(kOuter / 128));
|
||||||
|
return kRowsPerStrip;
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace detail {
|
||||||
|
|
||||||
|
// For each i = [0, num_rows), compute partial (length `num_cols`) dot product
|
||||||
|
// of row i with `vec_aligned` and add into `out[i]`. The upper-left
|
||||||
|
// coordinate of the tile is r0, c0.
|
||||||
|
template <bool kVecEO, class DF, typename ArrayT, typename VecT>
|
||||||
|
HWY_INLINE void AccumulatePartialDotProducts(
|
||||||
|
DF df, const ArrayT& mat, size_t mat_ofs, size_t mat_stride, size_t r0,
|
||||||
|
size_t c0, size_t num_rows, size_t num_cols,
|
||||||
|
const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
|
||||||
|
for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
|
||||||
|
const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
|
||||||
|
out[idx_row] +=
|
||||||
|
Dot<kVecEO>(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Same as AccumulatePartialDotProducts, but sets out[i] to the first partial
|
||||||
|
// dot product + init (if kInit), which avoids having to zero-initialize and
|
||||||
|
// accumulate.
|
||||||
|
template <bool kVecEO, bool kInit, class DF, typename ArrayT, typename VecT,
|
||||||
|
typename InitT>
|
||||||
|
HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat,
|
||||||
|
size_t mat_ofs, size_t mat_stride,
|
||||||
|
size_t r0, size_t c0,
|
||||||
|
size_t num_rows, size_t num_cols,
|
||||||
|
const VecT* HWY_RESTRICT vec_aligned,
|
||||||
|
const InitT* HWY_RESTRICT init,
|
||||||
|
float* HWY_RESTRICT out) {
|
||||||
|
for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
|
||||||
|
const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
|
||||||
|
if constexpr (kInit) {
|
||||||
|
out[idx_row] =
|
||||||
|
hwy::ConvertScalarTo<float>(init[idx_row + r0]) +
|
||||||
|
Dot<kVecEO>(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
|
||||||
|
} else {
|
||||||
|
out[idx_row] =
|
||||||
|
Dot<kVecEO>(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Adds together partial dot products for all tiles with the same r0 (a
|
||||||
|
// horizontal strip of the entire matrix); the result is the full dot product
|
||||||
|
// for rows r in [r0, r0 + num_rows) + optionally the add vector, which we
|
||||||
|
// store into in out[r - r0].
|
||||||
|
template <bool kVecEO, bool kAdd, class DF, typename ArrayT, typename VecT,
|
||||||
|
typename AddT>
|
||||||
|
HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat,
|
||||||
|
size_t mat_ofs, size_t mat_stride,
|
||||||
|
size_t r0, size_t num_rows,
|
||||||
|
const VecT* HWY_RESTRICT vec_aligned,
|
||||||
|
const AddT* HWY_RESTRICT add,
|
||||||
|
float* HWY_RESTRICT out) {
|
||||||
|
// Tall and skinny: set `out` to the single dot product.
|
||||||
|
if (mat_stride < MaxCols()) {
|
||||||
|
SetFirstPartialDotProducts<kVecEO, kAdd>(df, mat, mat_ofs, mat_stride, r0,
|
||||||
|
0, num_rows, mat_stride,
|
||||||
|
vec_aligned, add, out);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We have at least MaxCols, so start by setting `out` to that:
|
||||||
|
SetFirstPartialDotProducts<kVecEO, kAdd>(df, mat, mat_ofs, mat_stride, r0, 0,
|
||||||
|
num_rows, MaxCols(), vec_aligned,
|
||||||
|
add, out);
|
||||||
|
// For further multiples of MaxCols, accumulate. Remainders handled below.
|
||||||
|
size_t c0 = MaxCols();
|
||||||
|
for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) {
|
||||||
|
AccumulatePartialDotProducts<kVecEO>(df, mat, mat_ofs, mat_stride, r0, c0,
|
||||||
|
num_rows, MaxCols(), vec_aligned, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (c0 < mat_stride) { // Final cols
|
||||||
|
AccumulatePartialDotProducts<kVecEO>(df, mat, mat_ofs, mat_stride, r0, c0,
|
||||||
|
num_rows, mat_stride - c0, vec_aligned,
|
||||||
|
out);
|
||||||
|
}
|
||||||
|
}

template <bool kVecIsEvenOdd, bool kAdd, size_t kOuter, size_t kInner,
          typename ArrayT, typename VecT, typename AddT>
HWY_INLINE void MatVecAddInner(const ArrayT& mat, const size_t mat_ofs,
                               const VecT* HWY_RESTRICT const vec_aligned,
                               const AddT* HWY_RESTRICT const add,
                               float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
  const hn::ScalableTag<float> df;
  constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
  constexpr size_t kNumStrips = kOuter / kRowsPerStrip;

  // For each entire strip.
  pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
    PROFILER_ZONE("MatVec.lambda");
    const size_t r0 = strip * kRowsPerStrip;
    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
        df, mat, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add,
        out + r0);
  });

  // Remaining rows
  const size_t r0 = kNumStrips * kRowsPerStrip;
  if (r0 < kOuter) {
    PROFILER_ZONE("MatVec remainder");
    const size_t num_rows = kOuter - r0;
    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
        df, mat, mat_ofs, kInner, r0, num_rows, vec_aligned, add, out + r0);
  }
}

}  // namespace detail

// Computes the dot products of rows with `vec_aligned`, adds the values from
// `add` (if kAdd), and stores the results to `out`.
template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
          typename VecT, typename AddT>
HWY_INLINE void MatVecT(const ArrayT& mat, const size_t mat_ofs,
                        const VecT* HWY_RESTRICT const vec_aligned,
                        const AddT* HWY_RESTRICT const add,
                        float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
                        hwy::ThreadPool& pool) {
  PROFILER_ZONE("MatVecAdd");

#if !defined(HWY_NATIVE_DOT_BF16) || !HWY_NATIVE_DOT_BF16
  using MatT = typename ArrayT::value_type;
  // Sfp -> float does not benefit enough to recoup the cost of ToEvenOddF32.
  if constexpr (CompressTraits<MatT>::kSupportsEvenOdd &&
                hwy::IsSameEither<VecT, float, hwy::bfloat16_t>() &&
                !(hwy::IsSame<MatT, SfpStream>() &&
                  hwy::IsSame<VecT, float>())) {
    ToEvenOddF32(vec_aligned, kInner, even_odd);
    detail::MatVecAddInner</*kVecIsEvenOdd=*/true, kAdd, kOuter, kInner>(
        mat, mat_ofs, even_odd, add, out, pool);
    return;
  }
#else
  (void)even_odd;
#endif

  detail::MatVecAddInner</*kVecIsEvenOdd=*/false, kAdd, kOuter, kInner>(
      mat, mat_ofs, vec_aligned, add, out, pool);
}

// With addition
template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT,
          typename AddT>
HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
                          const VecT* HWY_RESTRICT const vec_aligned,
                          const AddT* HWY_RESTRICT const add,
                          float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
                          hwy::ThreadPool& pool) {
  return MatVecT</*kAdd=*/true, kOuter, kInner>(mat, mat_ofs, vec_aligned, add,
                                                even_odd, out, pool);
}
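
A hedged usage sketch (dimensions are arbitrary; this mirrors TestMatVecAdd in the new test file below, including its GenerateMat/GenerateVec helpers). The caller supplies `even_odd` scratch, sized here as in the test, which the even/odd path uses to hold the interleaved copy of the input vector:

constexpr size_t kOuter = 128 * 3;  // rows of mat; length of add/out
constexpr size_t kInner = 128 * 5;  // cols of mat; length of vec
hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads());
CompressedArray<float, kOuter * kInner> mat =
    GenerateMat<float, kOuter, kInner>(0, pool);  // test helper, defined below
hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
hwy::AlignedFreeUniquePtr<float[]> even_odd =
    hwy::AllocateAligned<float>(kInner * pool.NumWorkers());
hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(kOuter);
// Computes out[r] = add[r] + dot(row r of mat, vec) for r in [0, kOuter).
MatVecAdd<kOuter, kInner>(mat, /*mat_ofs=*/0, vec.get(), add.get(),
                          even_odd.get(), out.get(), pool);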

// Without addition
template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
                       const VecT* HWY_RESTRICT const vec_aligned,
                       float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
                       hwy::ThreadPool& pool) {
  MatVecT</*kAdd=*/false, kOuter, kInner>(mat, mat_ofs, vec_aligned,
                                          /*add=*/static_cast<VecT*>(nullptr),
                                          even_odd, out, pool);
}

// Two matrices, same vector
template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
          typename VecT, typename AddT>
HWY_NOINLINE void TwoMatVecT(const ArrayT& mat0, const ArrayT& mat1,
                             const size_t mat_ofs,
                             const VecT* HWY_RESTRICT vec_aligned,
                             const AddT* HWY_RESTRICT add0,
                             const AddT* HWY_RESTRICT add1,
                             float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
                             hwy::ThreadPool& pool) {
  PROFILER_ZONE("TwoMatVecAdd");

  const hn::ScalableTag<float> df;
  constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
  constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
  constexpr bool kVecIsEvenOdd = false;

  // For each entire strip.
  pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
    PROFILER_ZONE("TwoMatVec.lambda");
    const size_t r0 = strip * kRowsPerStrip;
    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
        df, mat0, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add0,
        out0 + r0);
    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
        df, mat1, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add1,
        out1 + r0);
  });

  // Remaining rows
  const size_t r0 = kNumStrips * kRowsPerStrip;
  if (r0 < kOuter) {
    PROFILER_ZONE("TwoMatVec remainder");
    const size_t num_rows = kOuter - r0;
    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
        df, mat0, mat_ofs, kInner, r0, num_rows, vec_aligned, add0, out0 + r0);
    detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
        df, mat1, mat_ofs, kInner, r0, num_rows, vec_aligned, add1, out1 + r0);
  }
}

// With addition
template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT,
          typename AddT>
HWY_NOINLINE void TwoMatVecAdd(
    const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs,
    const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
    const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
    float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
  return TwoMatVecT</*kAdd=*/true, kOuter, kInner>(
      mat0, mat1, mat_ofs, vec_aligned, add0, add1, out0, out1, pool);
}

// Without addition
template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
HWY_NOINLINE void TwoMatVec(const ArrayT& mat0, const ArrayT& mat1,
                            const size_t mat_ofs,
                            const VecT* HWY_RESTRICT vec_aligned,
                            float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
                            hwy::ThreadPool& pool) {
  TwoMatVecT</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
      mat0, mat1, mat_ofs, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
      out0, out1, pool);
}
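
Fusing both matrices into one strip loop is a bandwidth optimization: each strip reads `vec_aligned` once, while it is still hot in cache, and reuses it for the corresponding rows of both `mat0` and `mat1`, rather than streaming the vector over the full row range twice.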

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace gcpp
HWY_AFTER_NAMESPACE();

#endif  // NOLINT

199 ops/matvec_test.cc
@@ -0,0 +1,199 @@
// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HWY_DISABLED_TARGETS
// Exclude HWY_SCALAR due to 2x bf16 -> f32.
#define HWY_DISABLED_TARGETS HWY_SCALAR
#endif

#include <stddef.h>
#include <stdio.h>

#include <algorithm>
#include <array>
#include <cmath>
#include <memory>

#include "compression/compress.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "ops/matvec_test.cc"  // NOLINT
// clang-format on
#include "hwy/foreach_target.h"  // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
// After highway.h
#include "ops/matvec-inl.h"
#include "ops/ops-inl.h"  // MulByConst

HWY_BEFORE_NAMESPACE();
namespace gcpp {
namespace HWY_NAMESPACE {

template <size_t kOuter, size_t kInner>
hwy::AlignedFreeUniquePtr<float[]> SimpleMatVecAdd(
    const CompressedArray<float, kOuter * kInner>& mat,
    const hwy::AlignedFreeUniquePtr<float[]>& vec,
    const hwy::AlignedFreeUniquePtr<float[]>& add) {
  hwy::AlignedFreeUniquePtr<float[]> uncompressed_mat =
      hwy::AllocateAligned<float>(kOuter * kInner);
  hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(kOuter);
  HWY_ASSERT(uncompressed_mat && out);
  Decompress(mat, 0, uncompressed_mat.get(), kOuter * kInner);
  MulByConst(mat.scale(), uncompressed_mat.get(), kOuter * kInner);
  for (size_t idx_row = 0; idx_row < kOuter; idx_row++) {
    out[idx_row] = add[idx_row];
    for (size_t idx_col = 0; idx_col < kInner; idx_col++) {
      out[idx_row] +=
          uncompressed_mat[kInner * idx_row + idx_col] * vec[idx_col];
    }
  }
  return out;
}

template <typename MatT, size_t kOuter, size_t kInner>
CompressedArray<MatT, kOuter * kInner> GenerateMat(size_t offset,
                                                   hwy::ThreadPool& pool) {
  gcpp::CompressWorkingSet ws;
  CompressedArray<MatT, kOuter * kInner> mat;
  std::array<float, kOuter * kInner> content;
  const float scale = 1.0f / kInner;
  pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) {
    for (size_t j = 0; j < kInner; j++) {
      content[i * kInner + j] =
          static_cast<float>((i * kInner + j + offset) * scale);
    }
  });

  Compress(content, ws, mat, pool);
  mat.set_scale(1.9f);  // Arbitrary value, different from 1.
  return mat;
}

template <size_t length>
hwy::AlignedFreeUniquePtr<float[]> GenerateVec(size_t offset) {
  hwy::AlignedFreeUniquePtr<float[]> vec = hwy::AllocateAligned<float>(length);
  HWY_ASSERT(vec);
  for (size_t idx = 0; idx < length; idx++) {
    vec[idx] = static_cast<float>(idx + offset);
  }
  return vec;
}

template <size_t length>
void AssertClose(const hwy::AlignedFreeUniquePtr<float[]>& a,
                 const hwy::AlignedFreeUniquePtr<float[]>& b) {
  for (size_t idx = 0; idx < length; idx++) {
    const float rel_abs_delta = std::abs(a[idx] - b[idx]) /
                                std::max(std::abs(a[idx]), std::abs(b[idx]));
    EXPECT_LT(rel_abs_delta, 2e-6)
        << "a[" << idx << "]=" << a[idx] << ", b[" << idx << "]=" << b[idx];
  }
}

void TestMatVecAdd() {
  hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads());
  constexpr size_t kOuter = 128 * 3;
  constexpr size_t kInner = 128 * 5;
  CompressedArray<float, kOuter * kInner> mat =
      GenerateMat<float, kOuter, kInner>(0, pool);
  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
  hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
  hwy::AlignedFreeUniquePtr<float[]> even_odd =
      hwy::AllocateAligned<float>(kInner * pool.NumWorkers());
  hwy::AlignedFreeUniquePtr<float[]> expected_out =
      SimpleMatVecAdd<kOuter, kInner>(mat, vec, add);
  hwy::AlignedFreeUniquePtr<float[]> actual_out =
      hwy::AllocateAligned<float>(kOuter);
  HWY_ASSERT(vec && add && even_odd && expected_out && actual_out);
  MatVecAdd<kOuter, kInner>(mat, 0, vec.get(), add.get(), even_odd.get(),
                            actual_out.get(), pool);
  AssertClose<kOuter>(actual_out, expected_out);
}

void TestTwoMatVecAdd() {
  hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads());
  constexpr size_t kOuter = 128 * 3;
  constexpr size_t kInner = 128 * 5;
  CompressedArray<float, kOuter * kInner> mat0 =
      GenerateMat<float, kOuter, kInner>(0, pool);
  CompressedArray<float, kOuter * kInner> mat1 =
      GenerateMat<float, kOuter, kInner>(1, pool);
  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
  hwy::AlignedFreeUniquePtr<float[]> add0 = GenerateVec<kOuter>(0);
  hwy::AlignedFreeUniquePtr<float[]> add1 = GenerateVec<kOuter>(1);
  hwy::AlignedFreeUniquePtr<float[]> expected_out0 =
      SimpleMatVecAdd<kOuter, kInner>(mat0, vec, add0);
  hwy::AlignedFreeUniquePtr<float[]> expected_out1 =
      SimpleMatVecAdd<kOuter, kInner>(mat1, vec, add1);
  hwy::AlignedFreeUniquePtr<float[]> actual_out0 =
      hwy::AllocateAligned<float>(kOuter);
  hwy::AlignedFreeUniquePtr<float[]> actual_out1 =
      hwy::AllocateAligned<float>(kOuter);
  HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
             expected_out1 && actual_out1);
  TwoMatVecAdd<kOuter, kInner>(mat0, mat1, 0, vec.get(), add0.get(), add1.get(),
                               actual_out0.get(), actual_out1.get(), pool);
  AssertClose<kOuter>(actual_out0, expected_out0);
  AssertClose<kOuter>(actual_out1, expected_out1);
}

void TestTwoOfsMatVecAddLoop() {
  hwy::ThreadPool pool(hwy::ThreadPool::MaxThreads());
  constexpr size_t kOuter = 128 * 3;
  constexpr size_t kInner = 128 * 5;
  CompressedArray<float, kOuter * kInner> mat =
      GenerateMat<float, kOuter, kInner>(0, pool);
  hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
  hwy::AlignedFreeUniquePtr<float[]> add0 = GenerateVec<kOuter>(0);
  hwy::AlignedFreeUniquePtr<float[]> add1 = GenerateVec<kOuter>(1);
  hwy::AlignedFreeUniquePtr<float[]> expected_out0 =
      SimpleMatVecAdd<kOuter, kInner>(mat, vec, add0);
  hwy::AlignedFreeUniquePtr<float[]> expected_out1 =
      SimpleMatVecAdd<kOuter, kInner>(mat, vec, add1);
  hwy::AlignedFreeUniquePtr<float[]> actual_out0 =
      hwy::AllocateAligned<float>(kOuter);
  hwy::AlignedFreeUniquePtr<float[]> actual_out1 =
      hwy::AllocateAligned<float>(kOuter);
  HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
             expected_out1 && actual_out1);
  TwoOfsMatVecAddLoop<kOuter, kInner>(mat, 0, 0, vec.get(), add0.get(),
                                      add1.get(), actual_out0.get(),
                                      actual_out1.get());
  AssertClose<kOuter>(actual_out0, expected_out0);
  AssertClose<kOuter>(actual_out1, expected_out1);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace gcpp
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace gcpp {
HWY_BEFORE_TEST(MatVecTest);
HWY_EXPORT_AND_TEST_P(MatVecTest, TestMatVecAdd);
HWY_EXPORT_AND_TEST_P(MatVecTest, TestTwoMatVecAdd);
HWY_EXPORT_AND_TEST_P(MatVecTest, TestTwoOfsMatVecAddLoop);
HWY_AFTER_TEST();

}  // namespace gcpp

#endif