mirror of https://github.com/google/gemma.cpp.git
Fix RecurrentGemma (refs #166) - one Dot was ignoring scale.
Remove the extra Dot() overload. MatVecAdd always adds; use MatVecT<kAdd> if conditional.
Remove unused MatVecAddLoop and MatVecLoop. No longer tsan-verify even_odd.
PiperOrigin-RevId: 631377279
parent b5a9ade75f
commit f6d02b2870
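The renaming in this diff follows one pattern: a single kernel templated on kAdd, plus thin wrappers that fix the choice to "always add" or "never add". Below is a minimal scalar sketch of that pattern only - it is not the vectorized gemma/ops.h implementation, and the *Sketch names are hypothetical.

// Minimal scalar sketch (assumption: C++17 for `if constexpr`); illustrates
// the kAdd dispatch adopted in gemma/ops.h, nothing more.
#include <cstddef>

template <bool kAdd, size_t kOuter, size_t kInner>
void MatVecTSketch(const float* mat, const float* vec, const float* add,
                   float* out) {
  for (size_t r = 0; r < kOuter; ++r) {
    float dot = 0.0f;
    for (size_t c = 0; c < kInner; ++c) {
      dot += mat[r * kInner + c] * vec[c];
    }
    if constexpr (kAdd) {
      out[r] = add[r] + dot;  // bias path selected at compile time
    } else {
      out[r] = dot;
    }
  }
}

// Always adds, analogous to the MatVecAdd wrapper introduced below.
template <size_t kOuter, size_t kInner>
void MatVecAddSketch(const float* mat, const float* vec, const float* add,
                     float* out) {
  MatVecTSketch</*kAdd=*/true, kOuter, kInner>(mat, vec, add, out);
}

// Never adds, analogous to MatVec.
template <size_t kOuter, size_t kInner>
void MatVecSketch(const float* mat, const float* vec, float* out) {
  MatVecTSketch</*kAdd=*/false, kOuter, kInner>(mat, vec, /*add=*/nullptr, out);
}

With this shape, call sites that always pass a bias (e.g. the Griffin layers below) use the always-adding wrapper without a redundant kAdd constant, while sites whose bias depends on a config flag pass it as the kAdd template argument of the conditional kernel.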
@@ -464,17 +464,6 @@ HWY_INLINE void Decompress(const CompressedArray<MatT, kCapacity>& compressed,
   fprintf(stderr, "Decompress %.1f MB/s\n", mbps);
 }
 
-// Returns dot product with `vec_aligned` of length `num`.
-template <class DF, typename ArrayT, typename VecT>
-HWY_INLINE float Dot(DF df, const ArrayT& compressed, size_t compressed_ofs,
-                     const VecT* vec_aligned, size_t num) {
-  HWY_DASSERT(compressed_ofs + num <= compressed.size());
-  HWY_DASSERT(hn::IsAligned(df, vec_aligned));
-  using Traits = CompressTraits<typename ArrayT::value_type>;
-  return Traits::Dot(df, compressed.size(), compressed.data(), compressed_ofs,
-                     vec_aligned, num);
-}
-
 // Returns dot product with `vec_aligned` of length `num`.
 template <bool kVecEO, class DF, typename MatT, size_t kCapacity, typename VecT>
 HWY_INLINE float Dot(DF df, const CompressedArray<MatT, kCapacity>& compressed,
@@ -580,14 +580,13 @@ HWY_NOINLINE void GriffinRecurrent(
       gcpp::Activations<TConfig, kBatchSize>::kModelDim;
   static constexpr size_t kConv1dWidth = TConfig::kConv1dWidth;
   static constexpr size_t kHeads = TConfig::kHeads;
-  static constexpr bool kAdd = true;
 
   // X / Y linear layers.
   for (size_t batch_idx = 0; batch_idx < num_tokens; ++batch_idx) {
     const size_t batch_offset = batch_idx * kModelDim;
     float* HWY_RESTRICT y = activations.griffin_y.data() + batch_offset;
     float* HWY_RESTRICT x = activations.griffin_x.data() + batch_offset;
-    TwoMatVecAdd<kAdd, kModelDim, kModelDim>(
+    TwoMatVecAdd<kModelDim, kModelDim>(
        layer_weights->griffin.linear_x_w, layer_weights->griffin.linear_y_w, 0,
        activations.pre_att_rms_out.data() + batch_offset,
        /*add0=*/layer_weights->griffin.linear_x_biases.data(),
@@ -649,7 +648,7 @@ HWY_NOINLINE void GriffinRecurrent(
     constexpr size_t kHeadDim = kModelDim / kHeads;
     constexpr size_t kMatrixSize = kHeadDim * kHeadDim;
     size_t head_offset = head * kHeadDim;
-    TwoOfsMatVecAddLoop<kAdd, kHeadDim, kHeadDim>(
+    TwoOfsMatVecAddLoop<kHeadDim, kHeadDim>(
        layer_weights->griffin.gate_w, kMatrixSize * head,
        kMatrixSize * (kHeads + head), x + head_offset,
        /*add0=*/layer_weights->griffin.gate_biases.data() + head_offset,
@@ -692,7 +691,7 @@ HWY_NOINLINE void GriffinRecurrent(
     const size_t batch_offset = batch_idx * kModelDim;
     float* HWY_RESTRICT x = activations.griffin_x.data() + batch_offset;
     float* out_ptr = activations.att_post2.data() + batch_idx * kModelDim;
-    MatVecAdd<kAdd, kModelDim, kModelDim>(
+    MatVecAdd<kModelDim, kModelDim>(
        layer_weights->griffin.linear_out_w, 0, x,
        layer_weights->griffin.linear_out_biases.data(),
        activations.even_odd.data(), out_ptr, pool);
@@ -825,7 +824,7 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t num_tokens, size_t layer,
         activations.att_out.data() + batch_idx * kHeads * kQKVDim;
     float* HWY_RESTRICT layer_out =
         activations.att_post2.data() + batch_idx * kModelDim;
-    MatVecAdd<TConfig::kSoftmaxAttnOutputBiases, kModelDim, kQKVDim>(
+    MatVecT</*kAdd=*/TConfig::kSoftmaxAttnOutputBiases, kModelDim, kQKVDim>(
        layer_weights->attn_vec_einsum_w, 0, att_out,
        layer_weights->attention_output_biases.data(),
        activations.even_odd.data(), layer_out, pool);
@@ -859,12 +858,12 @@ HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
     float* HWY_RESTRICT out_mul = out + kFFHiddenDim;
 
     // Same matrix, first and second half of rows. Could fuse into one MatVec.
-    MatVecAdd<TConfig::kFFBiases, kFFHiddenDim, kModelDim>(
+    MatVecT</*kAdd=*/TConfig::kFFBiases, kFFHiddenDim, kModelDim>(
        layer_weights->gating_einsum_w, kFFHiddenDim * kModelDim, vec,
        layer_weights->ffw_gating_biases.data() + kFFHiddenDim, even_odd,
        out_mul, pool);
     // Gate, will go through the nonlinearity.
-    MatVecAdd<TConfig::kFFBiases, kFFHiddenDim, kModelDim>(
+    MatVecT</*kAdd=*/TConfig::kFFBiases, kFFHiddenDim, kModelDim>(
        layer_weights->gating_einsum_w, 0, vec,
        layer_weights->ffw_gating_biases.data(), even_odd, out, pool);
 
@@ -879,7 +878,7 @@ HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
   for (size_t batch_idx = 0; batch_idx < num_tokens; ++batch_idx) {
     PROFILER_ZONE("Gen.FFW\\GatedGELU");
     const size_t hidden_offset = batch_idx * kFFHiddenDim * 2;
-    MatVecAdd<TConfig::kFFBiases, kModelDim, kFFHiddenDim>(
+    MatVecT</*kAdd=*/TConfig::kFFBiases, kModelDim, kFFHiddenDim>(
        layer_weights->linear_w, 0,
        activations.ffw_hidden.data() + hidden_offset,
        layer_weights->ffw_output_biases.data(), even_odd,
gemma/ops.h
@@ -25,7 +25,6 @@
 #include <random>
 #include <type_traits>  // std::enable_if_t
 
-#include "compression/compress.h"  // CompressedArray
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/profiler.h"
@@ -139,68 +138,10 @@ HWY_INLINE void ToEvenOddF32(const float* HWY_RESTRICT vec_aligned,
   }
 }
 
-// Simple version without tiling nor threading.
-// even_odd is precomputed for the current thread.
-template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
-          typename VecT, typename AddT>
-HWY_INLINE void MatVecAddLoop(const ArrayT& mat, const size_t mat_ofs,
-                              const VecT* HWY_RESTRICT vec_aligned,
-                              const AddT* HWY_RESTRICT add,
-                              float* HWY_RESTRICT even_odd,
-                              float* HWY_RESTRICT out) {
-  PROFILER_ZONE("MatVecAddLoop");
-  const hn::ScalableTag<float> df;
-
-  for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
-    const size_t row_ofs = mat_ofs + idx_row * kInner;
-    if constexpr (kAdd) {
-      out[idx_row] = hwy::ConvertScalarTo<float>(add[idx_row]) +
-                     Dot(df, mat, row_ofs, vec_aligned, kInner);
-    } else {
-      out[idx_row] = Dot(df, mat, row_ofs, vec_aligned, kInner);
-    }
-  }
-}
-
-#if !defined(HWY_NATIVE_DOT_BF16) || !HWY_NATIVE_DOT_BF16
-template <bool kAdd, size_t kOuter, size_t kInner, typename VecT, typename AddT,
-          size_t kCapacity>
-HWY_INLINE void MatVecAddLoop(
-    const CompressedArray<hwy::bfloat16_t, kCapacity>& mat,
-    const size_t mat_ofs, const VecT* HWY_RESTRICT vec_aligned,
-    const AddT* HWY_RESTRICT add, float* HWY_RESTRICT even_odd,
-    float* HWY_RESTRICT out) {
-  PROFILER_ZONE("MatVecAddLoop");
-  constexpr bool kVecIsEvenOdd = true;
-
-  const hn::ScalableTag<float> df;
-  ToEvenOddF32(vec_aligned, kInner, even_odd);
-  for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
-    const size_t row_ofs = mat_ofs + idx_row * kInner;
-    if constexpr (kAdd) {
-      out[idx_row] = hwy::ConvertScalarTo<float>(add[idx_row]) +
-                     Dot<kVecIsEvenOdd>(df, mat, row_ofs, even_odd, kInner);
-    } else {
-      out[idx_row] = Dot<kVecIsEvenOdd>(df, mat, row_ofs, even_odd, kInner);
-    }
-  }
-}
-#endif
-
-// even_odd is precomputed for the current thread.
-template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
-HWY_INLINE void MatVecLoop(const ArrayT& mat, const size_t mat_ofs,
-                           const VecT* HWY_RESTRICT vec_aligned,
-                           float* HWY_RESTRICT even_odd,
-                           float* HWY_RESTRICT out) {
-  MatVecAddLoop</*kAdd=*/false, kOuter, kInner>(
-      mat, mat_ofs, vec_aligned, /*add=*/static_cast<VecT*>(nullptr), even_odd,
-      out);
-}
-
-// Simple version without tiling nor threading, but two offsets/outputs.
-template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
-          typename VecT, typename AddT>
+// Simple version without tiling nor threading, but two offsets/outputs and
+// always with addition.
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT,
+          typename AddT>
 HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0,
                                     const size_t mat_ofs1,
                                     const VecT* HWY_RESTRICT vec_aligned,
@@ -208,35 +149,19 @@ HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0,
                                     const AddT* HWY_RESTRICT add1,
                                     float* HWY_RESTRICT out0,
                                     float* HWY_RESTRICT out1) {
-  PROFILER_ZONE("MatVecLoop");
+  PROFILER_ZONE("TwoOfsMatVecAddLoop");
+  constexpr bool kVecEO = false;
   const hn::ScalableTag<float> df;
 
   for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
     const size_t row_ofs0 = mat_ofs0 + (idx_row)*kInner;
     const size_t row_ofs1 = mat_ofs1 + (idx_row)*kInner;
-    if constexpr (kAdd) {
-      out0[idx_row] = hwy::ConvertScalarTo<float>(add0[idx_row]) +
-                      Dot(df, mat, row_ofs0, vec_aligned, kInner);
-      out1[idx_row] = hwy::ConvertScalarTo<float>(add1[idx_row]) +
-                      Dot(df, mat, row_ofs1, vec_aligned, kInner);
-    } else {
-      out0[idx_row] = Dot(df, mat, row_ofs0, vec_aligned, kInner);
-      out1[idx_row] = Dot(df, mat, row_ofs1, vec_aligned, kInner);
-    }
+    out0[idx_row] = hwy::ConvertScalarTo<float>(add0[idx_row]) +
+                    Dot<kVecEO>(df, mat, row_ofs0, vec_aligned, kInner);
+    out1[idx_row] = hwy::ConvertScalarTo<float>(add1[idx_row]) +
+                    Dot<kVecEO>(df, mat, row_ofs1, vec_aligned, kInner);
   }
 }
 
-// Simple version without tiling nor threading, but two offsets/outputs.
-template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
-HWY_INLINE void TwoOfsMatVecLoop(const ArrayT& mat, const size_t mat_ofs0,
-                                 const size_t mat_ofs1,
-                                 const VecT* HWY_RESTRICT vec_aligned,
-                                 float* HWY_RESTRICT out0,
-                                 float* HWY_RESTRICT out1) {
-  TwoOfsMatVecAddLoop</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
-      mat, mat_ofs0, mat_ofs1, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
-      out0, out1);
-}
-
 namespace detail {
 
@@ -323,21 +248,11 @@ template <bool kVecIsEvenOdd, bool kAdd, size_t kOuter, size_t kInner,
 HWY_INLINE void MatVecAddInner(const ArrayT& mat, const size_t mat_ofs,
                                const VecT* HWY_RESTRICT const vec_aligned,
                                const AddT* HWY_RESTRICT const add,
-                               float* HWY_RESTRICT even_odd,
                                float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
   const hn::ScalableTag<float> df;
   constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
   constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
 
-  // Sanity check: each thread can write without race conditions.
-  if (HWY_IS_TSAN) {
-    pool.Run(
-        0, pool.NumWorkers(), [even_odd](uint64_t /*task*/, size_t thread) {
-          even_odd[thread * kInner] = -static_cast<float>(thread);
-          even_odd[thread * kInner + kInner - 1] = static_cast<float>(thread);
-        });
-  }
-
   // For each entire strip.
   pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
     PROFILER_ZONE("MatVec.lambda");
@@ -361,10 +276,9 @@ HWY_INLINE void MatVecAddInner(const ArrayT& mat, const size_t mat_ofs,
 
 // Stores dot products of rows with `vec_aligned` + add the values from `add`
 // (if kAdd), then stores them to `out`.
-//
 template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
           typename VecT, typename AddT>
-HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
+HWY_INLINE void MatVecT(const ArrayT& mat, const size_t mat_ofs,
                          const VecT* HWY_RESTRICT const vec_aligned,
                          const AddT* HWY_RESTRICT const add,
                          float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
@@ -376,21 +290,36 @@ HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
       hwy::IsSameEither<VecT, float, hwy::bfloat16_t>()) {
     ToEvenOddF32(vec_aligned, kInner, even_odd);
     detail::MatVecAddInner</*kVecIsEvenOdd=*/true, kAdd, kOuter, kInner>(
-        mat, mat_ofs, even_odd, add, even_odd, out, pool);
+        mat, mat_ofs, even_odd, add, out, pool);
     return;
   }
+#else
+  (void)even_odd;
 #endif
 
   detail::MatVecAddInner</*kVecIsEvenOdd=*/false, kAdd, kOuter, kInner>(
-      mat, mat_ofs, vec_aligned, add, even_odd, out, pool);
+      mat, mat_ofs, vec_aligned, add, out, pool);
 }
 
+// With addition
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT,
+          typename AddT>
+HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
+                          const VecT* HWY_RESTRICT const vec_aligned,
+                          const AddT* HWY_RESTRICT const add,
+                          float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
+                          hwy::ThreadPool& pool) {
+  return MatVecT</*kAdd=*/true, kOuter, kInner>(mat, mat_ofs, vec_aligned, add,
+                                                even_odd, out, pool);
+}
+
+// Without addition
 template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
 HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
                        const VecT* HWY_RESTRICT const vec_aligned,
                        float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
                        hwy::ThreadPool& pool) {
-  MatVecAdd</*kAdd=*/false, kOuter, kInner>(mat, mat_ofs, vec_aligned,
+  MatVecT</*kAdd=*/false, kOuter, kInner>(mat, mat_ofs, vec_aligned,
                                           /*add=*/static_cast<VecT*>(nullptr),
                                           even_odd, out, pool);
 }
@@ -504,11 +433,13 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(float* HWY_RESTRICT x,
 // Two matrices, same vector
 template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
           typename VecT, typename AddT>
-HWY_NOINLINE void TwoMatVecAdd(
-    const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs,
-    const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
-    const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
-    float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
+HWY_NOINLINE void TwoMatVecT(const ArrayT& mat0, const ArrayT& mat1,
+                             const size_t mat_ofs,
+                             const VecT* HWY_RESTRICT vec_aligned,
+                             const AddT* HWY_RESTRICT add0,
+                             const AddT* HWY_RESTRICT add1,
+                             float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
+                             hwy::ThreadPool& pool) {
   PROFILER_ZONE("TwoMatVecAdd");
 
   const hn::ScalableTag<float> df;
@@ -540,13 +471,26 @@ HWY_NOINLINE void TwoMatVecAdd(
   }
 }
 
+// With addition
+template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT,
+          typename AddT>
+HWY_NOINLINE void TwoMatVecAdd(
+    const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs,
+    const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
+    const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
+    float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
+  return TwoMatVecT</*kAdd=*/true, kOuter, kInner>(
+      mat0, mat1, mat_ofs, vec_aligned, add0, add1, out0, out1, pool);
+}
+
+// Without addition
 template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
 HWY_NOINLINE void TwoMatVec(const ArrayT& mat0, const ArrayT& mat1,
                             const size_t mat_ofs,
                             const VecT* HWY_RESTRICT vec_aligned,
                             float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
                             hwy::ThreadPool& pool) {
-  TwoMatVecAdd</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+  TwoMatVecT</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
       mat0, mat1, mat_ofs, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
       out0, out1, pool);
 }
@@ -517,8 +517,8 @@ void TestMatVecAdd() {
   hwy::AlignedFreeUniquePtr<float[]> actual_out =
       hwy::AllocateAligned<float>(kOuter);
   HWY_ASSERT(vec && add && even_odd && expected_out && actual_out);
-  MatVecAdd</*kAdd=*/true, kOuter, kInner>(
-      mat, 0, vec.get(), add.get(), even_odd.get(), actual_out.get(), pool);
+  MatVecAdd<kOuter, kInner>(mat, 0, vec.get(), add.get(), even_odd.get(),
+                            actual_out.get(), pool);
   AssertClose<kOuter>(actual_out, expected_out);
 }
@@ -541,9 +541,8 @@ void TestTwoMatVecAdd() {
       hwy::AllocateAligned<float>(kOuter);
   HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
              expected_out1 && actual_out1);
-  TwoMatVecAdd<true, kOuter, kInner>(mat0, mat1, 0, vec.get(), add0.get(),
-                                     add1.get(), actual_out0.get(),
-                                     actual_out1.get(), pool);
+  TwoMatVecAdd<kOuter, kInner>(mat0, mat1, 0, vec.get(), add0.get(), add1.get(),
+                               actual_out0.get(), actual_out1.get(), pool);
   AssertClose<kOuter>(actual_out0, expected_out0);
   AssertClose<kOuter>(actual_out1, expected_out1);
 }
@@ -565,7 +564,7 @@ void TestTwoOfsMatVecAddLoop() {
       hwy::AllocateAligned<float>(kOuter);
   HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
              expected_out1 && actual_out1);
-  TwoOfsMatVecAddLoop<true, kOuter, kInner>(mat, 0, 0, vec.get(), add0.get(),
+  TwoOfsMatVecAddLoop<kOuter, kInner>(mat, 0, 0, vec.get(), add0.get(),
                                       add1.get(), actual_out0.get(),
                                       actual_out1.get());
   AssertClose<kOuter>(actual_out0, expected_out0);