diff --git a/compression/compress-inl.h b/compression/compress-inl.h
index fbd054a..6d2e39f 100644
--- a/compression/compress-inl.h
+++ b/compression/compress-inl.h
@@ -224,7 +224,7 @@ struct CompressTraits {
   // Computes the dot product of an even-odd deinterleaved, f32 `vec_aligned`
   // and a column-major matrix `in`. `vec_aligned` should be aligned and
-  // alternate even-indexed `hn::Lanes(df32)` elements followed by odd-indexed
+  // alternate even-indexed `hn::Lanes(df32)` elements followed by odd-indexed
   // `hn::Lanes(df32)` elements.
   template
   static HWY_INLINE float DotEO(
@@ -464,17 +464,6 @@ HWY_INLINE void Decompress(const CompressedArray& compressed,
   fprintf(stderr, "Decompress %.1f MB/s\n", mbps);
 }
 
-// Returns dot product with `vec_aligned` of length `num`.
-template
-HWY_INLINE float Dot(DF df, const ArrayT& compressed, size_t compressed_ofs,
-                     const VecT* vec_aligned, size_t num) {
-  HWY_DASSERT(compressed_ofs + num <= compressed.size());
-  HWY_DASSERT(hn::IsAligned(df, vec_aligned));
-  using Traits = CompressTraits;
-  return Traits::Dot(df, compressed.size(), compressed.data(), compressed_ofs,
-                     vec_aligned, num);
-}
-
 // Returns dot product with `vec_aligned` of length `num`.
 template
 HWY_INLINE float Dot(DF df, const CompressedArray& compressed,
diff --git a/gemma/gemma.cc b/gemma/gemma.cc
index 6659923..e602aac 100644
--- a/gemma/gemma.cc
+++ b/gemma/gemma.cc
@@ -580,14 +580,13 @@ HWY_NOINLINE void GriffinRecurrent(
       gcpp::Activations::kModelDim;
   static constexpr size_t kConv1dWidth = TConfig::kConv1dWidth;
   static constexpr size_t kHeads = TConfig::kHeads;
-  static constexpr bool kAdd = true;
 
   // X / Y linear layers.
   for (size_t batch_idx = 0; batch_idx < num_tokens; ++batch_idx) {
     const size_t batch_offset = batch_idx * kModelDim;
     float* HWY_RESTRICT y = activations.griffin_y.data() + batch_offset;
     float* HWY_RESTRICT x = activations.griffin_x.data() + batch_offset;
-    TwoMatVecAdd(
+    TwoMatVecAdd(
         layer_weights->griffin.linear_x_w, layer_weights->griffin.linear_y_w,
         0, activations.pre_att_rms_out.data() + batch_offset,
         /*add0=*/layer_weights->griffin.linear_x_biases.data(),
@@ -649,12 +648,12 @@ HWY_NOINLINE void GriffinRecurrent(
     constexpr size_t kHeadDim = kModelDim / kHeads;
     constexpr size_t kMatrixSize = kHeadDim * kHeadDim;
     size_t head_offset = head * kHeadDim;
-    TwoOfsMatVecAddLoop(
+    TwoOfsMatVecAddLoop(
         layer_weights->griffin.gate_w, kMatrixSize * head,
         kMatrixSize * (kHeads + head), x + head_offset,
        /*add0=*/layer_weights->griffin.gate_biases.data() + head_offset,
        /*add1=*/layer_weights->griffin.gate_biases.data() + kModelDim +
-            head_offset,
+            head_offset,
        /*out0=*/gate_x + head_offset, /*out1=*/a + head_offset);
     Sigmoid(gate_x + head_offset, kHeadDim);
     Sigmoid(a + head_offset, kHeadDim);
@@ -692,7 +691,7 @@ HWY_NOINLINE void GriffinRecurrent(
     const size_t batch_offset = batch_idx * kModelDim;
     float* HWY_RESTRICT x = activations.griffin_x.data() + batch_offset;
     float* out_ptr = activations.att_post2.data() + batch_idx * kModelDim;
-    MatVecAdd(
+    MatVecAdd(
        layer_weights->griffin.linear_out_w, 0, x,
        layer_weights->griffin.linear_out_biases.data(),
        activations.even_odd.data(), out_ptr, pool);
@@ -825,7 +824,7 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t num_tokens, size_t layer,
         activations.att_out.data() + batch_idx * kHeads * kQKVDim;
     float* HWY_RESTRICT layer_out =
         activations.att_post2.data() + batch_idx * kModelDim;
-    MatVecAdd(
+    MatVecT(
         layer_weights->attn_vec_einsum_w, 0, att_out,
         layer_weights->attention_output_biases.data(),
         activations.even_odd.data(), layer_out, pool);
@@ -859,12 +858,12 @@ HWY_NOINLINE void FFW(Activations& activations,
     float* HWY_RESTRICT out_mul = out + kFFHiddenDim;
 
     // Same matrix, first and second half of rows. Could fuse into one MatVec.
-    MatVecAdd(
+    MatVecT(
         layer_weights->gating_einsum_w, kFFHiddenDim * kModelDim, vec,
         layer_weights->ffw_gating_biases.data() + kFFHiddenDim, even_odd,
         out_mul, pool);
 
     // Gate, will go through the nonlinearity.
-    MatVecAdd(
+    MatVecT(
         layer_weights->gating_einsum_w, 0, vec,
         layer_weights->ffw_gating_biases.data(), even_odd, out, pool);
@@ -879,7 +878,7 @@ HWY_NOINLINE void FFW(Activations& activations,
   for (size_t batch_idx = 0; batch_idx < num_tokens; ++batch_idx) {
     PROFILER_ZONE("Gen.FFW\\GatedGELU");
     const size_t hidden_offset = batch_idx * kFFHiddenDim * 2;
-    MatVecAdd(
+    MatVecT(
         layer_weights->linear_w, 0,
         activations.ffw_hidden.data() + hidden_offset,
         layer_weights->ffw_output_biases.data(), even_odd,
diff --git a/gemma/ops.h b/gemma/ops.h
index a520056..1be0170 100644
--- a/gemma/ops.h
+++ b/gemma/ops.h
@@ -25,7 +25,6 @@
 #include
 #include   // std::enable_if_t
 
-#include "compression/compress.h"  // CompressedArray
 #include "hwy/base.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/profiler.h"
@@ -139,68 +138,10 @@ HWY_INLINE void ToEvenOddF32(const float* HWY_RESTRICT vec_aligned,
   }
 }
 
-// Simple version without tiling nor threading.
-// even_odd is precomputed for the current thread.
-template
-HWY_INLINE void MatVecAddLoop(const ArrayT& mat, const size_t mat_ofs,
-                              const VecT* HWY_RESTRICT vec_aligned,
-                              const AddT* HWY_RESTRICT add,
-                              float* HWY_RESTRICT even_odd,
-                              float* HWY_RESTRICT out) {
-  PROFILER_ZONE("MatVecAddLoop");
-  const hn::ScalableTag df;
-
-  for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
-    const size_t row_ofs = mat_ofs + idx_row * kInner;
-    if constexpr (kAdd) {
-      out[idx_row] = hwy::ConvertScalarTo(add[idx_row]) +
-                     Dot(df, mat, row_ofs, vec_aligned, kInner);
-    } else {
-      out[idx_row] = Dot(df, mat, row_ofs, vec_aligned, kInner);
-    }
-  }
-}
-
-#if !defined(HWY_NATIVE_DOT_BF16) || !HWY_NATIVE_DOT_BF16
-template
-HWY_INLINE void MatVecAddLoop(
-    const CompressedArray& mat,
-    const size_t mat_ofs, const VecT* HWY_RESTRICT vec_aligned,
-    const AddT* HWY_RESTRICT add, float* HWY_RESTRICT even_odd,
-    float* HWY_RESTRICT out) {
-  PROFILER_ZONE("MatVecAddLoop");
-  constexpr bool kVecIsEvenOdd = true;
-
-  const hn::ScalableTag df;
-  ToEvenOddF32(vec_aligned, kInner, even_odd);
-  for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
-    const size_t row_ofs = mat_ofs + idx_row * kInner;
-    if constexpr (kAdd) {
-      out[idx_row] = hwy::ConvertScalarTo(add[idx_row]) +
-                     Dot(df, mat, row_ofs, even_odd, kInner);
-    } else {
-      out[idx_row] = Dot(df, mat, row_ofs, even_odd, kInner);
-    }
-  }
-}
-#endif
-
-// even_odd is precomputed for the current thread.
-template
-HWY_INLINE void MatVecLoop(const ArrayT& mat, const size_t mat_ofs,
-                           const VecT* HWY_RESTRICT vec_aligned,
-                           float* HWY_RESTRICT even_odd,
-                           float* HWY_RESTRICT out) {
-  MatVecAddLoop(
-      mat, mat_ofs, vec_aligned, /*add=*/static_cast(nullptr), even_odd,
-      out);
-}
-
-// Simple version without tiling nor threading, but two offsets/outputs.
-template
+// Simple version without tiling nor threading, but two offsets/outputs and
+// always with addition.
+template
 HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0,
                                     const size_t mat_ofs1,
                                     const VecT* HWY_RESTRICT vec_aligned,
@@ -208,36 +149,20 @@ HWY_INLINE void TwoOfsMatVecAddLoop(const ArrayT& mat, const size_t mat_ofs0,
                                     const AddT* HWY_RESTRICT add1,
                                     float* HWY_RESTRICT out0,
                                     float* HWY_RESTRICT out1) {
-  PROFILER_ZONE("MatVecLoop");
+  PROFILER_ZONE("TwoOfsMatVecAddLoop");
+  constexpr bool kVecEO = false;
   const hn::ScalableTag df;
 
   for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
     const size_t row_ofs0 = mat_ofs0 + (idx_row)*kInner;
     const size_t row_ofs1 = mat_ofs1 + (idx_row)*kInner;
-    if constexpr (kAdd) {
-      out0[idx_row] = hwy::ConvertScalarTo(add0[idx_row]) +
-                      Dot(df, mat, row_ofs0, vec_aligned, kInner);
-      out1[idx_row] = hwy::ConvertScalarTo(add1[idx_row]) +
-                      Dot(df, mat, row_ofs1, vec_aligned, kInner);
-    } else {
-      out0[idx_row] = Dot(df, mat, row_ofs0, vec_aligned, kInner);
-      out1[idx_row] = Dot(df, mat, row_ofs1, vec_aligned, kInner);
-    }
+    out0[idx_row] = hwy::ConvertScalarTo(add0[idx_row]) +
+                    Dot(df, mat, row_ofs0, vec_aligned, kInner);
+    out1[idx_row] = hwy::ConvertScalarTo(add1[idx_row]) +
+                    Dot(df, mat, row_ofs1, vec_aligned, kInner);
   }
 }
 
-// Simple version without tiling nor threading, but two offsets/outputs.
-template
-HWY_INLINE void TwoOfsMatVecLoop(const ArrayT& mat, const size_t mat_ofs0,
-                                 const size_t mat_ofs1,
-                                 const VecT* HWY_RESTRICT vec_aligned,
-                                 float* HWY_RESTRICT out0,
-                                 float* HWY_RESTRICT out1) {
-  TwoOfsMatVecAddLoop(
-      mat, mat_ofs0, mat_ofs1, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
-      out0, out1);
-}
-
 namespace detail {
 
 // For each i = [0, num_rows), compute partial (length `num_cols`) dot product
@@ -323,21 +248,11 @@ template
   const hn::ScalableTag df;
   constexpr size_t kRowsPerStrip = RowsPerStrip();
   constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
 
-  // Sanity check: each thread can write without race conditions.
-  if (HWY_IS_TSAN) {
-    pool.Run(
-        0, pool.NumWorkers(), [even_odd](uint64_t /*task*/, size_t thread) {
-          even_odd[thread * kInner] = -static_cast(thread);
-          even_odd[thread * kInner + kInner - 1] = static_cast(thread);
-        });
-  }
-
   // For each entire strip.
   pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
     PROFILER_ZONE("MatVec.lambda");
@@ -361,14 +276,13 @@ HWY_INLINE void MatVecAddInner(const ArrayT& mat, const size_t mat_ofs,
 // Stores dot products of rows with `vec_aligned` + add the values from `add`
 // (if kAdd), then stores them to `out`.
-//
 template
-HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
-                          const VecT* HWY_RESTRICT const vec_aligned,
-                          const AddT* HWY_RESTRICT const add,
-                          float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
-                          hwy::ThreadPool& pool) {
+HWY_INLINE void MatVecT(const ArrayT& mat, const size_t mat_ofs,
+                        const VecT* HWY_RESTRICT const vec_aligned,
+                        const AddT* HWY_RESTRICT const add,
+                        float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
+                        hwy::ThreadPool& pool) {
   PROFILER_ZONE("MatVecAdd");
 
 #if !defined(HWY_NATIVE_DOT_BF16) || !HWY_NATIVE_DOT_BF16
@@ -376,23 +290,38 @@ HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
                 hwy::IsSameEither()) {
     ToEvenOddF32(vec_aligned, kInner, even_odd);
     detail::MatVecAddInner(
-        mat, mat_ofs, even_odd, add, even_odd, out, pool);
+        mat, mat_ofs, even_odd, add, out, pool);
     return;
   }
+#else
+  (void)even_odd;
 #endif
 
   detail::MatVecAddInner(
-      mat, mat_ofs, vec_aligned, add, even_odd, out, pool);
+      mat, mat_ofs, vec_aligned, add, out, pool);
 }
 
+// With addition
+template
+HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
+                          const VecT* HWY_RESTRICT const vec_aligned,
+                          const AddT* HWY_RESTRICT const add,
+                          float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
+                          hwy::ThreadPool& pool) {
+  return MatVecT(mat, mat_ofs, vec_aligned, add,
+                 even_odd, out, pool);
+}
+
+// Without addition
 template
 HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
                        const VecT* HWY_RESTRICT const vec_aligned,
                        float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
                        hwy::ThreadPool& pool) {
-  MatVecAdd(mat, mat_ofs, vec_aligned,
-            /*add=*/static_cast(nullptr),
-            even_odd, out, pool);
+  MatVecT(mat, mat_ofs, vec_aligned,
+          /*add=*/static_cast(nullptr),
+          even_odd, out, pool);
 }
 
 template
@@ -504,11 +433,13 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(float* HWY_RESTRICT x,
 
 // Two matrices, same vector
 template
-HWY_NOINLINE void TwoMatVecAdd(
-    const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs,
-    const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
-    const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
-    float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
+HWY_NOINLINE void TwoMatVecT(const ArrayT& mat0, const ArrayT& mat1,
+                             const size_t mat_ofs,
+                             const VecT* HWY_RESTRICT vec_aligned,
+                             const AddT* HWY_RESTRICT add0,
+                             const AddT* HWY_RESTRICT add1,
+                             float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
+                             hwy::ThreadPool& pool) {
   PROFILER_ZONE("TwoMatVecAdd");
 
   const hn::ScalableTag df;
@@ -540,13 +471,26 @@ HWY_NOINLINE void TwoMatVecAdd(
   }
 }
 
+// With addition
+template
+HWY_NOINLINE void TwoMatVecAdd(
+    const ArrayT& mat0, const ArrayT& mat1, const size_t mat_ofs,
+    const VecT* HWY_RESTRICT vec_aligned, const AddT* HWY_RESTRICT add0,
+    const AddT* HWY_RESTRICT add1, float* HWY_RESTRICT out0,
+    float* HWY_RESTRICT out1, hwy::ThreadPool& pool) {
+  return TwoMatVecT(
+      mat0, mat1, mat_ofs, vec_aligned, add0, add1, out0, out1, pool);
+}
+
+// Without addition
 template
 HWY_NOINLINE void TwoMatVec(const ArrayT& mat0, const ArrayT& mat1,
                             const size_t mat_ofs,
                             const VecT* HWY_RESTRICT vec_aligned,
                             float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
                             hwy::ThreadPool& pool) {
-  TwoMatVecAdd(
+  TwoMatVecT(
       mat0, mat1, mat_ofs, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
       out0, out1, pool);
 }
diff --git a/gemma/ops_test.cc b/gemma/ops_test.cc
index 75a09d0..9ad185a 100644
--- a/gemma/ops_test.cc
+++ b/gemma/ops_test.cc
@@ -517,8 +517,8 @@ void TestMatVecAdd() {
   hwy::AlignedFreeUniquePtr actual_out =
       hwy::AllocateAligned(kOuter);
   HWY_ASSERT(vec && add && even_odd && expected_out && actual_out);
-  MatVecAdd(
-      mat, 0, vec.get(), add.get(), even_odd.get(), actual_out.get(), pool);
+  MatVecAdd(mat, 0, vec.get(), add.get(), even_odd.get(),
+            actual_out.get(), pool);
   AssertClose(actual_out, expected_out);
 }
 
@@ -541,9 +541,8 @@ void TestTwoMatVecAdd() {
       hwy::AllocateAligned(kOuter);
   HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
              expected_out1 && actual_out1);
-  TwoMatVecAdd(mat0, mat1, 0, vec.get(), add0.get(),
-               add1.get(), actual_out0.get(),
-               actual_out1.get(), pool);
+  TwoMatVecAdd(mat0, mat1, 0, vec.get(), add0.get(), add1.get(),
+               actual_out0.get(), actual_out1.get(), pool);
   AssertClose(actual_out0, expected_out0);
   AssertClose(actual_out1, expected_out1);
 }
 
@@ -565,9 +564,9 @@ void TestTwoOfsMatVecAddLoop() {
       hwy::AllocateAligned(kOuter);
   HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
              expected_out1 && actual_out1);
-  TwoOfsMatVecAddLoop(mat, 0, 0, vec.get(), add0.get(),
-                      add1.get(), actual_out0.get(),
-                      actual_out1.get());
+  TwoOfsMatVecAddLoop(mat, 0, 0, vec.get(), add0.get(),
+                      add1.get(), actual_out0.get(),
+                      actual_out1.get());
   AssertClose(actual_out0, expected_out0);
   AssertClose(actual_out1, expected_out1);
 }
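
Note on the refactoring above: the sketch below illustrates the wrapper pattern this patch introduces, using simplified, hypothetical signatures. The real gemma.cpp functions operate on compressed weight arrays via the SIMD Dot() helpers and take an `even_odd` scratch buffer plus a hwy::ThreadPool; only the compile-time kAdd dispatch shape is shown here.

#include <cstddef>

// Generalized kernel: the compile-time kAdd flag selects whether a bias
// vector is added to each row's dot product.
template <bool kAdd, size_t kOuter, size_t kInner>
void MatVecT(const float* mat, const float* vec, const float* add,
             float* out) {
  for (size_t r = 0; r < kOuter; ++r) {
    float dot = 0.0f;
    for (size_t c = 0; c < kInner; ++c) {
      dot += mat[r * kInner + c] * vec[c];
    }
    if constexpr (kAdd) {
      out[r] = add[r] + dot;  // bias path
    } else {
      out[r] = dot;  // `add` may be nullptr here; it is never dereferenced
    }
  }
}

// With addition.
template <size_t kOuter, size_t kInner>
void MatVecAdd(const float* mat, const float* vec, const float* add,
               float* out) {
  MatVecT</*kAdd=*/true, kOuter, kInner>(mat, vec, add, out);
}

// Without addition.
template <size_t kOuter, size_t kInner>
void MatVec(const float* mat, const float* vec, float* out) {
  MatVecT</*kAdd=*/false, kOuter, kInner>(mat, vec, /*add=*/nullptr, out);
}

TwoMatVecT/TwoMatVecAdd/TwoMatVec follow the same split, and TwoOfsMatVecAddLoop now always adds its bias inputs. In the diff, GriffinRecurrent switches to the wrappers (dropping its local kAdd constant), while Attention and FFW call MatVecT directly.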