Abstracted some MatVecAdd spec. dupes.

Remove Bf16ToF32EO and use PromoteEvenTo and PromoteOddTo.
(VecT*) to static_cast<VecT*>.
2024-04-29 16:23:38 -07:00 · 2024-04-29 14:13:07 -07:00 · 2024-04-29 12:53:47 -07:00 · 2024-04-29 12:51:35 -07:00
2 changed files with 24 additions and 70 deletions
--- a/compression/compress-inl.h
+++ b/compression/compress-inl.h
@ -51,20 +51,6 @@ namespace gcpp {
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;

-template <class DF, HWY_IF_F32_D(DF)>
-HWY_INLINE void Bf16ToF32EO(const DF df32,
-                            const hwy::bfloat16_t* HWY_RESTRICT in,
-                            hn::Vec<DF>& v_even,
-                            hn::Vec<DF>& v_odd) {
-  const hn::Repartition<hwy::bfloat16_t, DF> dbf16;
-  const hn::RebindToUnsigned<decltype(df32)> du32;
-
-  const auto odd = Set(du32, 0xFFFF0000u);
-  const auto interleaved = BitCast(du32, LoadU(dbf16, in));
-  v_even = BitCast(df32, hn::ShiftLeft<16>(interleaved));
-  v_odd = BitCast(df32, And(interleaved, odd));
-}
-
 // Enables generic code independent of compression type.
 template <typename T>  // primary, must specialize
 struct CompressTraits {};
@ -72,7 +58,7 @@ struct CompressTraits {};
 template <>
 struct CompressTraits<float> {
  using MatT = float;
-  static constexpr bool supports_eo = false;
+  static constexpr bool kSupportsEvenOdd = false;

  template <class DF, HWY_IF_F32_D(DF)>
  static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in,
@ -126,7 +112,7 @@ struct CompressTraits<float> {
 template <>
 struct CompressTraits<hwy::bfloat16_t> {
  using MatT = hwy::bfloat16_t;
-  static constexpr bool supports_eo = true;
+  static constexpr bool kSupportsEvenOdd = true;

  template <class DF, HWY_IF_F32_D(DF)>
  static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in,
@ -263,19 +249,19 @@ struct CompressTraits<hwy::bfloat16_t> {

    VF32 be0, bo0, be1, bo1;
    for (size_t i = 0; i < num; /* i += 2 * N */) {
+      const auto interleaved0 = hn::LoadU(dbf16, in + in_ofs + i);
      const VF32 ae0 = Load(df32, vec_aligned + i);
      const VF32 ao0 = Load(df32, vec_aligned + i + (N / 2));
-      Bf16ToF32EO(df32, in + in_ofs + i, be0, bo0);
+      sum0 = hn::MulAdd(ae0, hn::PromoteEvenTo(df32, interleaved0), sum0);
+      sum1 = hn::MulAdd(ao0, hn::PromoteOddTo(df32, interleaved0), sum1);
      i += N;
-      sum0 = hn::MulAdd(ae0, be0, sum0);
-      sum1 = hn::MulAdd(ao0, bo0, sum1);

+      const auto interleaved1 = hn::LoadU(dbf16, in + in_ofs + i);
      const VF32 ae1 = Load(df32, vec_aligned + i);
      const VF32 ao1 = Load(df32, vec_aligned + i + (N / 2));
-      Bf16ToF32EO(df32, in + in_ofs + i, be1, bo1);
+      sum2 = hn::MulAdd(ae1, hn::PromoteEvenTo(df32, interleaved1), sum2);
+      sum3 = hn::MulAdd(ao1, hn::PromoteOddTo(df32, interleaved1), sum3);
      i += N;
-      sum2 = hn::MulAdd(ae1, be1, sum2);
-      sum3 = hn::MulAdd(ao1, bo1, sum3);
    }

    sum0 = Add(sum0, sum1);
@ -288,7 +274,7 @@ struct CompressTraits<hwy::bfloat16_t> {
 template <>
 struct CompressTraits<SfpStream> {
  using MatT = SfpStream;
-  static constexpr bool supports_eo = false;
+  static constexpr bool kSupportsEvenOdd = false;

  template <class DF, HWY_IF_F32_D(DF)>
  static HWY_INLINE void Compress(DF df, const float* in, size_t num,
@ -338,7 +324,7 @@ struct CompressTraits<SfpStream> {
 template <>
 struct CompressTraits<NuqStream> {
  using MatT = NuqStream;
-  static constexpr bool supports_eo = false;
+  static constexpr bool kSupportsEvenOdd = false;

  template <class DF, HWY_IF_F32_D(DF)>
  static HWY_INLINE void Compress(DF df, const float* in, size_t num,
--- a/gemma/ops.h
+++ b/gemma/ops.h
@ -104,11 +104,10 @@ HWY_INLINE void ToEvenOddF32(
  HWY_DASSERT(size % hn::Lanes(dbf16) == 0);
  HWY_DASSERT(hn::IsAligned(df, vec_aligned));

-  VF32 veven, vodd;
  for (size_t i = 0; i < size; i += hn::Lanes(dbf16)) {
-    Bf16ToF32EO(df, vec_aligned + i, veven, vodd);
-    hn::Store(veven, df, out + i);
-    hn::Store(vodd, df, out + i + hn::Lanes(df));
+    const auto interleaved = hn::LoadU(dbf16, vec_aligned + i);
+    hn::Store(hn::PromoteEvenTo(df, interleaved), df, out + i);
+    hn::Store(hn::PromoteOddTo(df, interleaved), df, out + i + hn::Lanes(df));
  }
 }

@ -181,7 +180,7 @@ HWY_INLINE void MatVecLoop(const ArrayT& mat, const size_t mat_ofs,
                           const VecT* HWY_RESTRICT vec_aligned,
                           float* HWY_RESTRICT out) {
  MatVecAddLoop<false, kOuter, kInner>(
-      mat, mat_ofs, vec_aligned, /*add=*/(VecT*)nullptr, out);
+      mat, mat_ofs, vec_aligned, /*add=*/static_cast<VecT*>(nullptr), out);
 }

 // Simple version without tiling nor threading, but two offsets/outputs.
@ -340,47 +339,15 @@ HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
 // A specialization of MatVecAdd to float32 vectors which first rearranges the
 // vector to even-odd layout.
 template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
-          typename AddT,
-          std::enable_if_t<CompressTraits<typename ArrayT::value_type>::supports_eo, bool> = true>
+          typename VecT, typename AddT,
+          std::enable_if_t<
+            std::is_same_v<VecT, float> || std::is_same_v<VecT, hwy::bfloat16_t>>
+            = true,
+          std::enable_if_t<
+            CompressTraits<typename ArrayT::value_type>::kSupportsEvenOdd, bool>
+            = true>
 HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
-                          const float* HWY_RESTRICT const vec_aligned,
-                          const AddT* HWY_RESTRICT const add,
-                          float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
-  PROFILER_ZONE("MatVecAdd");
-
-  const hn::ScalableTag<float> df;
-  constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
-  constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
-
-  const auto vec_dequant = hwy::AllocateAligned<float>(kInner);
-  ToEvenOddF32(vec_aligned, kInner, vec_dequant.get());
-
-  // For each entire strip.
-  pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
-    PROFILER_ZONE("MatVec.lambda");
-    const size_t r0 = strip * kRowsPerStrip;
-    detail::FullDotProductsForStrip<true, kAdd>(
-      df, mat, mat_ofs, kInner, r0, kRowsPerStrip, vec_dequant.get(), add,
-      out + r0);
-  });
-
-  // Remaining rows
-  const size_t r0 = kNumStrips * kRowsPerStrip;
-  if (r0 < kOuter) {
-    PROFILER_ZONE("MatVec remainder");
-    const size_t num_rows = kOuter - r0;
-    detail::FullDotProductsForStrip<true, kAdd>(
-      df, mat, mat_ofs, kInner, r0, num_rows, vec_dequant.get(), add, out + r0);
-  }
-}
-
-// A specialization of MatVecAdd to bf16 vectors which first rearranges the
-// vector to even-odd layout.
-template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
-          typename AddT,
-          std::enable_if_t<CompressTraits<typename ArrayT::value_type>::supports_eo, bool> = true>
-HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
-                          const hwy::bfloat16_t* HWY_RESTRICT const vec_aligned,
+                          const VecT* HWY_RESTRICT const vec_aligned,
                          const AddT* HWY_RESTRICT const add,
                          float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
  PROFILER_ZONE("MatVecAdd");
@ -416,7 +383,8 @@ HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
                       const VecT* HWY_RESTRICT const vec_aligned,
                       float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
  MatVecAdd<false, kOuter, kInner>(
-      mat, mat_ofs, vec_aligned, /*add=*/(VecT *)nullptr, out, pool);
+      mat, mat_ofs, vec_aligned, /*add=*/static_cast<VecT*>(nullptr), out,
+      pool);
 }

 template <class D, HWY_IF_F32_D(D)>
Author	SHA1	Message	Date
Sam Kaufman	6a78a23f4c	Abstracted some MatVecAdd spec. dupes.	2024-04-29 16:23:38 -07:00
Sam Kaufman	f608337fef	Remove Bf16ToF32EO and use PromoteEvenTo and PromoteOddTo.	2024-04-29 14:13:07 -07:00
Sam Kaufman	aa0b113214	(VecT*) to static_cast<VecT*>.	2024-04-29 12:53:47 -07:00
Sam Kaufman	5cb63346aa	supports_eo -> kSupportsEvenOdd	2024-04-29 12:51:35 -07:00