Compare commits

...

4 Commits

Author SHA1 Message Date
Sam Kaufman 6a78a23f4c Abstracted some MatVecAdd spec. dupes. 2024-04-29 16:23:38 -07:00
Sam Kaufman f608337fef Remove Bf16ToF32EO and use PromoteEvenTo and PromoteOddTo. 2024-04-29 14:13:07 -07:00
Sam Kaufman aa0b113214 (VecT*) to static_cast<VecT*>. 2024-04-29 12:53:47 -07:00
Sam Kaufman 5cb63346aa supports_eo -> kSupportsEvenOdd 2024-04-29 12:51:35 -07:00
2 changed files with 24 additions and 70 deletions

View File

@@ -51,20 +51,6 @@ namespace gcpp {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;
// Loads a vector of interleaved bf16 values from `in` and widens them to f32,
// splitting the result by lane parity: `v_even` receives the even-indexed bf16
// lanes and `v_odd` the odd-indexed ones, each as full f32 lanes.
// NOTE(review): per commit f608337fef in this diff, this helper was removed in
// favor of hn::PromoteEvenTo / hn::PromoteOddTo, which do the same job.
template <class DF, HWY_IF_F32_D(DF)>
HWY_INLINE void Bf16ToF32EO(const DF df32,
const hwy::bfloat16_t* HWY_RESTRICT in,
hn::Vec<DF>& v_even,
hn::Vec<DF>& v_odd) {
// Reinterpret the f32 tag as bf16 (twice the lanes) for the load, and as u32
// for the bit manipulation below.
const hn::Repartition<hwy::bfloat16_t, DF> dbf16;
const hn::RebindToUnsigned<decltype(df32)> du32;
// Mask that keeps only the upper 16 bits of each 32-bit lane.
const auto odd = Set(du32, 0xFFFF0000u);
// Each u32 lane now holds two adjacent bf16 values (even in the lower half,
// odd in the upper half on little-endian lane order — confirm for target).
const auto interleaved = BitCast(du32, LoadU(dbf16, in));
// A bf16 value is the upper 16 bits of the corresponding f32 bit pattern:
// shifting the even half up by 16 yields its f32; masking keeps the odd
// half already in f32 position.
v_even = BitCast(df32, hn::ShiftLeft<16>(interleaved));
v_odd = BitCast(df32, And(interleaved, odd));
}
// Enables generic code independent of compression type.
template <typename T> // primary, must specialize
struct CompressTraits {};
@@ -72,7 +58,7 @@ struct CompressTraits {};
template <>
struct CompressTraits<float> {
using MatT = float;
static constexpr bool supports_eo = false;
static constexpr bool kSupportsEvenOdd = false;
template <class DF, HWY_IF_F32_D(DF)>
static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in,
@@ -126,7 +112,7 @@ struct CompressTraits<float> {
template <>
struct CompressTraits<hwy::bfloat16_t> {
using MatT = hwy::bfloat16_t;
static constexpr bool supports_eo = true;
static constexpr bool kSupportsEvenOdd = true;
template <class DF, HWY_IF_F32_D(DF)>
static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in,
@@ -263,19 +249,19 @@ struct CompressTraits<hwy::bfloat16_t> {
VF32 be0, bo0, be1, bo1;
for (size_t i = 0; i < num; /* i += 2 * N */) {
const auto interleaved0 = hn::LoadU(dbf16, in + in_ofs + i);
const VF32 ae0 = Load(df32, vec_aligned + i);
const VF32 ao0 = Load(df32, vec_aligned + i + (N / 2));
Bf16ToF32EO(df32, in + in_ofs + i, be0, bo0);
sum0 = hn::MulAdd(ae0, hn::PromoteEvenTo(df32, interleaved0), sum0);
sum1 = hn::MulAdd(ao0, hn::PromoteOddTo(df32, interleaved0), sum1);
i += N;
sum0 = hn::MulAdd(ae0, be0, sum0);
sum1 = hn::MulAdd(ao0, bo0, sum1);
const auto interleaved1 = hn::LoadU(dbf16, in + in_ofs + i);
const VF32 ae1 = Load(df32, vec_aligned + i);
const VF32 ao1 = Load(df32, vec_aligned + i + (N / 2));
Bf16ToF32EO(df32, in + in_ofs + i, be1, bo1);
sum2 = hn::MulAdd(ae1, hn::PromoteEvenTo(df32, interleaved1), sum2);
sum3 = hn::MulAdd(ao1, hn::PromoteOddTo(df32, interleaved1), sum3);
i += N;
sum2 = hn::MulAdd(ae1, be1, sum2);
sum3 = hn::MulAdd(ao1, bo1, sum3);
}
sum0 = Add(sum0, sum1);
@@ -288,7 +274,7 @@ struct CompressTraits<hwy::bfloat16_t> {
template <>
struct CompressTraits<SfpStream> {
using MatT = SfpStream;
static constexpr bool supports_eo = false;
static constexpr bool kSupportsEvenOdd = false;
template <class DF, HWY_IF_F32_D(DF)>
static HWY_INLINE void Compress(DF df, const float* in, size_t num,
@@ -338,7 +324,7 @@ struct CompressTraits<SfpStream> {
template <>
struct CompressTraits<NuqStream> {
using MatT = NuqStream;
static constexpr bool supports_eo = false;
static constexpr bool kSupportsEvenOdd = false;
template <class DF, HWY_IF_F32_D(DF)>
static HWY_INLINE void Compress(DF df, const float* in, size_t num,

View File

@@ -104,11 +104,10 @@ HWY_INLINE void ToEvenOddF32(
HWY_DASSERT(size % hn::Lanes(dbf16) == 0);
HWY_DASSERT(hn::IsAligned(df, vec_aligned));
VF32 veven, vodd;
for (size_t i = 0; i < size; i += hn::Lanes(dbf16)) {
Bf16ToF32EO(df, vec_aligned + i, veven, vodd);
hn::Store(veven, df, out + i);
hn::Store(vodd, df, out + i + hn::Lanes(df));
const auto interleaved = hn::LoadU(dbf16, vec_aligned + i);
hn::Store(hn::PromoteEvenTo(df, interleaved), df, out + i);
hn::Store(hn::PromoteOddTo(df, interleaved), df, out + i + hn::Lanes(df));
}
}
@@ -181,7 +180,7 @@ HWY_INLINE void MatVecLoop(const ArrayT& mat, const size_t mat_ofs,
const VecT* HWY_RESTRICT vec_aligned,
float* HWY_RESTRICT out) {
MatVecAddLoop<false, kOuter, kInner>(
mat, mat_ofs, vec_aligned, /*add=*/(VecT*)nullptr, out);
mat, mat_ofs, vec_aligned, /*add=*/static_cast<VecT*>(nullptr), out);
}
// Simple version without tiling nor threading, but two offsets/outputs.
@@ -340,47 +339,15 @@ HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
// A specialization of MatVecAdd to float32 vectors which first rearranges the
// vector to even-odd layout.
template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
typename AddT,
std::enable_if_t<CompressTraits<typename ArrayT::value_type>::supports_eo, bool> = true>
typename VecT, typename AddT,
std::enable_if_t<
std::is_same_v<VecT, float> || std::is_same_v<VecT, hwy::bfloat16_t>>
= true,
std::enable_if_t<
CompressTraits<typename ArrayT::value_type>::kSupportsEvenOdd, bool>
= true>
HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
const float* HWY_RESTRICT const vec_aligned,
const AddT* HWY_RESTRICT const add,
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
PROFILER_ZONE("MatVecAdd");
const hn::ScalableTag<float> df;
constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
const auto vec_dequant = hwy::AllocateAligned<float>(kInner);
ToEvenOddF32(vec_aligned, kInner, vec_dequant.get());
// For each entire strip.
pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
PROFILER_ZONE("MatVec.lambda");
const size_t r0 = strip * kRowsPerStrip;
detail::FullDotProductsForStrip<true, kAdd>(
df, mat, mat_ofs, kInner, r0, kRowsPerStrip, vec_dequant.get(), add,
out + r0);
});
// Remaining rows
const size_t r0 = kNumStrips * kRowsPerStrip;
if (r0 < kOuter) {
PROFILER_ZONE("MatVec remainder");
const size_t num_rows = kOuter - r0;
detail::FullDotProductsForStrip<true, kAdd>(
df, mat, mat_ofs, kInner, r0, num_rows, vec_dequant.get(), add, out + r0);
}
}
// A specialization of MatVecAdd to bf16 vectors which first rearranges the
// vector to even-odd layout.
template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
typename AddT,
std::enable_if_t<CompressTraits<typename ArrayT::value_type>::supports_eo, bool> = true>
HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
const hwy::bfloat16_t* HWY_RESTRICT const vec_aligned,
const VecT* HWY_RESTRICT const vec_aligned,
const AddT* HWY_RESTRICT const add,
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
PROFILER_ZONE("MatVecAdd");
@@ -416,7 +383,8 @@ HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
const VecT* HWY_RESTRICT const vec_aligned,
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
MatVecAdd<false, kOuter, kInner>(
mat, mat_ofs, vec_aligned, /*add=*/(VecT *)nullptr, out, pool);
mat, mat_ofs, vec_aligned, /*add=*/static_cast<VecT*>(nullptr), out,
pool);
}
template <class D, HWY_IF_F32_D(D)>