Add double-precision dot variant

PiperOrigin-RevId: 679243590
Jan Wassenberg 2024-09-26 12:08:22 -07:00 committed by Copybara-Service
parent 71116daf64
commit 47eb80a90e
3 changed files with 150 additions and 39 deletions


@@ -101,6 +101,19 @@ struct CompressTraits<float> {
raw1 = hn::LoadU(df, packed.ptr + packed_ofs + N);
}
template <class DD, HWY_IF_F64_D(DD), class VD = hn::Vec<DD>>
static HWY_INLINE void Load2(DD dd, const PackedSpan<const Packed>& packed,
const size_t packed_ofs, VD& raw0, VD& raw1) {
const hn::Rebind<float, DD> df;
using VF = hn::Vec<decltype(df)>;
const size_t NF = hn::Lanes(df);
// Two half loads are likely cheaper than one full + UpperHalf.
const VF f0 = hn::LoadU(df, packed.ptr + packed_ofs + 0 * NF);
const VF f1 = hn::LoadU(df, packed.ptr + packed_ofs + 1 * NF);
raw0 = hn::PromoteTo(dd, f0);
raw1 = hn::PromoteTo(dd, f1);
}
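
[Aside, not part of the commit] The comment above prefers two half-width loads; below is a sketch of the rejected alternative, written with Highway's PromoteLowerTo/PromoteUpperTo. The standalone name Load2ViaFullVector and the buffer `in` are hypothetical.
// Sketch only: one full-width float load, then promote each half.
// PromoteUpperTo typically needs an extra lane-crossing extract on x86,
// which is why the new overload above issues two half loads instead.
template <class DD, HWY_IF_F64_D(DD), class VD = hn::Vec<DD>>
HWY_INLINE void Load2ViaFullVector(DD dd, const float* HWY_RESTRICT in,
VD& raw0, VD& raw1) {
const hn::Repartition<float, DD> df;  // twice the lanes of dd
const hn::Vec<decltype(df)> f = hn::LoadU(df, in);
raw0 = hn::PromoteLowerTo(dd, f);
raw1 = hn::PromoteUpperTo(dd, f);
}
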
template <class DBF, HWY_IF_BF16_D(DBF)>
static HWY_INLINE void DecompressAndZeroPad(
DBF dbf, const PackedSpan<const Packed>& packed, const size_t packed_ofs,
@@ -149,6 +162,30 @@ struct CompressTraits<float> {
hn::StoreU(vf, df, raw + i); // adds zero padding
}
}
template <class DD, HWY_IF_F64_D(DD)>
static HWY_INLINE void DecompressAndZeroPad(
DD dd, const PackedSpan<const Packed>& packed, const size_t packed_ofs,
double* HWY_RESTRICT raw, size_t num) {
const hn::Rebind<float, DD> df;
using VF = hn::Vec<decltype(df)>;
using VD = hn::Vec<decltype(dd)>;
const size_t ND = hn::Lanes(dd);
size_t i = 0;
if (num >= ND) {
for (; i <= num - ND; i += ND) {
const VF vf = hn::LoadU(df, packed.ptr + packed_ofs + i);
hn::StoreU(hn::PromoteTo(dd, vf), dd, raw + i);
}
}
const size_t remaining = num - i;
HWY_DASSERT(remaining < ND);
if (HWY_UNLIKELY(remaining != 0)) {
const VF vf = hn::LoadN(df, packed.ptr + packed_ofs + i, remaining);
hn::StoreU(hn::PromoteTo(dd, vf), dd, raw + i); // adds zero padding
}
}
};
template <>
@@ -460,12 +497,23 @@ void Compress2(DF df, VF raw0, VF raw1, const PackedSpan<Packed>& packed,
Traits::Store2(df, raw0, raw1, packed, packed_ofs);
}
// Compile-time-only check that `DRaw` and `Packed` are compatible. This makes
// for better error messages than "no matching function found".
template <class DRaw, typename Packed>
HWY_INLINE void VerifyRawAndPacked() {
using TRaw = hn::TFromD<DRaw>;
constexpr bool kPackedF32 = hwy::IsSame<hwy::RemoveCvRef<Packed>, float>();
// We can decompress any Packed to f32 or BF16, or f32 to f64.
static_assert(hwy::IsSameEither<TRaw, float, BF16>() ||
(kPackedF32 && hwy::IsSame<TRaw, double>()));
}
// Decompresses from any type of `packed`, to two vectors of `float/BF16`, or
// `double`, if `Packed` is `float`.
template <class DRaw, typename Packed, class VRaw = hn::Vec<DRaw>>
HWY_INLINE void Decompress2(DRaw d, const PackedSpan<Packed>& packed,
const size_t packed_ofs, VRaw& raw0, VRaw& raw1) {
VerifyRawAndPacked<DRaw, Packed>();
packed.BoundsCheck(packed_ofs, 2 * hn::Lanes(d));
using Traits = CompressTraits<hwy::RemoveCvRef<Packed>>;
Traits::Load2(d, MakeConst(packed), packed_ofs, raw0, raw1);
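
[Aside, not part of the commit] A minimal usage sketch of the widened Decompress2: with a double tag, float weights are promoted on load. It assumes a float array `weights` with `num >= 2 * hn::Lanes(dd)` elements (hypothetical names); `MakeSpan` is the helper already used in this file.
// Usage sketch only.
const hn::ScalableTag<double> dd;
hn::Vec<decltype(dd)> w0, w1;
Decompress2(dd, MakeSpan(weights, num), /*packed_ofs=*/0, w0, w1);
// w0 and w1 now hold 2 * Lanes(dd) consecutive weights, promoted to double.
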
@@ -476,13 +524,14 @@ HWY_INLINE void Decompress2(DRaw d, const PackedSpan<Packed>& packed,
// required to round `num` up to one vector, if it is not already. The caller is
// responsible for scaling `raw` to the original range because `EmbedToken`
// also wants to scale the decompressed elements.
// `TRaw` can be `float/BF16`, or `double` if `Packed` is `float`.
template <class DRaw, typename Packed, typename TRaw = hn::TFromD<DRaw>>
HWY_NOINLINE void DecompressAndZeroPad(DRaw d, const PackedSpan<Packed>& packed,
const size_t packed_ofs, TRaw* raw,
size_t num) {
VerifyRawAndPacked<DRaw, Packed>();
packed.BoundsCheck(packed_ofs, num);
using Traits = CompressTraits<hwy::RemoveCvRef<Packed>>;
Traits::DecompressAndZeroPad(d, MakeConst(packed), packed_ofs, raw, num);
}
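
[Aside, not part of the commit] Sketch of the new f32 to f64 path through DecompressAndZeroPad, assuming a float array `in` of length `num` (hypothetical names). Per the comment above, the destination must be rounded up to a whole vector.
// Usage sketch only.
const hn::ScalableTag<double> dd;
const size_t padded = hwy::RoundUpTo(num, hn::Lanes(dd));
const auto out = hwy::AllocateAligned<double>(padded);
DecompressAndZeroPad(dd, MakeSpan(in, num), /*packed_ofs=*/0, out.get(), num);
// out[num, padded) is zero because the tail uses LoadN plus a full StoreU.
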
@@ -495,34 +544,38 @@ HWY_NOINLINE void DecompressAndZeroPad(DRaw d, const PackedSpan<Packed>& packed,
// `hwy/contrib/unroller`, but also supports compressed types with simpler
// remainder handling thanks to `DecompressAndZeroPad`.
//
// `D` can be BF16/float, or also double if `WeightT` and `VecT` are both float.
// `w` can be any packed type, including NUQ, which requires a separate `w_ofs`
// rather than pointer arithmetic. `vec` can also be any type, but typically
// float or BF16. We omit a `v_ofs` because it is 0 in our use cases.
// `num`, the number of elements to process, need not be a vector multiple.
//
// `kernel` is const& so we can pass an rvalue argument, but can contain
// mutable state, though not vectors (see highway.h). We pass in the four
// loaded vectors plus eight state vectors. The state vectors' lane type is
// either `double` (required for DotKernelDouble) or `float`.
template <class D, typename WeightT, typename VecT, class Kernel>
HWY_INLINE float DecompressAndCall(D d, const PackedSpan<const WeightT>& w,
const size_t w_ofs,
const PackedSpan<const VecT> vec,
const Kernel& kernel) {
// Decompressed inputs
using T = hn::TFromD<D>;
using V = hn::Vec<decltype(d)>;
V w0, w1, w2, w3, v0, v1, v2, v3;
// State for Kernel
using StateT = hwy::If<hwy::IsSame<T, double>(), double, float>;
const hn::Repartition<StateT, D> ds;
using VS = hn::Vec<decltype(ds)>;
VS sum0 = hn::Zero(ds);
VS sum1 = hn::Zero(ds);
VS sum2 = hn::Zero(ds);
VS sum3 = hn::Zero(ds);
VS comp0 = hn::Zero(ds);
VS comp1 = hn::Zero(ds);
VS comp2 = hn::Zero(ds);
VS comp3 = hn::Zero(ds);
const size_t N = hn::Lanes(d);
size_t i = 0;
@@ -541,7 +594,6 @@ HWY_INLINE float DecompressAndCall(D d, const PackedSpan<const WeightT>& w,
size_t remaining = vec.num - i;
HWY_DASSERT(remaining < 4 * N);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN T padded_w[4 * hn::MaxLanes(d)];
HWY_ALIGN T padded_v[4 * hn::MaxLanes(d)];
DecompressAndZeroPad(d, w, w_ofs + i, padded_w, remaining);
@@ -555,7 +607,7 @@ HWY_INLINE float DecompressAndCall(D d, const PackedSpan<const WeightT>& w,
}
}
return kernel.Reduce(ds, sum0, sum1, sum2, sum3, comp0, comp1, comp2, comp3);
}
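
[Aside, not part of the commit] For the all-float case, the kernel contract described above (four loaded vectors plus eight state vectors) reduces to the skeleton below; the real kernels (DotKernelKahan, and DotKernelDouble added further down) follow the same shape with extra compensation logic. The name SketchKernelFMA is hypothetical.
// Sketch only: plain FMA accumulation, compensation vectors unused.
struct SketchKernelFMA {
template <class DF, class VF = hn::Vec<DF>>
HWY_INLINE void Update4(DF, VF w0, VF w1, VF w2, VF w3, VF v0, VF v1, VF v2,
VF v3, VF& sum0, VF& sum1, VF& sum2, VF& sum3,
VF&, VF&, VF&, VF&) const {
sum0 = hn::MulAdd(w0, v0, sum0);
sum1 = hn::MulAdd(w1, v1, sum1);
sum2 = hn::MulAdd(w2, v2, sum2);
sum3 = hn::MulAdd(w3, v3, sum3);
}
template <class DF, class VF = hn::Vec<DF>>
HWY_INLINE void Update1(DF, VF w0, VF v0, VF& sum0, VF&) const {
sum0 = hn::MulAdd(w0, v0, sum0);
}
template <class DF, class VF = hn::Vec<DF>>
HWY_INLINE float Reduce(DF df, VF& sum0, VF& sum1, VF& sum2, VF& sum3,
VF&, VF&, VF&, VF&) const {
sum0 = hn::Add(hn::Add(sum0, sum1), hn::Add(sum2, sum3));
return hn::ReduceSum(df, sum0);
}
};
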
// Same as above, but single input array. Used by RMSNorm.
@@ -563,20 +615,22 @@ template <class D, typename VecT, class Kernel>
HWY_INLINE float DecompressAndCall(D d, const PackedSpan<const VecT> vec,
const Kernel& kernel) {
// Decompressed inputs
using T = hn::TFromD<D>;
using V = hn::Vec<decltype(d)>;
V v0, v1, v2, v3;
// State for Kernel
using StateT = hwy::If<hwy::IsSame<T, double>(), double, float>;
const hn::Repartition<StateT, D> ds;
using VS = hn::Vec<decltype(ds)>;
VS sum0 = hn::Zero(ds);
VS sum1 = hn::Zero(ds);
VS sum2 = hn::Zero(ds);
VS sum3 = hn::Zero(ds);
VS comp0 = hn::Zero(ds);
VS comp1 = hn::Zero(ds);
VS comp2 = hn::Zero(ds);
VS comp3 = hn::Zero(ds);
const size_t N = hn::Lanes(d);
size_t i = 0;
@@ -593,17 +647,17 @@ HWY_INLINE float DecompressAndCall(D d, const PackedSpan<const VecT> vec,
size_t remaining = vec.num - i;
HWY_DASSERT(remaining < 4 * N);
if (HWY_UNLIKELY(remaining != 0)) {
HWY_ALIGN T padded_v[4 * hn::MaxLanes(d)];
DecompressAndZeroPad(d, vec, i, padded_v, remaining);
// 1..4 whole vectors, possibly zero-padded.
for (size_t padded_pos = 0; padded_pos < remaining; padded_pos += N) {
const V v0 = hn::Load(d, padded_v + padded_pos);
kernel.Update1(d, v0, v0, sum0, comp0);
}
}
return kernel.Reduce(ds, sum0, sum1, sum2, sum3, comp0, comp1, comp2, comp3);
}
// Functor called for each tensor, which compresses and stores them along with


@@ -50,6 +50,8 @@ void ForeachRawType() {
// The argument selects the type to decode to: BF16 or float.
test(BF16());
test(float());
// Do not include double because it is not supported as an input type - we
// would also have to implement double -> Packed Compress().
}
template <template <class> class TestT> template <template <class> class TestT>


@@ -57,6 +57,7 @@ enum { // alphabetical order for consistency and to avoid implying a preference
kAddTwoSum,
kComp2,
kCompensated,
kDouble,
kKahan,
kNaive,
kOnlyTwoProd,
@@ -75,6 +76,8 @@ const char* VariantName(size_t variant) {
return "comp2";
case kCompensated:
return "comp";
case kDouble:
return "double";
case kKahan:
return "kahan";
case kNaive:
@@ -153,6 +156,43 @@ HWY_INLINE float DotNaive(D d, const PackedSpan<const WeightT>& w, size_t w_ofs,
return DecompressAndCall(d, w, w_ofs, MakeSpan(vec, num), DotKernelNaive());
}
struct DotKernelDouble {
template <class DD, class VD = hn::Vec<DD>, HWY_IF_F64_D(DD)>
HWY_INLINE void Update4(DD dd, const VD w0, const VD w1, const VD w2,
const VD w3, const VD v0, const VD v1, const VD v2,
const VD v3, VD& sum0, VD& sum1, VD& sum2, VD& sum3,
VD&, VD&, VD&, VD&) const {
sum0 = hn::MulAdd(w0, v0, sum0);
sum1 = hn::MulAdd(w1, v1, sum1);
sum2 = hn::MulAdd(w2, v2, sum2);
sum3 = hn::MulAdd(w3, v3, sum3);
}
template <class DD, class VD = hn::Vec<DD>, HWY_IF_F64_D(DD)>
HWY_INLINE void Update1(DD dd, const VD w0, const VD v0, VD& sum0,
VD&) const {
sum0 = hn::MulAdd(w0, v0, sum0);
}
template <class DD, class VD = hn::Vec<DD>, HWY_IF_F64_D(DD)>
HWY_INLINE float Reduce(DD dd, VD& sum0, VD& sum1, VD& sum2, VD& sum3, VD&,
VD&, VD&, VD&) const {
// Reduction tree: sum of all accumulators by pairs, then across lanes.
sum0 = hn::Add(sum0, sum1);
sum2 = hn::Add(sum2, sum3);
sum0 = hn::Add(sum0, sum2);
return static_cast<float>(hn::ReduceSum(dd, sum0));
}
};
template <class D, typename WeightT, typename VecT>
HWY_INLINE float DotDouble(D d, const PackedSpan<const WeightT>& w,
size_t w_ofs, const VecT* HWY_RESTRICT vec,
size_t num) {
const hn::Repartition<double, D> dd;
return DecompressAndCall(dd, w, w_ofs, MakeSpan(vec, num), DotKernelDouble());
}
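
[Aside, not part of the commit] Usage sketch for the new variant, assuming float arrays `w` and `v` of length `num` (hypothetical names): accumulation runs in double and the result is rounded back to float by Reduce.
// Usage sketch only.
const hn::ScalableTag<float> df;
const float dot = DotDouble(df, MakeSpan(w, num), /*w_ofs=*/0, v, num);
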
// https://en.wikipedia.org/wiki/Kahan_summation_algorithm: FastTwoSum.
struct DotKernelKahan {
template <class DF, class VF = hn::Vec<DF>>
@@ -533,9 +573,14 @@ HWY_INLINE float DotComp2(D d, const PackedSpan<const WeightT>& w, size_t w_ofs,
return DecompressAndCall(d, w, w_ofs, MakeSpan(vec, num), DotKernelComp2());
}
template <class D, typename WeightT, typename VecT, HWY_IF_F32_D(D)>
float CallDot(D d, size_t variant, const PackedSpan<const WeightT>& w,
size_t w_ofs, const VecT* HWY_RESTRICT v, size_t num) {
// float inputs also support kDouble.
if constexpr (hwy::IsSame<WeightT, float>() && hwy::IsSame<VecT, float>()) {
if (variant == kDouble) return DotDouble(d, w, 0, v, num);
}
switch (variant) {
case kAddTwoProd:
return DotTwoProdFast(d, w, 0, v, num);
@@ -720,9 +765,11 @@ class DotStats {
ASSERT_INSIDE(kComp2, 1.001f, s_muls[kComp2].Max(), 2.4f);
ASSERT_INSIDE(kComp2, 1.0, s_muls[kComp2].GeometricMean(), 1.2);
// Compensated and Double are very accurate.
ASSERT_LESS(kCompensated, s_muls[kCompensated].Min(), 1.0f + 2E-6f);
ASSERT_LESS(kCompensated, s_muls[kCompensated].Max(), 1.0f + 2E-5f);
ASSERT_LESS(kDouble, s_muls[kDouble].Min(), 1.0f + 2E-6f);
ASSERT_LESS(kDouble, s_muls[kDouble].Max(), 1.0f + 2E-5f);
// Naive and OnlyTwoProd are considerably worse. >10x is for narrower
// vectors, compared to AVX-512. GeometricMean overflows, must use Mean.
@@ -751,9 +798,11 @@ class DotStats {
ASSERT_INSIDE(kComp2, 1E-5, s_l1s[kComp2].Mean(), 9E-4);
ASSERT_INSIDE(kComp2, 1E-5f, s_l1s[kComp2].Max(), 2.6E-3f);
// Compensated and Double are very accurate.
HWY_ASSERT(s_l1s[kCompensated].Min() == 0.0f);
ASSERT_LESS(kCompensated, s_l1s[kCompensated].Max(), 3E-7f);
HWY_ASSERT(s_l1s[kDouble].Min() == 0.0f);
ASSERT_LESS(kDouble, s_l1s[kDouble].Max(), 3E-7f);
// Naive and OnlyTwoProd are considerably higher, but not huge.
ASSERT_INSIDE(kNaive, 1E-3, s_l1s[kNaive].Mean(), 2E-2);
@@ -778,9 +827,11 @@ class DotStats {
ASSERT_INSIDE(kComp2, 2E-4, s_rels[kComp2].GeometricMean(), 3.7E-3);
ASSERT_INSIDE(kComp2, 1E-5f, s_rels[kComp2].Max(), 0.4f);
// Compensated and Double are very accurate.
ASSERT_LESS(kCompensated, s_rels[kCompensated].Min(), 1E-8f);
ASSERT_LESS(kCompensated, s_rels[kCompensated].Max(), 8E-6f);
ASSERT_LESS(kDouble, s_rels[kDouble].Min(), 1E-8f);
ASSERT_LESS(kDouble, s_rels[kDouble].Max(), 8E-6f);
// Naive and OnlyTwoProd are considerably higher, but not huge.
ASSERT_INSIDE(kNaive, 1E-3, s_rels[kNaive].GeometricMean(), 8E-2);
@@ -807,8 +858,9 @@ class DotStats {
void CheckBwd() const {
ASSERT_INSIDE(kComp2, 7E-10f, s_rels[kComp2].Max(), 0.4f);
// Compensated and Double are very accurate.
ASSERT_LESS(kCompensated, s_rels[kCompensated].Max(), 8E-6f);
ASSERT_LESS(kDouble, s_rels[kDouble].Max(), 8E-6f);
// Naive and OnlyTwoProd are considerably higher than others
ASSERT_INSIDE(kNaive, 1.5E-8f, s_rels[kNaive].Max(), 3080.f);
@@ -828,6 +880,7 @@ class DotStats {
void CheckUlps() const {
ASSERT_LESS(kComp2, s_ulps[kCompensated].Max(), 3.6E6f);
ASSERT_LESS(kCompensated, s_ulps[kCompensated].Max(), 250.0f);
ASSERT_LESS(kDouble, s_ulps[kDouble].Max(), 250.0f);
ASSERT_LESS(kNaive, s_ulps[kNaive].Max(), 4E9f);
ASSERT_LESS(kOnlyTwoProd, s_ulps[kOnlyTwoProd].Max(), 3E9f);
ASSERT_LESS(kKahan, s_ulps[kKahan].Max(), 4E7f);
@@ -987,7 +1040,9 @@ struct TestShortDotsT {
const float dot_exact = ExactDot(raw_w.All(), raw_v.All(), num, buf);
float dots[kVariants];
for (size_t variant = 0; variant < kVariants; ++variant) {
// Here Packed is not always float, so we must not call kDouble.
const size_t actual = (variant == kDouble) ? kCompensated : variant;
dots[variant] = CallDot(df, actual, MakeConst(w), 0, v.ptr, num);
const float l1 = hwy::ScalarAbs(dots[variant] - dot_exact);
s_l1[variant].Notify(l1);