mirror of https://github.com/google/gemma.cpp.git
Add pairwise sum dot products for testing
Also add wrapper function for threshold comparison. PiperOrigin-RevId: 676749760
This commit is contained in:
parent
03f0ee2323
commit
bb6b398df3
346
ops/dot_test.cc
346
ops/dot_test.cc
|
|
@ -52,6 +52,65 @@ namespace gcpp {
|
||||||
namespace HWY_NAMESPACE {
|
namespace HWY_NAMESPACE {
|
||||||
namespace hn = hwy::HWY_NAMESPACE;
|
namespace hn = hwy::HWY_NAMESPACE;
|
||||||
|
|
||||||
|
enum { // alphabetical order for consistency and to avoid implying a preference
|
||||||
|
kAddTwoProd,
|
||||||
|
kAddTwoSum,
|
||||||
|
kComp2,
|
||||||
|
kCompensated,
|
||||||
|
kKahan,
|
||||||
|
kNaive,
|
||||||
|
kOnlyTwoProd,
|
||||||
|
kPairwise,
|
||||||
|
|
||||||
|
kVariants
|
||||||
|
};
|
||||||
|
|
||||||
|
const char* VariantName(size_t variant) {
|
||||||
|
switch (variant) {
|
||||||
|
case kAddTwoProd:
|
||||||
|
return "add2prod";
|
||||||
|
case kAddTwoSum:
|
||||||
|
return "add2sum";
|
||||||
|
case kComp2:
|
||||||
|
return "comp2";
|
||||||
|
case kCompensated:
|
||||||
|
return "comp";
|
||||||
|
case kKahan:
|
||||||
|
return "kahan";
|
||||||
|
case kNaive:
|
||||||
|
return "naive";
|
||||||
|
case kOnlyTwoProd:
|
||||||
|
return "only2prod";
|
||||||
|
case kPairwise:
|
||||||
|
return "pairwise";
|
||||||
|
default:
|
||||||
|
HWY_ABORT("Unknown variant %zu", variant);
|
||||||
|
return "?";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wrapper functions allow disabling HWY_ASSERT so that we see all failures in
|
||||||
|
// one run and can update all thresholds at once.
|
||||||
|
template <typename T>
|
||||||
|
void AssertInside(size_t variant, T min, T actual, T max, int line) {
|
||||||
|
if (!gcpp::IsInside(min, max, actual)) {
|
||||||
|
fprintf(stderr, "!!line %03d, %s actual %E not in [%E, %E]\n", line,
|
||||||
|
VariantName(variant), actual, min, max);
|
||||||
|
HWY_ASSERT(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void AssertLess(size_t variant, T actual, T max, int line) {
|
||||||
|
AssertInside(variant, hwy::LowestValue<T>(), actual, max, line);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define ASSERT_LESS(variant, actual, max) \
|
||||||
|
AssertLess(variant, actual, max, __LINE__)
|
||||||
|
|
||||||
|
#define ASSERT_INSIDE(variant, min, actual, max) \
|
||||||
|
AssertInside(variant, min, actual, max, __LINE__)
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Dot product variants
|
// Dot product variants
|
||||||
|
|
||||||
|
|
@ -87,6 +146,7 @@ struct DotKernelNaive {
|
||||||
return hn::ReduceSum(df, sum0);
|
return hn::ReduceSum(df, sum0);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class D, typename WeightT, typename VecT>
|
template <class D, typename WeightT, typename VecT>
|
||||||
HWY_INLINE float DotNaive(D d, const PackedSpan<const WeightT>& w, size_t w_ofs,
|
HWY_INLINE float DotNaive(D d, const PackedSpan<const WeightT>& w, size_t w_ofs,
|
||||||
const VecT* HWY_RESTRICT vec_aligned, size_t num) {
|
const VecT* HWY_RESTRICT vec_aligned, size_t num) {
|
||||||
|
|
@ -133,6 +193,7 @@ struct DotKernelKahan {
|
||||||
return ReduceCascadedSums(df, sum0, sum_err);
|
return ReduceCascadedSums(df, sum0, sum_err);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class D, typename WeightT, typename VecT>
|
template <class D, typename WeightT, typename VecT>
|
||||||
HWY_INLINE float DotKahan(D d, const PackedSpan<const WeightT>& w, size_t w_ofs,
|
HWY_INLINE float DotKahan(D d, const PackedSpan<const WeightT>& w, size_t w_ofs,
|
||||||
const VecT* HWY_RESTRICT vec_aligned, size_t num) {
|
const VecT* HWY_RESTRICT vec_aligned, size_t num) {
|
||||||
|
|
@ -195,6 +256,7 @@ struct DotKernelTwoProdFast {
|
||||||
return ReduceCascadedSums(df, sum0, comp0);
|
return ReduceCascadedSums(df, sum0, comp0);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class D, typename WeightT, typename VecT>
|
template <class D, typename WeightT, typename VecT>
|
||||||
HWY_INLINE float DotTwoProdFast(D d, const PackedSpan<const WeightT>& w,
|
HWY_INLINE float DotTwoProdFast(D d, const PackedSpan<const WeightT>& w,
|
||||||
size_t w_ofs,
|
size_t w_ofs,
|
||||||
|
|
@ -250,6 +312,7 @@ struct DotKernelMulTwoSum {
|
||||||
return ReduceCascadedSums(df, sum0, comp0);
|
return ReduceCascadedSums(df, sum0, comp0);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class D, typename WeightT, typename VecT>
|
template <class D, typename WeightT, typename VecT>
|
||||||
HWY_INLINE float DotMulTwoSum(D d, const PackedSpan<const WeightT>& w,
|
HWY_INLINE float DotMulTwoSum(D d, const PackedSpan<const WeightT>& w,
|
||||||
size_t w_ofs,
|
size_t w_ofs,
|
||||||
|
|
@ -304,6 +367,7 @@ struct DotKernelTwoProdAdd {
|
||||||
return ReduceCascadedSums(df, sum0, comp0);
|
return ReduceCascadedSums(df, sum0, comp0);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class D, typename WeightT, typename VecT>
|
template <class D, typename WeightT, typename VecT>
|
||||||
HWY_INLINE float DotTwoProdAdd(D d, const PackedSpan<const WeightT>& w,
|
HWY_INLINE float DotTwoProdAdd(D d, const PackedSpan<const WeightT>& w,
|
||||||
size_t w_ofs,
|
size_t w_ofs,
|
||||||
|
|
@ -313,35 +377,162 @@ HWY_INLINE float DotTwoProdAdd(D d, const PackedSpan<const WeightT>& w,
|
||||||
DotKernelTwoProdAdd());
|
DotKernelTwoProdAdd());
|
||||||
}
|
}
|
||||||
|
|
||||||
enum { // alphabetical order
|
// From "SIMDizing Pairwise Sums". Slower and generally higher error than
|
||||||
kAddTwoProd,
|
// Kahan, but uses fewer regs.
|
||||||
kAddTwoSum,
|
struct DotKernelPairwise {
|
||||||
kCompensated,
|
template <class DF, class VF = hn::Vec<DF>>
|
||||||
kKahan,
|
HWY_INLINE void Update4(DF df, const VF w0, const VF w1, const VF w2,
|
||||||
kNaive,
|
const VF w3, const VF v0, const VF v1, const VF v2,
|
||||||
kOnlyTwoProd,
|
const VF v3, VF& sum0, VF& sum1, VF& sum2, VF& sum3,
|
||||||
|
VF& comp0, VF& comp1, VF& comp2, VF& comp3) const {
|
||||||
|
const size_t N = hn::Lanes(df);
|
||||||
|
const VF prod0 = hn::Mul(w0, v0);
|
||||||
|
const VF prod2 = hn::Mul(w2, v2);
|
||||||
|
const VF prod1 = hn::MulAdd(w1, v1, prod0);
|
||||||
|
const VF prod3 = hn::MulAdd(w3, v3, prod2);
|
||||||
|
VF sum = hn::Add(prod1, prod3);
|
||||||
|
for (size_t bit = 4 * N; bit & num_; bit += bit, top_ -= N) {
|
||||||
|
HWY_DASSERT(top_ >= N);
|
||||||
|
HWY_DASSERT(top_ <= 32 * N);
|
||||||
|
sum = hn::Add(sum, hn::LoadU(df, stack_ + top_ - N));
|
||||||
|
}
|
||||||
|
hn::StoreU(sum, df, stack_ + top_);
|
||||||
|
top_ += N;
|
||||||
|
HWY_DASSERT(top_ <= 32 * N);
|
||||||
|
num_ += 4 * N;
|
||||||
|
}
|
||||||
|
|
||||||
kVariants
|
template <class DF, class VF = hn::Vec<DF>>
|
||||||
|
HWY_INLINE void Update1(DF df, const VF w0, const VF v0, VF& sum0,
|
||||||
|
VF& comp0) const {
|
||||||
|
const size_t N = hn::Lanes(df);
|
||||||
|
VF sum = hn::Mul(w0, v0);
|
||||||
|
for (size_t bit = N; bit & num_; bit += bit, top_ -= N) {
|
||||||
|
HWY_DASSERT(top_ >= N);
|
||||||
|
HWY_DASSERT(top_ <= 32 * N);
|
||||||
|
sum = hn::Add(sum, hn::LoadU(df, stack_ + top_ - N));
|
||||||
|
}
|
||||||
|
hn::StoreU(sum, df, stack_ + top_);
|
||||||
|
top_ += N;
|
||||||
|
HWY_DASSERT(top_ <= 32 * N);
|
||||||
|
num_ += N;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class DF, class VF = hn::Vec<DF>>
|
||||||
|
HWY_INLINE float Reduce(DF df, VF& sum0, VF& sum1, VF& sum2, VF& sum3,
|
||||||
|
VF& comp0, VF& comp1, VF& comp2, VF& comp3) const {
|
||||||
|
const size_t N = hn::Lanes(df);
|
||||||
|
sum0 = hn::Zero(df);
|
||||||
|
for (; top_ != 0; top_ -= N) {
|
||||||
|
sum0 = hn::Add(sum0, hn::LoadU(df, stack_ + top_ - N));
|
||||||
|
}
|
||||||
|
return hn::ReduceSum(df, sum0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
HWY_ALIGN mutable float stack_[32 * hn::MaxLanes(hn::ScalableTag<float>())];
|
||||||
|
mutable size_t top_ = 0;
|
||||||
|
mutable size_t num_ = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
const char* VariantName(size_t variant) {
|
template <class D, typename WeightT, typename VecT>
|
||||||
switch (variant) {
|
HWY_INLINE float DotPairwise(D d, const PackedSpan<const WeightT>& w,
|
||||||
case kAddTwoProd:
|
size_t w_ofs, const VecT* HWY_RESTRICT vec_aligned,
|
||||||
return "add2prod";
|
size_t num) {
|
||||||
case kAddTwoSum:
|
return DecompressAndCall(d, w, w_ofs, vec_aligned, num, DotKernelPairwise());
|
||||||
return "add2sum";
|
}
|
||||||
case kCompensated:
|
|
||||||
return "comp";
|
// Hybrid of Pairwise and Compensated. 1.14x time vs. Kahan, but geomean mul
|
||||||
case kKahan:
|
// is 1.02 vs 1.06, mean L1 is 1.21x better, and uses two fewer regs.
|
||||||
return "kahan";
|
struct DotKernelComp2 {
|
||||||
case kNaive:
|
template <class DF, class VF = hn::Vec<DF>, HWY_IF_F32_D(DF)>
|
||||||
return "naive";
|
HWY_INLINE void Update4(DF df, const VF w0, const VF w1, const VF w2,
|
||||||
case kOnlyTwoProd:
|
const VF w3, const VF v0, const VF v1, const VF v2,
|
||||||
return "only2prod";
|
const VF v3, VF& sum0, VF& /*sum1*/, VF& sum2,
|
||||||
default:
|
VF& /*sum3*/, VF& comp0, VF& comp1, VF& comp2,
|
||||||
HWY_ABORT("Unknown variant %zu", variant);
|
VF& comp3) const {
|
||||||
return "?";
|
VF perr0, perr1, perr2, perr3;
|
||||||
|
VF prod0 = TwoProducts(df, w0, v0, perr0);
|
||||||
|
VF prod1 = TwoProducts(df, w1, v1, perr1);
|
||||||
|
VF prod2 = TwoProducts(df, w2, v2, perr2);
|
||||||
|
VF prod3 = TwoProducts(df, w3, v3, perr3);
|
||||||
|
|
||||||
|
// Pairwise sums of prod* and perr*.
|
||||||
|
prod0 = hn::Add(prod0, prod1);
|
||||||
|
prod2 = hn::Add(prod2, prod3);
|
||||||
|
perr0 = hn::Add(perr0, perr1);
|
||||||
|
perr2 = hn::Add(perr2, perr3);
|
||||||
|
|
||||||
|
VF serr0, serr2;
|
||||||
|
sum0 = TwoSums(df, prod0, sum0, serr0);
|
||||||
|
sum2 = TwoSums(df, prod2, sum2, serr2);
|
||||||
|
|
||||||
|
comp0 = hn::Add(comp0, perr0);
|
||||||
|
comp1 = hn::Add(comp1, perr2);
|
||||||
|
comp2 = hn::Add(comp2, serr0);
|
||||||
|
comp3 = hn::Add(comp3, serr2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <class DBF, class VBF = hn::Vec<DBF>, HWY_IF_BF16_D(DBF),
|
||||||
|
class DF = hn::Repartition<float, DBF>, class VF = hn::Vec<DF>>
|
||||||
|
HWY_INLINE void Update4(DBF /*dbf*/, const VBF w0, const VBF w1, const VBF w2,
|
||||||
|
const VBF w3, const VBF v0, const VBF v1,
|
||||||
|
const VBF v2, const VBF v3, VF& sum0, VF& sum1,
|
||||||
|
VF& sum2, VF& sum3, VF& comp0, VF& comp1, VF& comp2,
|
||||||
|
VF& comp3) const {
|
||||||
|
const DF df;
|
||||||
|
VF prod0 = WidenMulPairwiseAdd(df, w0, v0);
|
||||||
|
VF prod1 = WidenMulPairwiseAdd(df, w1, v1);
|
||||||
|
VF prod2 = WidenMulPairwiseAdd(df, w2, v2);
|
||||||
|
VF prod3 = WidenMulPairwiseAdd(df, w3, v3);
|
||||||
|
|
||||||
|
// Pairwise sums
|
||||||
|
prod0 = hn::Add(prod0, prod1);
|
||||||
|
prod2 = hn::Add(prod2, prod3);
|
||||||
|
prod0 = hn::Add(prod0, prod2);
|
||||||
|
|
||||||
|
VF serr0;
|
||||||
|
sum0 = TwoSums(df, prod0, sum0, serr0);
|
||||||
|
comp0 = hn::Add(comp0, serr0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class DF, class VF = hn::Vec<DF>, HWY_IF_F32_D(DF)>
|
||||||
|
HWY_INLINE void Update1(DF df, const VF w0, const VF v0, VF& sum0,
|
||||||
|
VF& comp0) const {
|
||||||
|
VF perr0;
|
||||||
|
const VF prod0 = TwoProducts(df, w0, v0, perr0);
|
||||||
|
|
||||||
|
VF serr0;
|
||||||
|
sum0 = TwoSums(df, prod0, sum0, serr0);
|
||||||
|
|
||||||
|
comp0 = hn::Add(comp0, hn::Add(perr0, serr0));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class DBF, class VBF = hn::Vec<DBF>, HWY_IF_BF16_D(DBF),
|
||||||
|
class DF = hn::Repartition<float, DBF>, class VF = hn::Vec<DF>>
|
||||||
|
HWY_INLINE void Update1(DBF /*dbf*/, const VBF w0, const VBF v0, VF& sum0,
|
||||||
|
VF& comp0) const {
|
||||||
|
const DF df;
|
||||||
|
const VF prod0 = WidenMulPairwiseAdd(df, w0, v0);
|
||||||
|
|
||||||
|
VF serr0;
|
||||||
|
sum0 = TwoSums(df, prod0, sum0, serr0);
|
||||||
|
comp0 = hn::Add(comp0, serr0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class DF, class VF = hn::Vec<DF>>
|
||||||
|
HWY_INLINE float Reduce(DF df, VF& sum0, VF& sum1, VF& sum2, VF& sum3,
|
||||||
|
VF& comp0, VF& comp1, VF& comp2, VF& comp3) const {
|
||||||
|
AssimilateCascadedSums(df, sum2, comp2, sum0, comp0);
|
||||||
|
comp1 = hn::Add(comp1, comp3);
|
||||||
|
return ReduceCascadedSums(df, sum0, hn::Add(comp0, comp1));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template <class D, typename WeightT, typename VecT>
|
||||||
|
HWY_INLINE float DotComp2(D d, const PackedSpan<const WeightT>& w, size_t w_ofs,
|
||||||
|
const VecT* HWY_RESTRICT vec_aligned, size_t num) {
|
||||||
|
return DecompressAndCall(d, w, w_ofs, vec_aligned, num, DotKernelComp2());
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class D, typename WeightT, typename VecT>
|
template <class D, typename WeightT, typename VecT>
|
||||||
|
|
@ -352,6 +543,8 @@ float CallDot(D d, size_t variant, const PackedSpan<const WeightT>& w,
|
||||||
return DotTwoProdFast(d, w, 0, v, num);
|
return DotTwoProdFast(d, w, 0, v, num);
|
||||||
case kAddTwoSum:
|
case kAddTwoSum:
|
||||||
return DotMulTwoSum(d, w, 0, v, num);
|
return DotMulTwoSum(d, w, 0, v, num);
|
||||||
|
case kComp2:
|
||||||
|
return DotComp2(d, w, 0, v, num);
|
||||||
case kCompensated:
|
case kCompensated:
|
||||||
return DotCompensated(d, w, 0, v, num);
|
return DotCompensated(d, w, 0, v, num);
|
||||||
case kKahan:
|
case kKahan:
|
||||||
|
|
@ -360,6 +553,8 @@ float CallDot(D d, size_t variant, const PackedSpan<const WeightT>& w,
|
||||||
return DotNaive(d, w, 0, v, num);
|
return DotNaive(d, w, 0, v, num);
|
||||||
case kOnlyTwoProd:
|
case kOnlyTwoProd:
|
||||||
return DotTwoProdAdd(d, w, 0, v, num);
|
return DotTwoProdAdd(d, w, 0, v, num);
|
||||||
|
case kPairwise:
|
||||||
|
return DotPairwise(d, w, 0, v, num);
|
||||||
default:
|
default:
|
||||||
HWY_ABORT("Unknown variant %zu", variant);
|
HWY_ABORT("Unknown variant %zu", variant);
|
||||||
return 0.0f;
|
return 0.0f;
|
||||||
|
|
@ -496,60 +691,74 @@ class DotStats {
|
||||||
private:
|
private:
|
||||||
// Factor by which the approximate result is off; larger is worse.
|
// Factor by which the approximate result is off; larger is worse.
|
||||||
void CheckMuls() const {
|
void CheckMuls() const {
|
||||||
|
// Comp2 is between Compensated and Kahan.
|
||||||
|
ASSERT_INSIDE(kComp2, 1.001, s_muls[kComp2].Mean(), 1.3);
|
||||||
|
ASSERT_INSIDE(kComp2, 1.001f, s_muls[kComp2].Max(), 2.4f);
|
||||||
|
ASSERT_INSIDE(kComp2, 1.0, s_muls[kComp2].GeometricMean(), 1.2);
|
||||||
|
|
||||||
// Compensated is very accurate.
|
// Compensated is very accurate.
|
||||||
HWY_ASSERT(s_muls[kCompensated].Min() <= 1.0f + 2E-6f);
|
ASSERT_LESS(kCompensated, s_muls[kCompensated].Min(), 1.0f + 2E-6f);
|
||||||
HWY_ASSERT(s_muls[kCompensated].Max() <= 1.0f + 2E-5f);
|
ASSERT_LESS(kCompensated, s_muls[kCompensated].Max(), 1.0f + 2E-5f);
|
||||||
|
|
||||||
// Naive and OnlyTwoProd are considerably worse. >10x is for narrower
|
// Naive and OnlyTwoProd are considerably worse. >10x is for narrower
|
||||||
// vectors, compared to AVX-512. GeometricMean overflows, must use Mean.
|
// vectors, compared to AVX-512. GeometricMean overflows, must use Mean.
|
||||||
HWY_ASSERT(gcpp::IsInside(1.01, 16.0, s_muls[kNaive].Mean()));
|
ASSERT_INSIDE(kNaive, 1.01, s_muls[kNaive].Mean(), 16.0);
|
||||||
HWY_ASSERT(gcpp::IsInside(1.01, 13.0, s_muls[kOnlyTwoProd].Mean()));
|
ASSERT_INSIDE(kOnlyTwoProd, 1.01, s_muls[kOnlyTwoProd].Mean(), 13.0);
|
||||||
|
|
||||||
// Kahan (FastTwoSum) is decent:
|
// Kahan (FastTwoSum) is decent:
|
||||||
HWY_ASSERT(gcpp::IsInside(1.001, 4.1, s_muls[kKahan].Mean()));
|
ASSERT_INSIDE(kKahan, 1.001, s_muls[kKahan].Mean(), 4.1);
|
||||||
HWY_ASSERT(gcpp::IsInside(1.001f, 14.1f, s_muls[kKahan].Max()));
|
ASSERT_INSIDE(kKahan, 1.001f, s_muls[kKahan].Max(), 14.1f);
|
||||||
HWY_ASSERT(gcpp::IsInside(1.0, 1.6, s_muls[kKahan].GeometricMean()));
|
ASSERT_INSIDE(kKahan, 1.0, s_muls[kKahan].GeometricMean(), 1.6);
|
||||||
|
|
||||||
// But can be considerably improved via TwoProducts:
|
// But can be considerably improved via TwoProducts:
|
||||||
HWY_ASSERT(gcpp::IsInside(1.0005, 1.5, s_muls[kAddTwoProd].Mean()));
|
ASSERT_INSIDE(kAddTwoProd, 1.0005, s_muls[kAddTwoProd].Mean(), 1.5);
|
||||||
HWY_ASSERT(gcpp::IsInside(1.001f, 2.3f, s_muls[kAddTwoProd].Max()));
|
ASSERT_INSIDE(kAddTwoProd, 1.001f, s_muls[kAddTwoProd].Max(), 2.3f);
|
||||||
HWY_ASSERT(gcpp::IsInside(1.0, 1.2, s_muls[kAddTwoProd].GeometricMean()));
|
ASSERT_INSIDE(kAddTwoProd, 1.0, s_muls[kAddTwoProd].GeometricMean(), 1.2);
|
||||||
// Updating Kahan's FastTwoSums to TwoSums is not quite as helpful.
|
// Updating Kahan's FastTwoSums to TwoSums is not quite as helpful.
|
||||||
HWY_ASSERT(gcpp::IsInside(1.0005, 2.2, s_muls[kAddTwoSum].Mean()));
|
ASSERT_INSIDE(kAddTwoSum, 1.0005, s_muls[kAddTwoSum].Mean(), 2.2);
|
||||||
HWY_ASSERT(gcpp::IsInside(1.0, 1.3, s_muls[kAddTwoProd].GeometricMean()));
|
ASSERT_INSIDE(kAddTwoSum, 1.0, s_muls[kAddTwoSum].GeometricMean(), 1.3);
|
||||||
|
|
||||||
|
ASSERT_INSIDE(kPairwise, 1.0, s_muls[kPairwise].GeometricMean(), 1.5);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Absolute error; larger is worse.
|
// Absolute error; larger is worse.
|
||||||
void CheckL1() const {
|
void CheckL1() const {
|
||||||
|
// Comp2 is between Compensated and Kahan.
|
||||||
|
ASSERT_INSIDE(kComp2, 1E-5, s_l1s[kComp2].Mean(), 9E-4);
|
||||||
|
ASSERT_INSIDE(kComp2, 1E-5f, s_l1s[kComp2].Max(), 2.6E-3f);
|
||||||
|
|
||||||
// Compensated is very accurate.
|
// Compensated is very accurate.
|
||||||
HWY_ASSERT(s_l1s[kCompensated].Min() == 0.0f);
|
HWY_ASSERT(s_l1s[kCompensated].Min() == 0.0f);
|
||||||
HWY_ASSERT(s_l1s[kCompensated].Max() <= 3E-7f);
|
ASSERT_LESS(kCompensated, s_l1s[kCompensated].Max(), 3E-7f);
|
||||||
|
|
||||||
// Naive and OnlyTwoProd are considerably higher, but not huge.
|
// Naive and OnlyTwoProd are considerably higher, but not huge.
|
||||||
HWY_ASSERT(gcpp::IsInside(1E-3, 2E-2, s_l1s[kNaive].Mean()));
|
ASSERT_INSIDE(kNaive, 1E-3, s_l1s[kNaive].Mean(), 2E-2);
|
||||||
HWY_ASSERT(gcpp::IsInside(1E-3, 2E-2, s_l1s[kOnlyTwoProd].Mean()));
|
ASSERT_INSIDE(kOnlyTwoProd, 1E-3, s_l1s[kOnlyTwoProd].Mean(), 2E-2);
|
||||||
|
|
||||||
// Kahan (FastTwoSum) is decent:
|
// Kahan (FastTwoSum) is decent:
|
||||||
HWY_ASSERT(gcpp::IsInside(4.5E-4, 1E-3, s_l1s[kKahan].Mean()));
|
ASSERT_INSIDE(kKahan, 3.9E-4, s_l1s[kKahan].Mean(), 1E-3);
|
||||||
HWY_ASSERT(gcpp::IsInside(1.1E-3f, 3.2E-3f, s_l1s[kKahan].Max()));
|
ASSERT_INSIDE(kKahan, 1.1E-3f, s_l1s[kKahan].Max(), 3.2E-3f);
|
||||||
|
|
||||||
// But can be nearly halved via TwoProducts:
|
// But can be nearly halved via TwoProducts:
|
||||||
HWY_ASSERT(gcpp::IsInside(2.5E-4, 8E-4, s_l1s[kAddTwoProd].Mean()));
|
ASSERT_INSIDE(kAddTwoProd, 2.2E-4, s_l1s[kAddTwoProd].Mean(), 8E-4);
|
||||||
HWY_ASSERT(gcpp::IsInside(4E-4f, 2.0E-3f, s_l1s[kAddTwoProd].Max()));
|
ASSERT_INSIDE(kAddTwoProd, 4E-4f, s_l1s[kAddTwoProd].Max(), 2.0E-3f);
|
||||||
// Updating Kahan's FastTwoSums to TwoSums does help a bit.
|
// Updating Kahan's FastTwoSums to TwoSums does help a bit.
|
||||||
HWY_ASSERT(gcpp::IsInside(1.5E-4, 5.2E-4, s_l1s[kAddTwoSum].Mean()));
|
ASSERT_INSIDE(kAddTwoSum, 1.5E-4, s_l1s[kAddTwoSum].Mean(), 5.2E-4);
|
||||||
|
|
||||||
|
ASSERT_INSIDE(kPairwise, 4.5E-4, s_l1s[kPairwise].Mean(), 4E-3);
|
||||||
|
ASSERT_INSIDE(kPairwise, 1.1E-3f, s_l1s[kPairwise].Max(), 1E-2f);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Units in the last place; larger is worse.
|
// Units in the last place; larger is worse.
|
||||||
void CheckUlps() const {
|
void CheckUlps() const {
|
||||||
HWY_ASSERT(s_ulps[kCompensated].Max() <= 250.0f);
|
ASSERT_LESS(kComp2, s_ulps[kCompensated].Max(), 3.6E6f);
|
||||||
|
ASSERT_LESS(kCompensated, s_ulps[kCompensated].Max(), 250.0f);
|
||||||
HWY_ASSERT(s_ulps[kNaive].Max() <= 4E9f);
|
ASSERT_LESS(kNaive, s_ulps[kNaive].Max(), 4E9f);
|
||||||
HWY_ASSERT(s_ulps[kOnlyTwoProd].Max() <= 3E9f);
|
ASSERT_LESS(kOnlyTwoProd, s_ulps[kOnlyTwoProd].Max(), 3E9f);
|
||||||
|
ASSERT_LESS(kKahan, s_ulps[kKahan].Max(), 4E7f);
|
||||||
HWY_ASSERT(s_ulps[kKahan].Max() <= 4E7f);
|
ASSERT_LESS(kAddTwoProd, s_ulps[kAddTwoProd].Max(), 1E7f);
|
||||||
HWY_ASSERT(s_ulps[kAddTwoProd].Max() <= 1E7f);
|
ASSERT_LESS(kAddTwoSum, s_ulps[kAddTwoSum].Max(), 2.5E7f);
|
||||||
HWY_ASSERT(s_ulps[kAddTwoSum].Max() <= 2.5E7f);
|
ASSERT_LESS(kPairwise, s_ulps[kPairwise].Max(), 3.3E9f);
|
||||||
}
|
}
|
||||||
|
|
||||||
hwy::Stats s_cond;
|
hwy::Stats s_cond;
|
||||||
|
|
@ -715,32 +924,35 @@ struct TestShortDotsT {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
constexpr bool kCompressed = IsCompressed<Packed>();
|
||||||
// Verify the dot products are plausible. This is only to verify
|
// Verify the dot products are plausible. This is only to verify
|
||||||
// correctness, not to differentiate between the variants.
|
// correctness, not to differentiate between the variants.
|
||||||
double expected_l1[kVariants];
|
double expected_l1[kVariants];
|
||||||
// Tolerances are much lower for compressed inputs: the more limited set of
|
// Tolerances are much lower for compressed inputs: the more limited set of
|
||||||
// values seems to reduce roundoff.
|
// values seems to reduce roundoff.
|
||||||
constexpr bool kCompressed = IsCompressed<Packed>();
|
for (size_t variant = 0; variant < kVariants; ++variant) {
|
||||||
expected_l1[kAddTwoProd] = kCompressed ? 1.5E-6 : 5E-5;
|
expected_l1[variant] = kCompressed ? 1.5E-6 : 7E-5;
|
||||||
expected_l1[kAddTwoSum] = kCompressed ? 1.5E-6 : 6E-5;
|
}
|
||||||
expected_l1[kCompensated] = kCompressed ? 1.5E-6 : 4E-5;
|
expected_l1[kNaive] = kCompressed ? 4E-6 : 2E-4;
|
||||||
expected_l1[kKahan] = kCompressed ? 1.5E-6 : 7E-5;
|
expected_l1[kPairwise] = kCompressed ? 4E-6 : 2E-4;
|
||||||
expected_l1[kNaive] = kCompressed ? 4E-6 : 1.5E-4;
|
|
||||||
expected_l1[kOnlyTwoProd] = kCompressed ? 1.5E-6 : 6E-5;
|
|
||||||
|
|
||||||
for (size_t variant = 0; variant < kVariants; ++variant) {
|
for (size_t variant = 0; variant < kVariants; ++variant) {
|
||||||
HWY_ASSERT(s_l1[variant].Min() >= 0.0f);
|
HWY_ASSERT(s_l1[variant].Min() >= 0.0f);
|
||||||
HWY_ASSERT(s_l1[variant].Max() <= 1.5E-3f);
|
ASSERT_LESS(variant, s_l1[variant].Max(), 1.5E-3f);
|
||||||
if (s_l1[variant].Mean() > expected_l1[variant]) {
|
ASSERT_LESS(variant, s_l1[variant].Mean(), expected_l1[variant]);
|
||||||
HWY_ABORT("%s -> %s: %s mean l1 %.5E > %.5E\n", TypeName<Packed>(),
|
|
||||||
TypeName<T>(), VariantName(variant), s_l1[variant].Mean(),
|
|
||||||
expected_l1[variant]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
void TestAllShortDots() { ForeachPackedAndRawType<TestShortDotsT>(); }
|
void TestAllShortDots() {
|
||||||
|
// Skip EMU128 and old x86, include SSE4 because it tests the non-FMA path.
|
||||||
|
if (HWY_TARGET == HWY_EMU128 || HWY_TARGET == HWY_SSSE3 ||
|
||||||
|
HWY_TARGET == HWY_SSE2) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ForeachPackedAndRawType<TestShortDotsT>();
|
||||||
|
}
|
||||||
|
|
||||||
// Excludes outliers; we might not have enough samples for a reliable mode.
|
// Excludes outliers; we might not have enough samples for a reliable mode.
|
||||||
double TrimmedMean(double* seconds, size_t num) {
|
double TrimmedMean(double* seconds, size_t num) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue