From b959ea1a22d4a0bb855ed2232eb04b8acad40da8 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Tue, 25 Nov 2025 20:24:05 -0800 Subject: [PATCH] Add ToFloatSlow, move RandomFloat to test_util PiperOrigin-RevId: 836915166 --- BUILD.bazel | 2 ++ compression/BUILD.bazel | 3 +-- compression/compress-inl.h | 37 ++++++++++++++++++++++++++++++++++ compression/python/BUILD.bazel | 1 - compression/sfp_test.cc | 30 ++++++++------------------- ops/dot_test.cc | 12 ----------- util/test_util.h | 28 +++++++++++++++++++++++++ 7 files changed, 76 insertions(+), 37 deletions(-) diff --git a/BUILD.bazel b/BUILD.bazel index fd85acb..02a7996 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -171,6 +171,7 @@ cc_library( name = "test_util", hdrs = ["util/test_util.h"], deps = [ + ":basics", "@highway//:hwy", "@highway//:hwy_test_util", "@highway//:stats", @@ -575,6 +576,7 @@ cc_library( ":mat", ":matmul", ":matmul_env", + ":test_util", ":model_store", ":ops", ":threading", diff --git a/compression/BUILD.bazel b/compression/BUILD.bazel index c04bd08..0fb43d7 100644 --- a/compression/BUILD.bazel +++ b/compression/BUILD.bazel @@ -135,8 +135,8 @@ cc_test( # for test_suite. tags = ["hwy_ops_test"], deps = [ + ":compress", ":distortion", - ":sfp", "@googletest//:gtest_main", # buildcleaner: keep "//:test_util", "@highway//:hwy", @@ -182,7 +182,6 @@ cc_library( "//:mat", "//:threading_context", "@highway//:hwy", - "@highway//:nanobenchmark", "@highway//:profiler", "@highway//:stats", "@highway//:thread_pool", diff --git a/compression/compress-inl.h b/compression/compress-inl.h index 42812ef..e7bb9d6 100644 --- a/compression/compress-inl.h +++ b/compression/compress-inl.h @@ -82,6 +82,8 @@ struct CompressTraits { hn::StoreU(raw1, df, packed.ptr + packed_ofs + NF); } + static float ToFloatSlow(const Packed x) { return x; } + template > static HWY_INLINE void Load2(DBF16 dbf16, const PackedSpan& packed, @@ -254,6 +256,10 @@ struct CompressTraits { packed.ptr + packed_ofs); } + static float ToFloatSlow(const Packed x) { + return hwy::ConvertScalarTo(x); + } + template static HWY_INLINE void Load2(DBF16 dbf16, const PackedSpan& packed, @@ -397,6 +403,27 @@ struct CompressTraits { } } + // NOTE: this does not take into account the per-tensor scale. + static float ToFloatSlow(const Packed x) { + uint32_t sfp = x.byte; + HWY_ASSERT(sfp != 0x80); // -0 is reserved + + const uint32_t sign32 = (sfp & 0x80) << 24; + sfp &= 0x7F; + const bool large_e = sfp >= 64; + const size_t m_bits = large_e ? 3 : 2; + uint32_t m = sfp & ((1u << m_bits) - 1u); + size_t e = sfp >> m_bits; + if (sfp == 0) return 0.0f; + const uint32_t e_bias = large_e ? 15 : 23; + const uint32_t exp32 = static_cast(127 + e - e_bias) << 23; + const uint32_t mnt32 = m << (23 - m_bits); + const uint32_t binary32 = sign32 | exp32 | mnt32; + float result; + hwy::CopySameSize(&binary32, &result); + return result; + } + template // Caller checks this is f32 or bf16 static HWY_INLINE void Load2(D d, const PackedSpan& packed, const size_t packed_ofs, hn::Vec& raw0, @@ -437,6 +464,12 @@ struct CompressTraits { IntCodec::Dec2(d, packed, packed_ofs, raw0, raw1); } + static float ToFloatSlow(const Packed x) { + HWY_DASSERT(!"Not supported - requires a stream"); + return 0.0f; + } + // Store2 is not yet implemented. + template static HWY_INLINE void DecompressAndZeroPad( D d, const PackedSpan& packed, const size_t packed_ofs, @@ -483,6 +516,10 @@ struct CompressTraits { NuqCodec::Dec2(d, packed, packed_ofs, raw0, raw1); } + static float ToFloatSlow(const Packed x) { + HWY_DASSERT(!"Not supported - requires a stream"); + return 0.0f; + } // Store2 is not yet implemented. template diff --git a/compression/python/BUILD.bazel b/compression/python/BUILD.bazel index 4d9b2ac..e3b7e36 100644 --- a/compression/python/BUILD.bazel +++ b/compression/python/BUILD.bazel @@ -26,7 +26,6 @@ cc_library( "//io", "//io:blob_store", "@highway//:hwy", - "@highway//:thread_pool", ], ) diff --git a/compression/sfp_test.cc b/compression/sfp_test.cc index 8e49ceb..df3e846 100644 --- a/compression/sfp_test.cc +++ b/compression/sfp_test.cc @@ -37,37 +37,23 @@ #include "hwy/foreach_target.h" // IWYU pragma: keep #include "hwy/highway.h" // After highway.h -#include "compression/sfp-inl.h" +#include "compression/compress-inl.h" #include "hwy/tests/test_util-inl.h" HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -// Decode -float F32FromSFP8(uint32_t sfp) { - HWY_ASSERT(sfp < 256); - HWY_ASSERT(sfp != 0x80); // -0 is reserved +HWY_INLINE_VAR constexpr bool kPrint = false; - const uint32_t sign32 = (sfp & 0x80) << 24; - sfp &= 0x7F; - const bool large_e = sfp >= 64; - const size_t m_bits = large_e ? 3 : 2; - uint32_t m = sfp & ((1u << m_bits) - 1u); - size_t e = sfp >> m_bits; - if (sfp == 0) return 0.0f; - const uint32_t e_bias = large_e ? 15 : 23; - const uint32_t exp32 = static_cast(127 + e - e_bias) << 23; - const uint32_t mnt32 = m << (23 - m_bits); - const uint32_t binary32 = sign32 | exp32 | mnt32; - float result; - hwy::CopySameSize(&binary32, &result); - return result; +static float F32FromSFP8(uint32_t sfp) { + return CompressTraits::ToFloatSlow( + SfpStream{static_cast(sfp)}); } // Used for HWY_AVX3_DL and newer. void PrintTables() { - if (HWY_ONCE && false) { + if (HWY_ONCE && kPrint) { uint8_t hi[128]; fprintf(stderr, "lo\n"); for (uint32_t sfp = 0; sfp < 128; ++sfp) { @@ -92,7 +78,7 @@ void TestAllUnique() { unique.insert(F32FromSFP8(sfp)); } HWY_ASSERT_EQ(size_t{255}, unique.size()); - if (false) { + if (kPrint) { for (float f : unique) { fprintf(stderr, "%e\n", f); } @@ -163,7 +149,7 @@ HWY_INLINE uint32_t SFP8FromF32(float f) { if (m == 0) m = 1; } - if (false) { + if (kPrint) { fprintf(stderr, "in %x round %x rounded %x e %d m %x large_e %d\n", org_binary32, round, rounded, e, m, large_e); } diff --git a/ops/dot_test.cc b/ops/dot_test.cc index 5547e86..bce8904 100644 --- a/ops/dot_test.cc +++ b/ops/dot_test.cc @@ -891,18 +891,6 @@ class DotStats { hwy::Stats s_times[kVariants]; }; -// Returns normalized value in [-1, 1). -float RandomFloat(RngStream& rng) { - const uint32_t exp = hwy::BitCastScalar(1.0f); - const uint32_t mantissa_mask = hwy::MantissaMask(); - const uint32_t representation = exp | (rng() & mantissa_mask); - const float f12 = hwy::BitCastScalar(representation); - HWY_DASSERT(1.0f <= f12 && f12 < 2.0f); // exponent is 2^0, only mantissa - const float f = (2.0f * (f12 - 1.0f)) - 1.0f; - HWY_DASSERT(-1.0f <= f && f < 1.0f); - return f; -} - // `raw` holds the decompressed values, so that the test measures only the // error from the Dot algorithms, not the compression. template diff --git a/util/test_util.h b/util/test_util.h index 355b096..32e1e04 100644 --- a/util/test_util.h +++ b/util/test_util.h @@ -19,8 +19,10 @@ #include #include +#include // std::sort #include +#include "util/basics.h" // RngStream #include "hwy/base.h" // IWYU pragma: begin_exports @@ -30,9 +32,35 @@ namespace gcpp { +// Excludes outliers; we might not have enough samples for a reliable mode. +HWY_INLINE double TrimmedMean(double* seconds, size_t num) { + std::sort(seconds, seconds + num); + double sum = 0; + int count = 0; + for (size_t i = num / 4; i < num / 2; ++i) { + sum += seconds[i]; + count += 1; + } + HWY_DASSERT(num != 0); + return sum / count; +} + +// Returns normalized value in [-1, 1). +HWY_INLINE float RandomFloat(RngStream& rng) { + const uint32_t exp = hwy::BitCastScalar(1.0f); + const uint32_t mantissa_mask = hwy::MantissaMask(); + const uint32_t representation = exp | (rng() & mantissa_mask); + const float f12 = hwy::BitCastScalar(representation); + HWY_DASSERT(1.0f <= f12 && f12 < 2.0f); // exponent is 2^0, only mantissa + const float f = (2.0f * (f12 - 1.0f)) - 1.0f; + HWY_DASSERT(-1.0f <= f && f < 1.0f); + return f; +} + // Returns random Gaussian (mean=0, stddev=1/3 similar to expected weights) // using the central limit theorem. Avoid std::normal_distribution for // consistent cross-platform output. +// TODO: use RngStream instead of RandomState. HWY_INLINE double RandomGaussian(hwy::RandomState& rng) { uint64_t sum = 0; constexpr int kReps = 40;