From b959ea1a22d4a0bb855ed2232eb04b8acad40da8 Mon Sep 17 00:00:00 2001
From: Jan Wassenberg <janwas@google.com>
Date: Tue, 25 Nov 2025 20:24:05 -0800
Subject: [PATCH] Add ToFloatSlow, move RandomFloat to test_util

PiperOrigin-RevId: 836915166
---
 BUILD.bazel                    |  2 ++
 compression/BUILD.bazel        |  3 +--
 compression/compress-inl.h     | 37 ++++++++++++++++++++++++++++++++++
 compression/python/BUILD.bazel |  1 -
 compression/sfp_test.cc        | 30 ++++++++-------------------
 ops/dot_test.cc                | 12 -----------
 util/test_util.h               | 28 +++++++++++++++++++++++++
 7 files changed, 76 insertions(+), 37 deletions(-)
diff --git a/BUILD.bazel b/BUILD.bazel
index fd85acb..02a7996 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -171,6 +171,7 @@ cc_library(
     name = "test_util",
     hdrs = ["util/test_util.h"],
     deps = [
+        ":basics",
         "@highway//:hwy",
         "@highway//:hwy_test_util",
         "@highway//:stats",
@@ -575,6 +576,7 @@ cc_library(
         ":mat",
         ":matmul",
         ":matmul_env",
+        ":test_util",
         ":model_store",
         ":ops",
         ":threading",
diff --git a/compression/BUILD.bazel b/compression/BUILD.bazel
index c04bd08..0fb43d7 100644
--- a/compression/BUILD.bazel
+++ b/compression/BUILD.bazel
@@ -135,8 +135,8 @@ cc_test(
     # for test_suite.
     tags = ["hwy_ops_test"],
     deps = [
+        ":compress",
         ":distortion",
-        ":sfp",
         "@googletest//:gtest_main",  # buildcleaner: keep
         "//:test_util",
         "@highway//:hwy",
@@ -182,7 +182,6 @@ cc_library(
         "//:mat",
         "//:threading_context",
         "@highway//:hwy",
-        "@highway//:nanobenchmark",
         "@highway//:profiler",
         "@highway//:stats",
         "@highway//:thread_pool",
diff --git a/compression/compress-inl.h b/compression/compress-inl.h
index 42812ef..e7bb9d6 100644
--- a/compression/compress-inl.h
+++ b/compression/compress-inl.h
@@ -82,6 +82,8 @@ struct CompressTraits<float> {
     hn::StoreU(raw1, df, packed.ptr + packed_ofs + NF);
   }
 
+  static float ToFloatSlow(const Packed x) { return x; }  // no decoding needed
+
   template <class DBF16, HWY_IF_BF16_D(DBF16), class VBF16 = hn::Vec<DBF16>>
   static HWY_INLINE void Load2(DBF16 dbf16,
                                const PackedSpan<const Packed>& packed,
@@ -254,6 +256,10 @@ struct CompressTraits<BF16> {
                packed.ptr + packed_ofs);
   }
 
+  static float ToFloatSlow(const Packed x) {  // scalar bf16 -> f32
+    return hwy::ConvertScalarTo<float>(x);
+  }
+
   template <class DBF16, HWY_IF_BF16_D(DBF16)>
   static HWY_INLINE void Load2(DBF16 dbf16,
                                const PackedSpan<const Packed>& packed,
@@ -397,6 +403,27 @@ struct CompressTraits<SfpStream> {
     }
   }
 
+  // Decodes one SFP byte to f32. NOTE: the per-tensor scale is not applied.
+  static float ToFloatSlow(const Packed x) {
+    const uint32_t bits = x.byte;
+    HWY_ASSERT(bits != 0x80);  // -0 is reserved
+    const uint32_t magnitude = bits & 0x7F;
+    if (magnitude == 0) return 0.0f;
+
+    // Codes >= 64 have a 3-bit mantissa and bias 15; others 2 bits, bias 23.
+    const bool is_large = magnitude >= 64;
+    const uint32_t mantissa_bits = is_large ? 3u : 2u;
+    const uint32_t bias = is_large ? 15u : 23u;
+    const uint32_t mantissa = magnitude & ((1u << mantissa_bits) - 1u);
+    const uint32_t exponent = magnitude >> mantissa_bits;
+
+    // Assemble the binary32 fields: sign | biased exponent | mantissa.
+    const uint32_t sign_field = (bits & 0x80) << 24;
+    const uint32_t exp_field = (127u + exponent - bias) << 23;
+    const uint32_t mnt_field = mantissa << (23 - mantissa_bits);
+    return hwy::BitCastScalar<float>(sign_field | exp_field | mnt_field);
+  }
+
   template <class D>  // Caller checks this is f32 or bf16
   static HWY_INLINE void Load2(D d, const PackedSpan<const Packed>& packed,
                                const size_t packed_ofs, hn::Vec<D>& raw0,
@@ -437,6 +464,12 @@ struct CompressTraits<I8Stream> {
     IntCodec::Dec2(d, packed, packed_ofs, raw0, raw1);
   }
 
+  static float ToFloatSlow(const Packed /*x*/) {  // unnamed: never read
+    HWY_DASSERT(!"Not supported - requires a stream");
+    return 0.0f;  // placeholder result if the debug assert is disabled
+  }
+  // Store2 is not yet implemented.
+
   template <class D, typename Raw>
   static HWY_INLINE void DecompressAndZeroPad(
       D d, const PackedSpan<const Packed>& packed, const size_t packed_ofs,
@@ -483,6 +516,10 @@ struct CompressTraits<NuqStream> {
     NuqCodec::Dec2(d, packed, packed_ofs, raw0, raw1);
   }
 
+  static float ToFloatSlow(const Packed /*x*/) {  // unnamed: never read
+    HWY_DASSERT(!"Not supported - requires a stream");
+    return 0.0f;  // placeholder result if the debug assert is disabled
+  }
   // Store2 is not yet implemented.
 
   template <class D, typename Raw>
diff --git a/compression/python/BUILD.bazel b/compression/python/BUILD.bazel
index 4d9b2ac..e3b7e36 100644
--- a/compression/python/BUILD.bazel
+++ b/compression/python/BUILD.bazel
@@ -26,7 +26,6 @@ cc_library(
         "//io",
         "//io:blob_store",
         "@highway//:hwy",
-        "@highway//:thread_pool",
     ],
 )
 
diff --git a/compression/sfp_test.cc b/compression/sfp_test.cc
index 8e49ceb..df3e846 100644
--- a/compression/sfp_test.cc
+++ b/compression/sfp_test.cc
@@ -37,37 +37,23 @@
 #include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 // After highway.h
-#include "compression/sfp-inl.h"
+#include "compression/compress-inl.h"
 #include "hwy/tests/test_util-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace gcpp {
 namespace HWY_NAMESPACE {
 
-// Decode
-float F32FromSFP8(uint32_t sfp) {
-  HWY_ASSERT(sfp < 256);
-  HWY_ASSERT(sfp != 0x80);  // -0 is reserved
+HWY_INLINE_VAR constexpr bool kPrint = false;  // gates debug fprintf output
 
-  const uint32_t sign32 = (sfp & 0x80) << 24;
-  sfp &= 0x7F;
-  const bool large_e = sfp >= 64;
-  const size_t m_bits = large_e ? 3 : 2;
-  uint32_t m = sfp & ((1u << m_bits) - 1u);
-  size_t e = sfp >> m_bits;
-  if (sfp == 0) return 0.0f;
-  const uint32_t e_bias = large_e ? 15 : 23;
-  const uint32_t exp32 = static_cast<uint32_t>(127 + e - e_bias) << 23;
-  const uint32_t mnt32 = m << (23 - m_bits);
-  const uint32_t binary32 = sign32 | exp32 | mnt32;
-  float result;
-  hwy::CopySameSize(&binary32, &result);
-  return result;
+static float F32FromSFP8(uint32_t sfp) {  // decodes one SFP code to f32
+  HWY_ASSERT(sfp < 256);  // the narrowing cast below must not truncate
+  return CompressTraits<SfpStream>::ToFloatSlow({static_cast<uint8_t>(sfp)});
 }
 
 // Used for HWY_AVX3_DL and newer.
 void PrintTables() {
-  if (HWY_ONCE && false) {
+  if (HWY_ONCE && kPrint) {
     uint8_t hi[128];
     fprintf(stderr, "lo\n");
     for (uint32_t sfp = 0; sfp < 128; ++sfp) {
@@ -92,7 +78,7 @@ void TestAllUnique() {
     unique.insert(F32FromSFP8(sfp));
   }
   HWY_ASSERT_EQ(size_t{255}, unique.size());
-  if (false) {
+  if (kPrint) {
     for (float f : unique) {
       fprintf(stderr, "%e\n", f);
     }
@@ -163,7 +149,7 @@ HWY_INLINE uint32_t SFP8FromF32(float f) {
     if (m == 0) m = 1;
   }
 
-  if (false) {
+  if (kPrint) {
     fprintf(stderr, "in %x round %x rounded %x e %d m %x large_e %d\n",
             org_binary32, round, rounded, e, m, large_e);
   }
diff --git a/ops/dot_test.cc b/ops/dot_test.cc
index 5547e86..bce8904 100644
--- a/ops/dot_test.cc
+++ b/ops/dot_test.cc
@@ -891,18 +891,6 @@ class DotStats {
   hwy::Stats s_times[kVariants];
 };
 
-// Returns normalized value in [-1, 1).
-float RandomFloat(RngStream& rng) {
-  const uint32_t exp = hwy::BitCastScalar<uint32_t>(1.0f);
-  const uint32_t mantissa_mask = hwy::MantissaMask<float>();
-  const uint32_t representation = exp | (rng() & mantissa_mask);
-  const float f12 = hwy::BitCastScalar<float>(representation);
-  HWY_DASSERT(1.0f <= f12 && f12 < 2.0f);  // exponent is 2^0, only mantissa
-  const float f = (2.0f * (f12 - 1.0f)) - 1.0f;
-  HWY_DASSERT(-1.0f <= f && f < 1.0f);
-  return f;
-}
-
 // `raw` holds the decompressed values, so that the test measures only the
 // error from the Dot algorithms, not the compression.
 template <typename Packed>
diff --git a/util/test_util.h b/util/test_util.h
index 355b096..32e1e04 100644
--- a/util/test_util.h
+++ b/util/test_util.h
@@ -19,8 +19,10 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <algorithm>  // std::sort
 #include <cmath>
 
+#include "util/basics.h"  // RngStream
 #include "hwy/base.h"
 
 // IWYU pragma: begin_exports
@@ -30,9 +32,35 @@
 
 namespace gcpp {
 
+// Returns the mean of the sorted samples in [num/4, num/2), which excludes
+// outliers (too few samples for a reliable mode). Sorts `seconds` in place.
+HWY_INLINE double TrimmedMean(double* seconds, size_t num) {
+  HWY_DASSERT(num != 0);
+  if (num == 1) return seconds[0];  // [num/4, num/2) is empty: avoid 0/0
+  std::sort(seconds, seconds + num);
+  const size_t begin = num / 4;
+  const size_t end = num / 2;  // > begin because num >= 2
+  double sum = 0.0;
+  for (size_t i = begin; i < end; ++i) sum += seconds[i];
+  return sum / static_cast<double>(end - begin);
+}
+
+// Returns a normalized value in [-1, 1) using a single call to `rng`.
+HWY_INLINE float RandomFloat(RngStream& rng) {
+  // Random mantissa under the exponent of 1.0f yields a float in [1, 2).
+  const uint32_t one_bits = hwy::BitCastScalar<uint32_t>(1.0f);
+  const uint32_t random_mantissa = rng() & hwy::MantissaMask<float>();
+  const float in_1_2 = hwy::BitCastScalar<float>(one_bits | random_mantissa);
+  HWY_DASSERT(1.0f <= in_1_2 && in_1_2 < 2.0f);
+  const float result = (2.0f * (in_1_2 - 1.0f)) - 1.0f;  // [1,2) -> [-1,1)
+  HWY_DASSERT(-1.0f <= result && result < 1.0f);
+  return result;
+}
+
 // Returns random Gaussian (mean=0, stddev=1/3 similar to expected weights)
 // using the central limit theorem. Avoid std::normal_distribution for
 // consistent cross-platform output.
+// TODO: use RngStream instead of RandomState.
 HWY_INLINE double RandomGaussian(hwy::RandomState& rng) {
   uint64_t sum = 0;
   constexpr int kReps = 40;