Merge b959ea1a22 into c153d5255b

2025-11-27 08:04:31 +00:00 · 2025-11-27 08:04:31 +00:00 · 3bc8da8d7b
parent c153d5255b b959ea1a22
commit 3bc8da8d7b
7 changed files with 76 additions and 37 deletions
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -171,6 +171,7 @@ cc_library(
    name = "test_util",
    hdrs = ["util/test_util.h"],
    deps = [
+        ":basics",
        "@highway//:hwy",
        "@highway//:hwy_test_util",
        "@highway//:stats",
@ -575,6 +576,7 @@ cc_library(
        ":mat",
        ":matmul",
        ":matmul_env",
+        ":test_util",
        ":model_store",
        ":ops",
        ":threading",
--- a/compression/BUILD.bazel
+++ b/compression/BUILD.bazel
@ -135,8 +135,8 @@ cc_test(
    # for test_suite.
    tags = ["hwy_ops_test"],
    deps = [
+        ":compress",
        ":distortion",
-        ":sfp",
        "@googletest//:gtest_main",  # buildcleaner: keep
        "//:test_util",
        "@highway//:hwy",
@ -182,7 +182,6 @@ cc_library(
        "//:mat",
        "//:threading_context",
        "@highway//:hwy",
-        "@highway//:nanobenchmark",
        "@highway//:profiler",
        "@highway//:stats",
        "@highway//:thread_pool",
--- a/compression/compress-inl.h
+++ b/compression/compress-inl.h
@ -82,6 +82,8 @@ struct CompressTraits<float> {
    hn::StoreU(raw1, df, packed.ptr + packed_ofs + NF);
  }

+  static float ToFloatSlow(const Packed x) { return x; }  // no decoding needed
+
  template <class DBF16, HWY_IF_BF16_D(DBF16), class VBF16 = hn::Vec<DBF16>>
  static HWY_INLINE void Load2(DBF16 dbf16,
                               const PackedSpan<const Packed>& packed,
@ -254,6 +256,10 @@ struct CompressTraits<BF16> {
               packed.ptr + packed_ofs);
  }

+  static float ToFloatSlow(const Packed x) {  // scalar (non-SIMD) conversion
+    return hwy::ConvertScalarTo<float>(x);
+  }
+
  template <class DBF16, HWY_IF_BF16_D(DBF16)>
  static HWY_INLINE void Load2(DBF16 dbf16,
                               const PackedSpan<const Packed>& packed,
@ -397,6 +403,27 @@ struct CompressTraits<SfpStream> {
    }
  }

+  // Scalar reference decoder: expands one SFP byte into an f32.
+  // NOTE: this does not take into account the per-tensor scale.
+  static float ToFloatSlow(const Packed x) {
+    const uint32_t bits = x.byte;
+    HWY_ASSERT(bits != 0x80);  // -0 is reserved
+
+    // Bit 7 is the sign; shift it up to bit 31 of the binary32 encoding.
+    const uint32_t sign32 = (bits & 0x80) << 24;
+    const uint32_t magnitude = bits & 0x7F;
+    if (magnitude == 0) return 0.0f;
+
+    // Magnitudes >= 64 carry 3 mantissa bits with exponent bias 15; all
+    // others carry 2 mantissa bits with exponent bias 23.
+    const bool large_e = magnitude >= 64;
+    const size_t m_bits = large_e ? 3 : 2;
+    const uint32_t e_bias = large_e ? 15 : 23;
+    const uint32_t mantissa = magnitude & ((1u << m_bits) - 1u);
+    const size_t biased_e = magnitude >> m_bits;
+
+    // Rebias the exponent for binary32 (bias 127) and left-align the mantissa
+    // within the 23-bit binary32 mantissa field.
+    const uint32_t e32 = static_cast<uint32_t>(127 + biased_e - e_bias);
+    const uint32_t binary32 = sign32 | (e32 << 23) | (mantissa << (23 - m_bits));
+    float result;
+    hwy::CopySameSize(&binary32, &result);
+    return result;
+  }
+
  template <class D>  // Caller checks this is f32 or bf16
  static HWY_INLINE void Load2(D d, const PackedSpan<const Packed>& packed,
                               const size_t packed_ofs, hn::Vec<D>& raw0,
@ -437,6 +464,12 @@ struct CompressTraits<I8Stream> {
    IntCodec::Dec2(d, packed, packed_ofs, raw0, raw1);
  }

+  static float ToFloatSlow(const Packed x) {  // `x` is unused
+    HWY_DASSERT(!"Not supported - requires a stream");  // always fails
+    return 0.0f;  // placeholder result if the assert above does not abort
+  }
+  // Store2 is not yet implemented.
+
  template <class D, typename Raw>
  static HWY_INLINE void DecompressAndZeroPad(
      D d, const PackedSpan<const Packed>& packed, const size_t packed_ofs,
@ -483,6 +516,10 @@ struct CompressTraits<NuqStream> {
    NuqCodec::Dec2(d, packed, packed_ofs, raw0, raw1);
  }

+  static float ToFloatSlow(const Packed x) {  // `x` is unused
+    HWY_DASSERT(!"Not supported - requires a stream");  // always fails
+    return 0.0f;  // placeholder result if the assert above does not abort
+  }
  // Store2 is not yet implemented.

  template <class D, typename Raw>
--- a/compression/python/BUILD.bazel
+++ b/compression/python/BUILD.bazel
@ -26,7 +26,6 @@ cc_library(
        "//io",
        "//io:blob_store",
        "@highway//:hwy",
-        "@highway//:thread_pool",
    ],
 )

--- a/compression/sfp_test.cc
+++ b/compression/sfp_test.cc
@ -37,37 +37,23 @@
 #include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 // After highway.h
-#include "compression/sfp-inl.h"
+#include "compression/compress-inl.h"
 #include "hwy/tests/test_util-inl.h"

 HWY_BEFORE_NAMESPACE();
 namespace gcpp {
 namespace HWY_NAMESPACE {

-// Decode
-float F32FromSFP8(uint32_t sfp) {
-  HWY_ASSERT(sfp < 256);
-  HWY_ASSERT(sfp != 0x80);  // -0 is reserved
+HWY_INLINE_VAR constexpr bool kPrint = false;

-  const uint32_t sign32 = (sfp & 0x80) << 24;
-  sfp &= 0x7F;
-  const bool large_e = sfp >= 64;
-  const size_t m_bits = large_e ? 3 : 2;
-  uint32_t m = sfp & ((1u << m_bits) - 1u);
-  size_t e = sfp >> m_bits;
-  if (sfp == 0) return 0.0f;
-  const uint32_t e_bias = large_e ? 15 : 23;
-  const uint32_t exp32 = static_cast<uint32_t>(127 + e - e_bias) << 23;
-  const uint32_t mnt32 = m << (23 - m_bits);
-  const uint32_t binary32 = sign32 | exp32 | mnt32;
-  float result;
-  hwy::CopySameSize(&binary32, &result);
-  return result;
+// Decodes the SFP byte `sfp` (must be < 256 and not 0x80, i.e. -0) to f32 by
+// forwarding to the scalar decoder CompressTraits<SfpStream>::ToFloatSlow.
+static float F32FromSFP8(uint32_t sfp) {
+  HWY_ASSERT(sfp < 256);  // otherwise the cast below would silently truncate
+  return CompressTraits<SfpStream>::ToFloatSlow(
+      SfpStream{static_cast<uint8_t>(sfp)});
+}

 // Used for HWY_AVX3_DL and newer.
 void PrintTables() {
-  if (HWY_ONCE && false) {
+  if (HWY_ONCE && kPrint) {
    uint8_t hi[128];
    fprintf(stderr, "lo\n");
    for (uint32_t sfp = 0; sfp < 128; ++sfp) {
@ -92,7 +78,7 @@ void TestAllUnique() {
    unique.insert(F32FromSFP8(sfp));
  }
  HWY_ASSERT_EQ(size_t{255}, unique.size());
-  if (false) {
+  if (kPrint) {
    for (float f : unique) {
      fprintf(stderr, "%e\n", f);
    }
@ -163,7 +149,7 @@ HWY_INLINE uint32_t SFP8FromF32(float f) {
    if (m == 0) m = 1;
  }

-  if (false) {
+  if (kPrint) {
    fprintf(stderr, "in %x round %x rounded %x e %d m %x large_e %d\n",
            org_binary32, round, rounded, e, m, large_e);
  }
--- a/ops/dot_test.cc
+++ b/ops/dot_test.cc
@ -891,18 +891,6 @@ class DotStats {
  hwy::Stats s_times[kVariants];
 };

-// Returns normalized value in [-1, 1).
-float RandomFloat(RngStream& rng) {
-  const uint32_t exp = hwy::BitCastScalar<uint32_t>(1.0f);
-  const uint32_t mantissa_mask = hwy::MantissaMask<float>();
-  const uint32_t representation = exp | (rng() & mantissa_mask);
-  const float f12 = hwy::BitCastScalar<float>(representation);
-  HWY_DASSERT(1.0f <= f12 && f12 < 2.0f);  // exponent is 2^0, only mantissa
-  const float f = (2.0f * (f12 - 1.0f)) - 1.0f;
-  HWY_DASSERT(-1.0f <= f && f < 1.0f);
-  return f;
-}
-
 // `raw` holds the decompressed values, so that the test measures only the
 // error from the Dot algorithms, not the compression.
 template <typename Packed>
--- a/util/test_util.h
+++ b/util/test_util.h
@ -19,8 +19,10 @@
 #include <stddef.h>
 #include <stdint.h>

+#include <algorithm>  // std::sort
 #include <cmath>

+#include "util/basics.h"  // RngStream
 #include "hwy/base.h"

 // IWYU pragma: begin_exports
@ -30,9 +32,35 @@

 namespace gcpp {

+// Returns the mean of the sorted samples with index in [num / 4, num / 2),
+// i.e. the second quartile. Excludes outliers; we might not have enough
+// samples for a reliable mode.
+// `seconds` points to `num` samples and is sorted in place. `num` must be
+// nonzero (checked via HWY_DASSERT).
+HWY_INLINE double TrimmedMean(double* seconds, size_t num) {
+  HWY_DASSERT(num != 0);
+  std::sort(seconds, seconds + num);
+  const size_t begin = num / 4;
+  // For num == 1, [num / 4, num / 2) is empty, which would yield 0 / 0 = NaN.
+  // Average the single available sample instead. Larger `num` is unaffected.
+  const size_t end = (num == 1) ? 1 : num / 2;
+  double sum = 0;
+  for (size_t i = begin; i < end; ++i) {
+    sum += seconds[i];
+  }
+  return sum / static_cast<double>(end - begin);
+}
+
+// Returns normalized value in [-1, 1). Random mantissa bits from `rng` are
+// combined with the exponent bits of 1.0f to obtain a float in [1, 2), which
+// is then mapped linearly onto the output range.
+HWY_INLINE float RandomFloat(RngStream& rng) {
+  const uint32_t one_bits = hwy::BitCastScalar<uint32_t>(1.0f);
+  const uint32_t mantissa = rng() & hwy::MantissaMask<float>();
+  const float one_to_two = hwy::BitCastScalar<float>(one_bits | mantissa);
+  // Exponent is 2^0, only the mantissa varies.
+  HWY_DASSERT(1.0f <= one_to_two && one_to_two < 2.0f);
+  const float zero_to_one = one_to_two - 1.0f;
+  const float result = (2.0f * zero_to_one) - 1.0f;
+  HWY_DASSERT(-1.0f <= result && result < 1.0f);
+  return result;
+}
+
 // Returns random Gaussian (mean=0, stddev=1/3 similar to expected weights)
 // using the central limit theorem. Avoid std::normal_distribution for
 // consistent cross-platform output.
+// TODO: use RngStream instead of RandomState.
 HWY_INLINE double RandomGaussian(hwy::RandomState& rng) {
  uint64_t sum = 0;
  constexpr int kReps = 40;