This commit is contained in:
copybara-service[bot] 2025-11-27 08:04:31 +00:00 committed by GitHub
commit 3bc8da8d7b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 76 additions and 37 deletions

View File

@ -171,6 +171,7 @@ cc_library(
name = "test_util",
hdrs = ["util/test_util.h"],
deps = [
":basics",
"@highway//:hwy",
"@highway//:hwy_test_util",
"@highway//:stats",
@ -575,6 +576,7 @@ cc_library(
":mat",
":matmul",
":matmul_env",
":test_util",
":model_store",
":ops",
":threading",

View File

@ -135,8 +135,8 @@ cc_test(
# for test_suite.
tags = ["hwy_ops_test"],
deps = [
":compress",
":distortion",
":sfp",
"@googletest//:gtest_main", # buildcleaner: keep
"//:test_util",
"@highway//:hwy",
@ -182,7 +182,6 @@ cc_library(
"//:mat",
"//:threading_context",
"@highway//:hwy",
"@highway//:nanobenchmark",
"@highway//:profiler",
"@highway//:stats",
"@highway//:thread_pool",

View File

@ -82,6 +82,8 @@ struct CompressTraits<float> {
hn::StoreU(raw1, df, packed.ptr + packed_ofs + NF);
}
// Converts a single packed element to float one value at a time; `x` is
// returned as-is (converted to the float return type).
// NOTE(review): "Slow" presumably means scalar as opposed to the vector
// Load2/Store paths nearby - confirm.
static float ToFloatSlow(const Packed x) { return x; }
template <class DBF16, HWY_IF_BF16_D(DBF16), class VBF16 = hn::Vec<DBF16>>
static HWY_INLINE void Load2(DBF16 dbf16,
const PackedSpan<const Packed>& packed,
@ -254,6 +256,10 @@ struct CompressTraits<BF16> {
packed.ptr + packed_ofs);
}
// Converts a single packed element to float via Highway's scalar conversion
// helper, one value per call.
static float ToFloatSlow(const Packed x) {
  const float widened = hwy::ConvertScalarTo<float>(x);
  return widened;
}
template <class DBF16, HWY_IF_BF16_D(DBF16)>
static HWY_INLINE void Load2(DBF16 dbf16,
const PackedSpan<const Packed>& packed,
@ -397,6 +403,27 @@ struct CompressTraits<SfpStream> {
}
}
// NOTE: this does not take into account the per-tensor scale.
// Decodes one SFP byte to float by assembling the sign, exponent and mantissa
// fields of a binary32 bit pattern. The encoding 0x80 (-0) is reserved and
// triggers an assertion failure; encoding 0 decodes to 0.0f.
static float ToFloatSlow(const Packed x) {
  const uint32_t sfp = x.byte;
  HWY_ASSERT(sfp != 0x80);  // -0 is reserved

  // Everything except the top (sign) bit.
  const uint32_t magnitude = sfp & 0x7F;
  if (magnitude == 0) return 0.0f;

  // Magnitudes >= 64 use 3 mantissa bits and exponent bias 15; smaller ones
  // use 2 mantissa bits and exponent bias 23.
  const bool large_e = magnitude >= 64;
  const size_t m_bits = large_e ? 3 : 2;
  const uint32_t e_bias = large_e ? 15 : 23;

  const uint32_t mantissa = magnitude & ((1u << m_bits) - 1u);
  const size_t exponent = magnitude >> m_bits;

  // Move each field to its binary32 position: sign in bit 31, biased exponent
  // starting at bit 23, mantissa left-aligned within the 23 mantissa bits.
  const uint32_t sign32 = (sfp & 0x80) << 24;
  const uint32_t exp32 = static_cast<uint32_t>(127 + exponent - e_bias) << 23;
  const uint32_t mnt32 = mantissa << (23 - m_bits);
  const uint32_t binary32 = sign32 | exp32 | mnt32;

  float result;
  hwy::CopySameSize(&binary32, &result);
  return result;
}
template <class D> // Caller checks this is f32 or bf16
static HWY_INLINE void Load2(D d, const PackedSpan<const Packed>& packed,
const size_t packed_ofs, hn::Vec<D>& raw0,
@ -437,6 +464,12 @@ struct CompressTraits<I8Stream> {
IntCodec::Dec2(d, packed, packed_ofs, raw0, raw1);
}
// Single-element conversion is not supported for this packed type (per the
// assert message, decoding requires a stream). Always returns 0.0f; `x` is
// not read.
static float ToFloatSlow(const Packed x) {
// Flags any call as a bug. NOTE(review): HWY_DASSERT is presumably active only
// in debug builds, in which case other builds silently return 0.0f - confirm.
HWY_DASSERT(!"Not supported - requires a stream");
return 0.0f;
}
// Store2 is not yet implemented.
template <class D, typename Raw>
static HWY_INLINE void DecompressAndZeroPad(
D d, const PackedSpan<const Packed>& packed, const size_t packed_ofs,
@ -483,6 +516,10 @@ struct CompressTraits<NuqStream> {
NuqCodec::Dec2(d, packed, packed_ofs, raw0, raw1);
}
// Single-element conversion is not supported for this packed type (per the
// assert message, decoding requires a stream). Always returns 0.0f; `x` is
// not read.
static float ToFloatSlow(const Packed x) {
// Flags any call as a bug. NOTE(review): HWY_DASSERT is presumably active only
// in debug builds, in which case other builds silently return 0.0f - confirm.
HWY_DASSERT(!"Not supported - requires a stream");
return 0.0f;
}
// Store2 is not yet implemented.
template <class D, typename Raw>

View File

@ -26,7 +26,6 @@ cc_library(
"//io",
"//io:blob_store",
"@highway//:hwy",
"@highway//:thread_pool",
],
)

View File

@ -37,37 +37,23 @@
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
// After highway.h
#include "compression/sfp-inl.h"
#include "compression/compress-inl.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace gcpp {
namespace HWY_NAMESPACE {
// Decode
float F32FromSFP8(uint32_t sfp) {
HWY_ASSERT(sfp < 256);
HWY_ASSERT(sfp != 0x80); // -0 is reserved
HWY_INLINE_VAR constexpr bool kPrint = false;
const uint32_t sign32 = (sfp & 0x80) << 24;
sfp &= 0x7F;
const bool large_e = sfp >= 64;
const size_t m_bits = large_e ? 3 : 2;
uint32_t m = sfp & ((1u << m_bits) - 1u);
size_t e = sfp >> m_bits;
if (sfp == 0) return 0.0f;
const uint32_t e_bias = large_e ? 15 : 23;
const uint32_t exp32 = static_cast<uint32_t>(127 + e - e_bias) << 23;
const uint32_t mnt32 = m << (23 - m_bits);
const uint32_t binary32 = sign32 | exp32 | mnt32;
float result;
hwy::CopySameSize(&binary32, &result);
return result;
static float F32FromSFP8(uint32_t sfp) {
return CompressTraits<SfpStream>::ToFloatSlow(
SfpStream{static_cast<uint8_t>(sfp)});
}
// Used for HWY_AVX3_DL and newer.
void PrintTables() {
if (HWY_ONCE && false) {
if (HWY_ONCE && kPrint) {
uint8_t hi[128];
fprintf(stderr, "lo\n");
for (uint32_t sfp = 0; sfp < 128; ++sfp) {
@ -92,7 +78,7 @@ void TestAllUnique() {
unique.insert(F32FromSFP8(sfp));
}
HWY_ASSERT_EQ(size_t{255}, unique.size());
if (false) {
if (kPrint) {
for (float f : unique) {
fprintf(stderr, "%e\n", f);
}
@ -163,7 +149,7 @@ HWY_INLINE uint32_t SFP8FromF32(float f) {
if (m == 0) m = 1;
}
if (false) {
if (kPrint) {
fprintf(stderr, "in %x round %x rounded %x e %d m %x large_e %d\n",
org_binary32, round, rounded, e, m, large_e);
}

View File

@ -891,18 +891,6 @@ class DotStats {
hwy::Stats s_times[kVariants];
};
// Returns normalized value in [-1, 1).
// Draws one value from `rng` and uses only its low mantissa bits.
float RandomFloat(RngStream& rng) {
  // The bit pattern of 1.0f supplies sign and exponent; random bits fill the
  // mantissa, which yields a float in [1, 2).
  const uint32_t one_bits = hwy::BitCastScalar<uint32_t>(1.0f);
  const uint32_t mantissa_mask = hwy::MantissaMask<float>();
  const uint32_t random_mantissa = rng() & mantissa_mask;
  const float f12 = hwy::BitCastScalar<float>(one_bits | random_mantissa);
  HWY_DASSERT(1.0f <= f12 && f12 < 2.0f);  // exponent is 2^0, only mantissa

  // Shift to [0, 1), stretch to [0, 2), then center on zero.
  const float f = (2.0f * (f12 - 1.0f)) - 1.0f;
  HWY_DASSERT(-1.0f <= f && f < 1.0f);
  return f;
}
// `raw` holds the decompressed values, so that the test measures only the
// error from the Dot algorithms, not the compression.
template <typename Packed>

View File

@ -19,8 +19,10 @@
#include <stddef.h>
#include <stdint.h>
#include <algorithm> // std::sort
#include <cmath>
#include "util/basics.h" // RngStream
#include "hwy/base.h"
// IWYU pragma: begin_exports
@ -30,9 +32,35 @@
namespace gcpp {
// Excludes outliers; we might not have enough samples for a reliable mode.
//
// Sorts `seconds[0, num)` in place (ascending) and returns the mean of the
// elements at sorted indices [num / 4, num / 2), i.e. the second quartile.
// With a single sample that range would be empty, so the sample itself is
// returned instead of 0/0 = NaN.
//
// `num` must be nonzero (checked via HWY_DASSERT before `seconds` is touched).
// If that check is compiled out, the result for `num == 0` is NaN, as before.
HWY_INLINE double TrimmedMean(double* seconds, size_t num) {
  HWY_DASSERT(num != 0);
  std::sort(seconds, seconds + num);

  const size_t begin = num / 4;
  // For num >= 2, num / 2 > num / 4, so the range is non-empty. Only num == 1
  // needs widening to include its sole element.
  const size_t end = (num == 1) ? 1 : num / 2;

  double sum = 0.0;
  for (size_t i = begin; i < end; ++i) {
    sum += seconds[i];
  }
  return sum / static_cast<double>(end - begin);
}
// Returns normalized value in [-1, 1).
// Draws one value from `rng` and uses only its low mantissa bits.
HWY_INLINE float RandomFloat(RngStream& rng) {
  // Start from the bits of 1.0f (sign and exponent) and fill the mantissa
  // with random bits; the result lies in [1, 2).
  const uint32_t one_bits = hwy::BitCastScalar<uint32_t>(1.0f);
  const uint32_t mantissa_mask = hwy::MantissaMask<float>();
  const uint32_t random_mantissa = rng() & mantissa_mask;
  const float f12 = hwy::BitCastScalar<float>(one_bits | random_mantissa);
  HWY_DASSERT(1.0f <= f12 && f12 < 2.0f);  // exponent is 2^0, only mantissa

  // Shift to [0, 1), stretch to [0, 2), then center on zero.
  const float f = (2.0f * (f12 - 1.0f)) - 1.0f;
  HWY_DASSERT(-1.0f <= f && f < 1.0f);
  return f;
}
// Returns random Gaussian (mean=0, stddev=1/3 similar to expected weights)
// using the central limit theorem. Avoid std::normal_distribution for
// consistent cross-platform output.
// TODO: use RngStream instead of RandomState.
HWY_INLINE double RandomGaussian(hwy::RandomState& rng) {
uint64_t sum = 0;
constexpr int kReps = 40;