diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d2a7a0..9efce80 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,6 +40,7 @@ set(SOURCES
   compression/nuq-inl.h
   compression/sfp.h
   compression/sfp-inl.h
+  compression/test_util.h
   util/app.h
   util/args.h
 )
diff --git a/compression/BUILD b/compression/BUILD
index cfbeb99..5b525a5 100644
--- a/compression/BUILD
+++ b/compression/BUILD
@@ -24,20 +24,36 @@ cc_library(
     ],
 )
 
+# Deprecated because it is also implemented in Highway; will be removed once
+# that Highway version is sufficiently widespread.
 cc_library(
     name = "stats",
-    srcs = [
-        "stats.cc",
-    ],
-    hdrs = [
-        "distortion.h",
-        "stats.h",
-    ],
+    srcs = ["stats.cc"],
+    hdrs = ["stats.h"],
     deps = [
         "@hwy//:hwy",
     ],
 )
 
+cc_library(
+    name = "distortion",
+    hdrs = ["distortion.h"],
+    deps = [
+        "@hwy//:hwy",
+    ],
+)
+
+cc_library(
+    name = "test_util",
+    hdrs = ["test_util.h"],
+    deps = [
+        ":distortion",
+        ":stats",
+        "@hwy//:hwy",
+        "@hwy//:hwy_test_util",
+    ],
+)
+
 cc_library(
     name = "sfp",
     hdrs = [
@@ -62,12 +78,11 @@ cc_test(
     tags = ["hwy_ops_test"],
     deps = [
         ":sfp",
-        ":stats",
+        ":test_util",
         "@googletest//:gtest_main",
         "@hwy//:hwy",
         "@hwy//:hwy_test_util",
         "@hwy//:nanobenchmark",
-        "@hwy//:thread_pool",
     ],
 )
 
@@ -98,7 +113,7 @@ cc_test(
     deps = [
         ":nuq",
         ":sfp",
-        ":stats",
+        ":test_util",
         "@googletest//:gtest_main",
         "@hwy//:hwy",
         "@hwy//:hwy_test_util",
@@ -118,6 +133,7 @@ cc_library(
     ],
     deps = [
         ":blob_store",
+        ":distortion",
         ":nuq",
         ":sfp",
         ":stats",
@@ -134,6 +150,7 @@ cc_library(
         "analyze.h",
     ],
     deps = [
+        ":distortion",
         ":nuq",
         ":sfp",
         ":stats",
diff --git a/compression/nuq-inl.h b/compression/nuq-inl.h
index e7d85a7..1c8bdf1 100644
--- a/compression/nuq-inl.h
+++ b/compression/nuq-inl.h
@@ -42,6 +42,10 @@
 #include "hwy/contrib/sort/vqsort-inl.h"
 #include "hwy/highway.h"
 
+#ifndef HWY_IF_CONSTEXPR
+#define HWY_IF_CONSTEXPR if
+#endif
+
 HWY_BEFORE_NAMESPACE();
 namespace gcpp {
 namespace HWY_NAMESPACE {
@@ -124,7 +128,7 @@ class NuqClustering {
   }
 
  private:
-  // Float has enough precision for our relatively small kGroupSize (128).
+  // Float has enough precision for our relatively small kGroupSize (256).
   float cumsum_[kGroupSize + 1];
   float cumsum2_[kGroupSize + 1];
   float inv_len_[kGroupSize + 1];
@@ -168,8 +172,8 @@ class NuqClustering {
   // `centers`; prior centers are zero-initialized.
   //
   // O(kClusters * kGroupSize * kGroupSize), but the constant factors are so low
-  // that this is about 10 times as fast as the O(kClusters * kGroupSize) SMAWK
-  // as implemented in FAISS, for our kGroupSize <= 128.
+  // that this is about 5 times as fast as the O(kClusters * kGroupSize) SMAWK
+  // as implemented in FAISS, for our kGroupSize of 256.
   template <class DF>
   static HWY_NOINLINE size_t ClusterExactL2(DF df, const float* x,
                                             ClusterBuf& buf,
@@ -228,7 +232,7 @@ class NuqClustering {
       // Center = mean, O(1) thanks to cumulative sums.
       const float sum = cc.SumOfSorted(start, last);
       const int size = static_cast<int>(last) - static_cast<int>(start) + 1;
-      HWY_DASSERT(0 < size && size <= kGroupSize);
+      HWY_DASSERT(0 < size && size <= static_cast<int>(kGroupSize));
      centers[k] = sum / static_cast<float>(size);
 
       // We know the range inside sorted_and_i[]; translate to original indices,
@@ -427,7 +431,7 @@ class NuqCodec {
     // instead of TableLookupBytes, which requires extra interleaving of lo/hi.
     HWY_DASSERT(hn::Lanes(du) >= 8);
 
-    if (NumTables(du) == 2) {
+    HWY_IF_CONSTEXPR(NumTables(du) == 2) {
       // Reduce cap for second half to avoid loading past the end of the table.
      const hn::CappedTag<uint16_t, kClusters / 2> d_table2;
       *tbl1 = hn::ResizeBitCast(du, hn::LoadU(d_table2, table + kClusters / 2));
@@ -449,11 +453,12 @@ class NuqCodec {
     const auto indices0 = hn::IndicesFromVec(du, idx0);
     const auto indices1 = hn::IndicesFromVec(du, idx1);
 
-    if (NumTables(du) == 1) {
+    HWY_IF_CONSTEXPR(NumTables(du) == 1) {
       (void)tbl1;
       c0 = hn::TableLookupLanes(tbl0, indices0);
       c1 = hn::TableLookupLanes(tbl0, indices1);
-    } else {
+    }
+    HWY_IF_CONSTEXPR(NumTables(du) == 2) {  // `else` is formatted poorly here.
       c0 = hn::TwoTablesLookupLanes(du, tbl0, tbl1, indices0);
       c1 = hn::TwoTablesLookupLanes(du, tbl0, tbl1, indices1);
     }
@@ -521,8 +526,8 @@ class NuqCodec {
   // Decodes `num` values from the stream `in`, starting at the offset `in_ofs`
   // (in units of values), to bf16 in `out`. `in_capacity`, `in_ofs` and `num`
   // must all be multiples of `kGroupSize`.
-  template <class DF>
-  static HWY_INLINE void Dec(DF dbf, const size_t in_capacity,
+  template <class DBF>
+  static HWY_INLINE void Dec(DBF dbf, const size_t in_capacity,
                              const NuqStream* const in, const size_t in_ofs,
                              hwy::bfloat16_t* const out, const size_t num) {
     const hn::RebindToUnsigned<decltype(dbf)> d16;
diff --git a/compression/nuq_test.cc b/compression/nuq_test.cc
index d679376..7045ba4 100644
--- a/compression/nuq_test.cc
+++ b/compression/nuq_test.cc
@@ -27,6 +27,7 @@
 
 #include "hwy/aligned_allocator.h"
 #include "hwy/base.h"
+#include "hwy/timer.h"
 
 // clang-format off
 #undef HWY_TARGET_INCLUDE
@@ -35,15 +36,14 @@
 #include "hwy/foreach_target.h"  // IWYU pragma: keep
 // Other headers that include Highway must come after foreach_target.h
 // copybara:import_next_line:gemma_cpp
-#include "compression/distortion.h"
-// copybara:import_next_line:gemma_cpp
 #include "compression/nuq-inl.h"
 // copybara:import_next_line:gemma_cpp
 #include "compression/nuq.h"
+// copybara:import_next_line:gemma_cpp
+#include "compression/test_util.h"
 #include "hwy/highway.h"
 #include "hwy/tests/hwy_gtest.h"
 #include "hwy/tests/test_util-inl.h"
-#include "hwy/timer.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace gcpp {
 namespace HWY_NAMESPACE {
@@ -181,12 +181,14 @@ struct TestNormal {
     auto in = hwy::AllocateAligned<float>(kGroupSize);
     HWY_ASSERT(in);
 
-    std::mt19937 rng(123);
-    std::normal_distribution<float> dist{0.001f, 0.3f};
+    hwy::RandomState rng;
+    Stats in_stats;
     for (size_t i = 0; i < kGroupSize; ++i) {
-      in[i] = dist(rng);
+      const double r = RandomGaussian(rng);
+      in_stats.Notify(r);
+      in[i] = hwy::ConvertScalarTo<float>(r);
     }
-    std::shuffle(in.get(), in.get() + kGroupSize, rng);
+    VerifyGaussian(in_stats);
 
     ClusterBuf buf;
     float centers[kClusters];
@@ -212,9 +214,9 @@ struct TestNormal {
     const float snr = stats.GeomeanValueDivL1();
     fprintf(stderr, "p-norm %.3E snr %.2f @%zu = %.4E\n", pnorm, snr,
             stats.MaxIndex(), stats.MaxL1());
-    static_assert(kGroupSize == 128 || kGroupSize == 256, "Update expected");
-    const float expected_pnorm = kGroupSize == 128 ? 3E-2f : 3.4E-2f;
-    const float expected_snr = kGroupSize == 128 ? 17.4f : 13.1f;
+    static_assert(kGroupSize == 256, "Update expected");
+    const float expected_pnorm = 3.68E-2f;
+    const float expected_snr = 12.7f;
     HWY_ASSERT(expected_pnorm <= pnorm && pnorm < 1.02f * expected_pnorm);
     HWY_ASSERT(expected_snr <= snr && snr < 1.01f * expected_snr);
   }
@@ -345,21 +347,27 @@ struct TestDot {
     auto nuq = hwy::AllocateAligned<NuqStream>(NuqStream::PackedEnd(num));
     HWY_ASSERT(in && dec && vec && nuq);
 
-    std::mt19937 rng(123);
-    std::normal_distribution<float> dist{0.001f, 0.3f};
+    // Generate inputs and verify their distribution.
+    hwy::RandomState rng;
+    Stats in_stats;
     for (size_t i = 0; i < num; ++i) {
-      in[i] = dist(rng);
-      vec[i] = hwy::ConvertScalarTo<T>(dist(rng));
+      const float r = static_cast<float>(RandomGaussian(rng));
+      in_stats.Notify(r);
+      in[i] = r;
     }
-    // This changes the correlation between in and vec, which considerably
-    // affects the error of the result.
-    std::shuffle(in.get(), in.get() + num, rng);
+    for (size_t i = 0; i < num; ++i) {
+      const float r = static_cast<float>(RandomGaussian(rng));
+      in_stats.Notify(r);
+      vec[i] = hwy::ConvertScalarTo<T>(r);
+    }
+    VerifyGaussian(in_stats);
 
     ClusterBuf buf;
     const size_t unused_clusters =
         NuqCodec::Enc(df, in.get(), num, buf, num, nuq.get(), 0);
     HWY_ASSERT(unused_clusters == 0);
 
+    // Compute dot product without decompression.
     double actual = 0.0;
     double elapsed = hwy::HighestValue<double>();
     for (size_t rep = 0; rep < 20; ++rep) {
@@ -380,24 +388,39 @@ struct TestDot {
     fprintf(stderr, "Vec %zu Dec %.2f MB/s\n", Lanes(d) * sizeof(T),
             num * sizeof(in[0]) * 1E-6 / elapsed);
 
-    double expected = 0.0;   // using original input
-    double expected2 = 0.0;  // using decoded NUQ
+    // Exact and decompressed dot products for comparison.
+    double exact = 0.0;     // using original input
+    double expected = 0.0;  // using decoded NUQ
+    DistortionStats dec_stats;
+    Stats ratios;
     for (size_t i = 0; i < num; ++i) {
-      expected += in[i] * hwy::ConvertScalarTo<float>(vec[i]);
-      expected2 += dec[i] * hwy::ConvertScalarTo<float>(vec[i]);
+      dec_stats.Notify(in[i], dec[i]);
+      const float v1 = hwy::ConvertScalarTo<float>(vec[i]);
+      exact += in[i] * v1;
+      expected += dec[i] * v1;
+      if (expected != 0.0f) {
+        ratios.Notify(exact / expected);
+      }
     }
 
-    const double l1 = hwy::ScalarAbs(expected - actual);
-    const double snr = 1.0 + hwy::ScalarAbs(expected) / l1;
-    fprintf(stderr, "expected %.3f e2 %.4f actual %.4f l1 %E snr %.2f\n",
-            expected, expected2, actual, l1, snr);
-    HWY_ASSERT(hwy::ScalarAbs(expected2 - actual) < 1E-4);
-    static_assert(kGroupSize == 128 || kGroupSize == 256, "Update expected");
-    const double expected_l1 = kGroupSize == 128 ? 7.3E-2 : 4.34E-2;
-    const double expected_snr = kGroupSize == 128 ? 9.7f
-                                : sizeof(T) == 2  ? 14.5f
-                                                  : 14.9f;
-    HWY_ASSERT(expected_l1 <= l1 && l1 < 1.02f * expected_l1);
-    HWY_ASSERT(expected_snr <= snr && snr < 1.01f * expected_snr);
+    const double dec_snr = dec_stats.GeomeanValueDivL1();
+    const double dot_snr = 1.0 / hwy::ScalarAbs(1.0 - ratios.GeometricMean());
+    // exact and actual fluctuate due to the combination of NUQ imprecision
+    // and whether vec[i] is negative or positive, so this is quite loose.
+    const float final_ratio = HWY_MIN(exact / actual, actual / exact);
+    fprintf(stderr, "ratios %s\n", ratios.ToString().c_str());
+    fprintf(stderr,
+            "exact %.3f e2 %.4f actual %.4f final_ratio %.3f dec_snr %.2f "
+            "dot_snr %.2f\n",
+            exact, expected, actual, final_ratio, dec_snr, dot_snr);
+    // Final values are not too far apart.
+    HWY_ASSERT(0.88f <= final_ratio && final_ratio <= 1.0f);
+    // Decompressed and uncompressed dot should match exactly.
+    HWY_ASSERT(hwy::ScalarAbs(expected - actual) < 1E-4f);
+    // dec[] is close to in[], but we already check that in TestStream.
+    HWY_ASSERT(dec_snr >= 13.0);
+    // Geomean of the per-i ratios is an approximation of the actual SNR.
+    HWY_ASSERT(dot_snr >= (sizeof(T) == 2 ? 17.0 : 14.0));
+    static_assert(kGroupSize == 256, "Update expected*");
   }
 };
diff --git a/compression/sfp-inl.h b/compression/sfp-inl.h
index 62b8955..77f7ede 100644
--- a/compression/sfp-inl.h
+++ b/compression/sfp-inl.h
@@ -449,6 +449,18 @@ class SfpCodec {
     return Enc2U(d16, w0, w1);
   }
 
+  // Truncates two f32 to bf16, in lane order, without rounding (see Enc4F).
+  template <class DBF, class DF = hn::Repartition<float, DBF>>
+  static HWY_INLINE hn::Vec<DBF> Truncate2To(DBF dbf, hn::Vec<DF> f0,
+                                             hn::Vec<DF> f1) {
+    const hn::RebindToUnsigned<DBF> d16;
+    using V16 = hn::Vec<decltype(d16)>;
+    const V16 u0 = BitCast(d16, f0);
+    const V16 u1 = BitCast(d16, f1);
+    return BitCast(DBF(), HWY_IS_LITTLE_ENDIAN ? ConcatOdd(d16, u1, u0) : ConcatEven(d16, u1, u0));
+  }
+
   template <class DF, class V8 = hn::Vec<hn::Repartition<uint8_t, DF>>>
   static HWY_INLINE V8 Enc4F(DF df, const float* HWY_RESTRICT in) {
@@ -462,9 +474,10 @@
     const VF f1 = hn::LoadU(df, in + NF * 1);
     const VF f2 = hn::LoadU(df, in + NF * 2);
     const VF f3 = hn::LoadU(df, in + NF * 3);
-    // Chop off the lower 16 bits; EncBytes still rounds properly.
-    const V16 w0 = hn::BitCast(d16, hn::OrderedDemote2To(dbf, f0, f1));
-    const V16 w1 = hn::BitCast(d16, hn::OrderedDemote2To(dbf, f2, f3));
+    // Chop off the lower 16 bits instead of OrderedDemote2To, which rounds to
+    // the nearest bf16, because EncBytes will round again.
+    const V16 w0 = hn::BitCast(d16, Truncate2To(dbf, f0, f1));
+    const V16 w1 = hn::BitCast(d16, Truncate2To(dbf, f2, f3));
     return Enc2U(d16, w0, w1);
   }
 
diff --git a/compression/sfp_test.cc b/compression/sfp_test.cc
index b51505f..b362728 100644
--- a/compression/sfp_test.cc
+++ b/compression/sfp_test.cc
@@ -25,12 +25,11 @@
 #include <stddef.h>
 #include <stdio.h>
 
-#include <algorithm>
-#include <random>
 #include <cmath>
 
 #include "hwy/aligned_allocator.h"
 #include "hwy/base.h"
+#include "hwy/timer.h"
 
 // clang-format off
@@ -39,13 +38,12 @@
 #include "hwy/foreach_target.h"  // IWYU pragma: keep
 // Any highway.h must come after foreach_target.h
 // copybara:import_next_line:gemma_cpp
-#include "compression/distortion.h"
-// copybara:import_next_line:gemma_cpp
 #include "compression/sfp-inl.h"
+// copybara:import_next_line:gemma_cpp
+#include "compression/test_util.h"
 #include "hwy/highway.h"
 #include "hwy/tests/hwy_gtest.h"
 #include "hwy/tests/test_util-inl.h"
-#include "hwy/timer.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace gcpp {
@@ -358,25 +356,31 @@ struct TestDot {
   template <typename T, class D>
   HWY_INLINE void operator()(T /*unused*/, D d) {
     const hn::Repartition<float, D> df;
-    const size_t num = 384;
+    const size_t num = 1024;  // not too many, to avoid GeometricMean overflow.
     auto in = hwy::AllocateAligned<T>(num);
     auto dec = hwy::AllocateAligned<T>(num);
     auto vec = hwy::AllocateAligned<T>(num);
     auto sfp = hwy::AllocateAligned<SfpStream>(num);
     HWY_ASSERT(in && dec && vec && sfp);
 
-    std::mt19937 rng(123);
-    std::normal_distribution<float> dist{0.001f, 0.3f};
+    // Generate inputs and verify their distribution.
+    hwy::RandomState rng;
+    Stats in_stats;
     for (size_t i = 0; i < num; ++i) {
-      in[i] = hwy::ConvertScalarTo<T>(dist(rng));
-      vec[i] = hwy::ConvertScalarTo<T>(dist(rng));
+      const float r = static_cast<float>(RandomGaussian(rng));
+      in_stats.Notify(r);
+      in[i] = hwy::ConvertScalarTo<T>(r);
     }
-    // This changes the correlation between in and vec, which considerably
-    // affects the error of the result.
-    std::shuffle(in.get(), in.get() + num, rng);
+    for (size_t i = 0; i < num; ++i) {
+      const float r = static_cast<float>(RandomGaussian(rng));
+      in_stats.Notify(r);
+      vec[i] = hwy::ConvertScalarTo<T>(r);
+    }
+    VerifyGaussian(in_stats);
 
     SfpCodec::Enc(d, in.get(), num, sfp.get());
 
+    // Compute dot product without decompression.
    double actual = 0.0;
     double elapsed = hwy::HighestValue<double>();
     for (size_t rep = 0; rep < 200; ++rep) {
@@ -393,26 +397,44 @@ struct TestDot {
     }
 
     SfpCodec::Dec(d, sfp.get(), num, dec.get());
 
-    fprintf(stderr, "Vec %zu Dot %.2f MB/s\n", Lanes(d) * sizeof(T),
-            num * sizeof(T) * 1E-6 / elapsed);
+    fprintf(stderr, "Vec %zu Dot %zu-bit %.2f MB/s\n", Lanes(d) * sizeof(T),
+            sizeof(T) * 8, num * sizeof(T) * 1E-6 / elapsed);
 
-    double expected = 0.0;   // using original input
-    double expected2 = 0.0;  // using decoded SFP
+    // Exact and decompressed dot products for comparison.
+    float exact = 0.0f;     // using original input
+    float expected = 0.0f;  // using decoded SFP
+    DistortionStats dec_stats;
+    Stats ratios;
     for (size_t i = 0; i < num; ++i) {
-      expected += hwy::ConvertScalarTo<float>(in[i]) *
-                  hwy::ConvertScalarTo<float>(vec[i]);
-      expected2 += hwy::ConvertScalarTo<float>(dec[i]) *
-                   hwy::ConvertScalarTo<float>(vec[i]);
+      const float in1 = hwy::ConvertScalarTo<float>(in[i]);
+      const float dec1 = hwy::ConvertScalarTo<float>(dec[i]);
+      const float vec1 = hwy::ConvertScalarTo<float>(vec[i]);
+      dec_stats.Notify(in1, dec1);
+
+      exact += in1 * vec1;
+      expected += dec1 * vec1;
+      if (expected != 0.0f) {
+        ratios.Notify(exact / expected);
+      }
     }
 
-    const double l1 = hwy::ScalarAbs(expected - actual);
-    const double snr = 1.0 + hwy::ScalarAbs(expected) / l1;
-    fprintf(stderr, "expected %.3f e2 %.4f actual %.4f l1 %E snr %.2f\n",
-            expected, expected2, actual, l1, snr);
-    HWY_ASSERT(hwy::ScalarAbs(expected2 - actual) < 1E-4);
-    const double expected_l1 = sizeof(T) == 2 ? 1.52E-2 : 1.15E-2;
-    const double expected_snr = sizeof(T) == 2 ? 80.1f : 104.9f;
-    HWY_ASSERT(expected_l1 <= l1 && l1 < 1.02f * expected_l1);
-    HWY_ASSERT(expected_snr <= snr && snr < 1.01f * expected_snr);
+    const double dec_snr = dec_stats.GeomeanValueDivL1();
+    const double dot_snr = 1.0 / hwy::ScalarAbs(1.0 - ratios.GeometricMean());
+    // exact and actual fluctuate due to the combination of SFP imprecision
+    // and whether vec[i] is negative or positive, so this is quite loose.
+    const float final_ratio = HWY_MIN(exact / actual, actual / exact);
+    fprintf(stderr, "ratios %s\n", ratios.ToString().c_str());
+    fprintf(stderr,
+            "exact %.3f e2 %.4f actual %.4f final_ratio %.3f dec_snr %.2f "
+            "dot_snr %.2f\n",
+            exact, expected, actual, final_ratio, dec_snr, dot_snr);
+    // Final values are not too far apart.
+    HWY_ASSERT(0.87f <= final_ratio && final_ratio <= 1.0f);
+    // Decompressed and uncompressed dot should match exactly.
+    HWY_ASSERT(hwy::ScalarAbs(expected - actual) < 1E-4f);
+    // dec[] is close to in[], but we already check that in TestEncDec.
+    HWY_ASSERT(dec_snr >= 50.0);
+    // Geomean of the per-i ratios should be very close to one.
+    HWY_ASSERT(dot_snr >= (sizeof(T) == 2 ? 70.0 : 1000.0));
   }
 };
diff --git a/compression/test_util.h b/compression/test_util.h
new file mode 100644
index 0000000..b1e4026
--- /dev/null
+++ b/compression/test_util.h
@@ -0,0 +1,64 @@
+// Copyright 2023 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_H_
+#define THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <cmath>
+
+#include "hwy/base.h"
+
+// IWYU pragma: begin_exports
+// copybara:import_next_line:gemma_cpp
+#include "compression/distortion.h"
+// copybara:import_next_line:gemma_cpp
+#include "compression/stats.h"
+#include "hwy/tests/test_util.h"  // RandomState
+// IWYU pragma: end_exports
+
+namespace gcpp {
+
+// Returns a random Gaussian (mean=0, stddev=1/3, similar to expected weights)
+// via the central limit theorem. Avoids std::normal_distribution so that
+// output is consistent across platforms.
+HWY_INLINE double RandomGaussian(hwy::RandomState& rng) {
+  uint64_t sum = 0;
+  constexpr int kReps = 40;
+  for (int rep = 0; rep < kReps; ++rep) {
+    sum += hwy::Random32(&rng) & 0xFFFFF;
+  }
+  const double sum_f =
+      static_cast<double>(sum) / static_cast<double>(0xFFFFF * kReps);
+  HWY_ASSERT(0.0 <= sum_f && sum_f <= 1.0);
+  const double plus_minus_1 = 2.0 * sum_f - 1.0;
+  HWY_ASSERT(-1.0 <= plus_minus_1 && plus_minus_1 <= 1.0);
+  // Normalize by the stddev of the sum of kReps uniforms scaled to [-1, 1].
+  return plus_minus_1 * std::sqrt(kReps / 3.0);
+}
+
+HWY_INLINE void VerifyGaussian(Stats& stats) {
+  const double stddev = stats.StandardDeviation();
+  HWY_ASSERT(-0.01 <= stats.Mean() && stats.Mean() <= 0.01);
+  HWY_ASSERT(0.30 <= stddev && stddev <= 0.35);
+  HWY_ASSERT(-1.1 <= stats.Min() && stats.Min() <= -0.9);
+  HWY_ASSERT(0.9 <= stats.Max() && stats.Max() <= 1.1);
+}
+
+}  // namespace gcpp
+
+#endif  // THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_H_
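
Note on the RandomGaussian() scaling in test_util.h: plus_minus_1 is the mean of kReps uniforms on [-1, 1], whose stddev is 1 / sqrt(3 * kReps), so multiplying by sqrt(kReps / 3) gives a result with stddev exactly 1/3, the range VerifyGaussian() asserts. The standalone sketch below reproduces that scaling; it is not part of the patch and uses std::mt19937 instead of hwy::RandomState purely so it is self-contained.

// Standalone sanity check of the central-limit-theorem scaling used by
// RandomGaussian(); expects mean ~0 and stddev ~1/3.
#include <cmath>
#include <cstdio>
#include <random>

int main() {
  std::mt19937 rng(123);
  std::uniform_real_distribution<double> uniform(0.0, 1.0);
  constexpr int kReps = 40;       // uniforms averaged per output sample
  constexpr int kSamples = 100000;
  double sum = 0.0, sum2 = 0.0;
  for (int s = 0; s < kSamples; ++s) {
    double mean01 = 0.0;
    for (int r = 0; r < kReps; ++r) mean01 += uniform(rng);
    mean01 /= kReps;                        // mean of kReps U[0,1]
    const double pm1 = 2.0 * mean01 - 1.0;  // mean of kReps U[-1,1]
    const double g = pm1 * std::sqrt(kReps / 3.0);  // stddev becomes 1/3
    sum += g;
    sum2 += g * g;
  }
  const double mean = sum / kSamples;
  const double stddev = std::sqrt(sum2 / kSamples - mean * mean);
  std::printf("mean %.4f stddev %.4f (expect ~0.0 and ~0.333)\n", mean, stddev);
  return 0;
}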
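Similarly, the Truncate2To() helper added to sfp-inl.h relies on bf16 sharing the sign, exponent and top 7 mantissa bits of binary32, so keeping only the upper 16 bits of each f32 truncates rather than rounds; per the updated Enc4F comment this is acceptable because EncBytes rounds again afterwards. Below is a scalar sketch of the same bit manipulation; TruncateToBf16Bits is a hypothetical name for illustration only, not code from the patch.

// Scalar illustration of f32 -> bf16 truncation (vs. round-to-nearest).
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint16_t TruncateToBf16Bits(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);  // keep sign, exponent, top 7 mantissa bits
}

int main() {
  const float x = 1.005859375f;  // 1 + 3*2^-9, f32 bit pattern 0x3F80C000
  // Truncation keeps 0x3F80 (1.0); round-to-nearest would give 0x3F81
  // (1.0078125) because the discarded low bits 0xC000 exceed the halfway point.
  std::printf("truncated bf16 bits: 0x%04X\n",
              static_cast<unsigned>(TruncateToBf16Bits(x)));
  return 0;
}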