From aeade052c600ae55dcd038b2802439f5947ff964 Mon Sep 17 00:00:00 2001
From: Jan Wassenberg <janwas@google.com>
Date: Wed, 7 Jan 2026 10:32:44 -0800
Subject: [PATCH] Move AssertClose to test_util, add U16

PiperOrigin-RevId: 853321311
---
 compression/test_util-inl.h | 124 ++++++++++++++++++++++++++++++++++++
 compression/types.h         |  20 +++++-
 ops/matmul_test.cc          | 120 +---------------------------------
 paligemma/BUILD.bazel       |   1 -
 4 files changed, 143 insertions(+), 122 deletions(-)

diff --git a/compression/test_util-inl.h b/compression/test_util-inl.h
index f2c8b8c..99b34b5 100644
--- a/compression/test_util-inl.h
+++ b/compression/test_util-inl.h
@@ -17,6 +17,10 @@
 #ifndef THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_INL_H_
 #define THIRD_PARTY_GEMMA_CPP_COMPRESSION_TEST_UTIL_INL_H_
 
+#include <stddef.h>
+
+#include <vector>
+
 // IWYU pragma: begin_exports
 #include "compression/distortion.h"
 #include "util/mat.h"
@@ -153,6 +157,126 @@ MatStorageT<MatT> GenerateTransposedMat(const Extents2D extents,
   return compressed;
 }
 
+// Returns 1-norm, used for estimating tolerable numerical differences.
+inline double MaxRowAbsSum(const MatStorageT<float>& a) {
+  double max_row_abs_sum = 0.0;
+  for (size_t r = 0; r < a.Rows(); r++) {
+    const float* row = a.Row(r);
+    double row_abs_sum = 0.0;
+    for (size_t c = 0; c < a.Cols(); c++) {
+      row_abs_sum += hwy::ScalarAbs(row[c]);
+    }
+    max_row_abs_sum = HWY_MAX(max_row_abs_sum, row_abs_sum);
+  }
+  return max_row_abs_sum;
+}
+
+// Returns the maximum absolute value of `a`.
+inline float MaxAbs(const MatStorageT<float>& a) {
+  float max_abs = 0.0f;
+  for (size_t c = 0; c < a.Cols(); c++) {
+    for (size_t r = 0; r < a.Rows(); r++) {
+      const float* row = a.Row(r);
+      max_abs = HWY_MAX(max_abs, hwy::ScalarAbs(row[c]));
+    }
+  }
+  return max_abs;
+}
+
+// B is already transposed.
+template <typename TA, typename TB, typename TC>
+void AssertClose(const MatPtrT<TA>& A, const MatPtrT<TB>& B,
+                 const MatPtrT<TC>& C_slow, const MatPtrT<TC>& C,
+                 const Allocator& allocator,
+                 std::vector<hwy::AlignedFreeUniquePtr<uint8_t*[]>>& row_ptrs,
+                 int line) {
+  const hn::ScalableTag<float> df;
+  const size_t cols = A.Cols();
+  const size_t B_rows = B.Rows();
+  // Round up for DecompressAndZeroPad.
+  MatStorageT<float> a_batch("a_batch", A.Extents(), allocator,
+                             MatPadding::kOdd);
+  MatStorageT<float> b_trans_batch("b_trans_batch", B.Extents(), allocator,
+                                   MatPadding::kOdd);
+  MatStorageT<float> c_batch("c_batch", Extents2D(A.Rows(), B_rows), allocator,
+                             MatPadding::kOdd);
+  c_batch.AllocateAndAttachRowPtrs(row_ptrs);
+  MatStorageT<float> c_slow_batch("c_slow_batch", Extents2D(A.Rows(), B_rows),
+                                  allocator, MatPadding::kOdd);
+  for (size_t m = 0; m < A.Rows(); ++m) {
+    DecompressAndZeroPad(df, MakeSpan(A.Row(m), cols), 0, a_batch.Row(m), cols);
+    DecompressAndZeroPad(df, MakeSpan(C.Row(m), B_rows), 0, c_batch.Row(m),
+                         B_rows);
+    DecompressAndZeroPad(df, MakeSpan(C_slow.Row(m), B_rows), 0,
+                         c_slow_batch.Row(m), B_rows);
+  }
+  for (size_t n = 0; n < B_rows; ++n) {
+    DecompressAndZeroPad(df, MakeSpan(B.Row(n), cols), 0, b_trans_batch.Row(n),
+                         cols);
+  }
+
+  // MatMul rounds inputs to BF16, so error is proportional to the max input
+  // magnitude, but also to f32 accumulation of rows in A and B.
+  const double norm = MaxRowAbsSum(a_batch) * MaxRowAbsSum(b_trans_batch);
+  const float max_abs = MaxAbs(a_batch) * MaxAbs(b_trans_batch);
+  const double eps_bf16 = hwy::ConvertScalarTo<double>(hwy::Epsilon<BF16>());
+  const double eps_f32 = hwy::ConvertScalarTo<double>(hwy::Epsilon<float>());
+  // Dot() uses double-precision summation.
+  double tolerance = 20 * norm * eps_f32;
+  // If either is F32, Dot() promotes F32 or even F64, but MatMul demotes the
+  // F32 to BF16, so add extra tolerance.
+  if (IsF32<TA>() || IsF32<TB>()) {
+    tolerance += 2 * max_abs * eps_bf16;
+  }
+
+  if (tolerance > 500.0) {
+    HWY_WARN("high tolerance %f norm %f maxabs %f\n", tolerance, norm, max_abs);
+  }
+  const double rel_tolerance =
+      1.0 + hwy::ConvertScalarTo<double>(hwy::Epsilon<BF16>());
+
+  double max_rel = 0.0;
+  size_t worst_r = 0;
+  size_t worst_c = 0;
+  double worst_actual = 0.0;
+  double worst_expected = 0.0;
+  size_t num_outside = 0;
+  for (size_t r = 0; r < A.Rows(); r++) {
+    const float* expected_row = c_slow_batch.Row(r);
+    const float* actual_row = c_batch.Row(r);
+    for (size_t c = 0; c < B.Rows(); c++) {
+      const double expected_value = static_cast<double>(expected_row[c]);
+      const double actual_value = static_cast<double>(actual_row[c]);
+      const bool in_range = expected_value - tolerance <= actual_value &&
+                            actual_value <= expected_value + tolerance;
+
+      if (!in_range) {
+        const double max = HWY_MAX(expected_value, actual_value);
+        const double min = HWY_MIN(expected_value, actual_value);
+        const double rel = max / HWY_MAX(min, 1E-6);
+        if (rel > max_rel) {
+          worst_expected = expected_value;
+          worst_actual = actual_value;
+          worst_r = r;
+          worst_c = c;
+          max_rel = rel;
+          ++num_outside;
+        }
+      }
+    }
+  }
+
+  if (max_rel > rel_tolerance) {
+    hwy::Abort(__FILE__, line,
+               "(%zu,%zu): expected %f, actual %f, norm %f maxabs %f "
+               "tolerance %f rel %E max_rel %E num_outside %zu\n",
+               worst_r, worst_c, worst_expected, worst_actual, norm, max_abs,
+               tolerance, max_rel, rel_tolerance, num_outside);
+  }
+  HWY_ASSERT(hn::AllFalse(
+      df, hn::IsEitherNaN(hn::Set(df, norm), hn::Set(df, max_abs))));
+}
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace gcpp
diff --git a/compression/types.h b/compression/types.h
index 8f11591..6e6129d 100644
--- a/compression/types.h
+++ b/compression/types.h
@@ -218,12 +218,23 @@ constexpr bool SupportsPointerArithmetic() {
 }
 
 // Tensor types for loading weights. Not all of these are supported weight
-// types, some are only used for `Activations`.
-enum class Type { kUnknown, kF32, kBF16, kSFP, kNUQ, kF64, kU32, kU64, kI8 };
+// types, some are only used for `Activations`. Append-only.
+enum class Type {
+  kUnknown,
+  kF32,
+  kBF16,
+  kSFP,
+  kNUQ,
+  kF64,
+  kU32,
+  kU64,
+  kI8,
+  kU16
+};
 // These are used in `ModelConfig.Specifier`, hence the strings will not
 // change, though new ones may be added.
 static constexpr const char* kTypeStrings[] = {
-    "unknown", "f32", "bf16", "sfp", "nuq", "f64", "u32", "u64", "i8"};
+    "unknown", "f32", "bf16", "sfp", "nuq", "f64", "u32", "u64", "i8", "u16"};
 static constexpr size_t kNumTypes =
     sizeof(kTypeStrings) / sizeof(kTypeStrings[0]);
 static constexpr size_t kTypeBits[] = {
@@ -236,6 +247,7 @@ static constexpr size_t kTypeBits[] = {
     8 * sizeof(uint32_t),
     8 * sizeof(uint64_t),
     8 * sizeof(I8Stream),
+    8 * sizeof(uint16_t),
 };
 
 static inline bool EnumValid(Type type) {
@@ -262,6 +274,8 @@ Type TypeEnum() {
     return Type::kU64;
   } else if constexpr (hwy::IsSame<Packed, I8Stream>()) {
     return Type::kI8;
+  } else if constexpr (hwy::IsSame<Packed, uint16_t>()) {
+    return Type::kU16;
   } else {
     HWY_DASSERT(false);
     return Type::kUnknown;
diff --git a/ops/matmul_test.cc b/ops/matmul_test.cc
index 4787122..a7a9862 100644
--- a/ops/matmul_test.cc
+++ b/ops/matmul_test.cc
@@ -58,122 +58,6 @@ extern int64_t first_target;
 namespace HWY_NAMESPACE {
 namespace hn = hwy::HWY_NAMESPACE;
 
-// Returns 1-norm, used for estimating tolerable numerical differences.
-double MaxRowAbsSum(const MatStorageT<float>& a) {
-  double max_row_abs_sum = 0.0;
-  for (size_t r = 0; r < a.Rows(); r++) {
-    const float* row = a.Row(r);
-    double row_abs_sum = 0.0;
-    for (size_t c = 0; c < a.Cols(); c++) {
-      row_abs_sum += hwy::ScalarAbs(row[c]);
-    }
-    max_row_abs_sum = HWY_MAX(max_row_abs_sum, row_abs_sum);
-  }
-  return max_row_abs_sum;
-}
-
-// Returns the maximum absolute value of `a`.
-float MaxAbs(const MatStorageT<float>& a) {
-  float max_abs = 0.0f;
-  for (size_t c = 0; c < a.Cols(); c++) {
-    for (size_t r = 0; r < a.Rows(); r++) {
-      const float* row = a.Row(r);
-      max_abs = HWY_MAX(max_abs, hwy::ScalarAbs(row[c]));
-    }
-  }
-  return max_abs;
-}
-
-// B is already transposed.
-template <typename TA, typename TB, typename TC>
-void AssertClose(const MatPtrT<TA>& A, const MatPtrT<TB>& B,
-                 const MatPtrT<TC>& C_slow, const MatPtrT<TC>& C,
-                 MatMulEnv& env, int line) {
-  const hn::ScalableTag<float> df;
-  const size_t cols = A.Cols();
-  const size_t B_rows = B.Rows();
-  // Round up for DecompressAndZeroPad.
-  MatStorageT<float> a_batch("a_batch", A.Extents(), env.ctx.allocator,
-                             MatPadding::kOdd);
-  MatStorageT<float> b_trans_batch("b_trans_batch", B.Extents(),
-                                   env.ctx.allocator, MatPadding::kOdd);
-  MatStorageT<float> c_batch("c_batch", Extents2D(A.Rows(), B_rows),
-                             env.ctx.allocator, MatPadding::kOdd);
-  c_batch.AllocateAndAttachRowPtrs(env.row_ptrs);
-  MatStorageT<float> c_slow_batch("c_slow_batch", Extents2D(A.Rows(), B_rows),
-                                  env.ctx.allocator, MatPadding::kOdd);
-  for (size_t m = 0; m < A.Rows(); ++m) {
-    DecompressAndZeroPad(df, MakeSpan(A.Row(m), cols), 0, a_batch.Row(m), cols);
-    DecompressAndZeroPad(df, MakeSpan(C.Row(m), B_rows), 0, c_batch.Row(m),
-                         B_rows);
-    DecompressAndZeroPad(df, MakeSpan(C_slow.Row(m), B_rows), 0,
-                         c_slow_batch.Row(m), B_rows);
-  }
-  for (size_t n = 0; n < B_rows; ++n) {
-    DecompressAndZeroPad(df, MakeSpan(B.Row(n), cols), 0, b_trans_batch.Row(n),
-                         cols);
-  }
-
-  // MatMul rounds inputs to BF16, so error is proportional to the max input
-  // magnitude, but also to f32 accumulation of rows in A and B.
-  const double norm = MaxRowAbsSum(a_batch) * MaxRowAbsSum(b_trans_batch);
-  const float max_abs = MaxAbs(a_batch) * MaxAbs(b_trans_batch);
-  const double eps_bf16 = hwy::ConvertScalarTo<double>(hwy::Epsilon<BF16>());
-  const double eps_f32 = hwy::ConvertScalarTo<double>(hwy::Epsilon<float>());
-  // Dot() uses double-precision summation.
-  double tolerance = 20 * norm * eps_f32;
-  // If either is F32, Dot() promotes F32 or even F64, but MatMul demotes the
-  // F32 to BF16, so add extra tolerance.
-  if (IsF32<TA>() || IsF32<TB>()) {
-    tolerance += 2 * max_abs * eps_bf16;
-  }
-
-  if (tolerance > 500.0) {
-    HWY_WARN("high tolerance %f norm %f maxabs %f\n", tolerance, norm, max_abs);
-  }
-  const double rel_tolerance =
-      1.0 + hwy::ConvertScalarTo<double>(hwy::Epsilon<BF16>());
-
-  double max_rel = 0.0;
-  size_t worst_r = 0;
-  size_t worst_c = 0;
-  double worst_actual = 0.0;
-  double worst_expected = 0.0;
-  size_t num_outside = 0;
-  for (size_t r = 0; r < A.Rows(); r++) {
-    const float* expected_row = c_slow_batch.Row(r);
-    const float* actual_row = c_batch.Row(r);
-    for (size_t c = 0; c < B.Rows(); c++) {
-      const double expected_value = static_cast<double>(expected_row[c]);
-      const double actual_value = static_cast<double>(actual_row[c]);
-      const bool in_range = expected_value - tolerance <= actual_value &&
-                            actual_value <= expected_value + tolerance;
-
-      if (!in_range) {
-        const double max = HWY_MAX(expected_value, actual_value);
-        const double min = HWY_MIN(expected_value, actual_value);
-        const double rel = max / HWY_MAX(min, 1E-6);
-        if (rel > max_rel) {
-          worst_expected = expected_value;
-          worst_actual = actual_value;
-          worst_r = r;
-          worst_c = c;
-          max_rel = rel;
-          ++num_outside;
-        }
-      }
-    }
-  }
-
-  if (max_rel > rel_tolerance) {
-    hwy::Abort(__FILE__, line,
-               "(%zu,%zu): expected %f, actual %f, norm %f maxabs %f "
-               "tolerance %f rel %E max_rel %E num_outside %zu\n",
-               worst_r, worst_c, worst_expected, worst_actual, norm, max_abs,
-               tolerance, max_rel, rel_tolerance, num_outside);
-  }
-}
-
 // B is already transposed.
 template <typename TA, typename TB, typename TC>
 HWY_INLINE void MatMulSlow(const MatPtrT<TA> A, const MatPtrT<TB> B,
@@ -257,7 +141,7 @@ void TestMatMul(size_t rows_ac, size_t cols_a_rows_b, size_t cols_bc, bool add,
   MMOptions options;
   for (size_t rep = 0; rep < 16; ++rep) {
     MMPerKey* per_key = MatMulStatic(A, BT, add_row, env, C, options);
-    AssertClose(A, BT, C_slow, C, env, line);
+    AssertClose(A, BT, C_slow, C, env.ctx.allocator, env.row_ptrs, line);
 
     // Check before TwoMatMulStatic(), which can invalidate per_key.
     const bool autotune_done = !!per_key->autotune.Best();
@@ -295,7 +179,7 @@ void TestMatMul(size_t rows_ac, size_t cols_a_rows_b, size_t cols_bc, bool add,
 
   // TwoMatMulStatic() does not support adding a bias vector.
   if (!add) {
-    AssertClose(A, BT, C, C2, env, line);
+    AssertClose(A, BT, C, C2, env.ctx.allocator, env.row_ptrs, line);
   }
 }
 
diff --git a/paligemma/BUILD.bazel b/paligemma/BUILD.bazel
index cc6c6e1..b749e05 100644
--- a/paligemma/BUILD.bazel
+++ b/paligemma/BUILD.bazel
@@ -65,7 +65,6 @@ cc_test(
         "//:benchmark_helper",
         "//:configs",
         "//:gemma_lib",
-        "//io",
         "@highway//:hwy_test_util",
     ],
 )