mirror of https://github.com/google/gemma.cpp.git
1.29x speedup: bf16 C1/C2. Extend most ops to any type, expand test coverage.
Also increase the dot_test.cc value range for Zen4, and relax the matmul_test tolerance (it was failing in some configurations). PiperOrigin-RevId: 801789922
This commit is contained in:
parent
bc0c0bac8b
commit
229bd078a1
|
|
@ -376,6 +376,7 @@ cc_test(
|
|||
":test_util",
|
||||
":threading_context",
|
||||
"@googletest//:gtest_main", # buildcleaner: keep
|
||||
"//compression:test_util",
|
||||
"//compression:types",
|
||||
"@highway//:hwy",
|
||||
"@highway//:hwy_test_util",
|
||||
|
|
|
|||
|
|
@ -709,6 +709,242 @@ HWY_INLINE float DecompressAndCall(D, const PackedSpan<const VT> v,
|
|||
comp3);
|
||||
}
|
||||
|
||||
// Similar to `hn::Transform*`, but for compressed `T`. Used by ops-inl.h.
|
||||
// `DF` is the decompressed type, typically `float`.
|
||||
template <class DF, typename T, class Func>
|
||||
HWY_INLINE void DecompressAndCompressInplace(DF df, T* HWY_RESTRICT inout,
|
||||
size_t num, Func&& func) {
|
||||
const auto packed_inout = MakeSpan(inout, num);
|
||||
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
|
||||
size_t i = 0;
|
||||
if (num >= 2 * NF) {
|
||||
for (; i <= num - 2 * NF; i += 2 * NF) {
|
||||
VF v0, v1;
|
||||
Decompress2(df, packed_inout, i, v0, v1);
|
||||
const VF out0 = func(df, v0);
|
||||
const VF out1 = func(df, v1);
|
||||
Compress2(df, out0, out1, packed_inout, i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
HWY_DASSERT(remaining < 2 * NF);
|
||||
if (HWY_UNLIKELY(remaining != 0)) {
|
||||
HWY_ALIGN float buf_inout[2 * hn::MaxLanes(df)];
|
||||
// Ensure the second vector is zeroed even if remaining <= NF.
|
||||
hn::Store(hn::Zero(df), df, buf_inout + NF);
|
||||
DecompressAndZeroPad(df, packed_inout, i, buf_inout, remaining);
|
||||
const VF v0 = hn::Load(df, buf_inout);
|
||||
const VF v1 = hn::Load(df, buf_inout + NF);
|
||||
const VF out0 = func(df, v0);
|
||||
const VF out1 = func(df, v1);
|
||||
Compress2(df, out0, out1, MakeSpan(buf_inout, 2 * NF), 0);
|
||||
// Clang generates incorrect code for CopyBytes if num = 2.
|
||||
for (size_t j = 0; j < remaining; ++j) {
|
||||
inout[i + j] = hwy::ConvertScalarTo<T>(buf_inout[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// One extra argument. `DF` is the decompressed type, typically `float`.
|
||||
template <class DF, typename T, typename T1, class Func>
|
||||
HWY_INLINE void Decompress1AndCompressInplace(DF df, T* HWY_RESTRICT inout,
|
||||
size_t num,
|
||||
const T1* HWY_RESTRICT p1,
|
||||
Func&& func) {
|
||||
const auto packed_inout = MakeSpan(inout, num);
|
||||
const auto packed1 = MakeSpan(p1, num);
|
||||
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
|
||||
size_t i = 0;
|
||||
if (num >= 2 * NF) {
|
||||
for (; i <= num - 2 * NF; i += 2 * NF) {
|
||||
VF v0, v1;
|
||||
Decompress2(df, packed_inout, i, v0, v1);
|
||||
VF v10, v11;
|
||||
Decompress2(df, packed1, i, v10, v11);
|
||||
const VF out0 = func(df, v0, v10);
|
||||
const VF out1 = func(df, v1, v11);
|
||||
Compress2(df, out0, out1, packed_inout, i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
HWY_DASSERT(remaining < 2 * NF);
|
||||
if (HWY_UNLIKELY(remaining != 0)) {
|
||||
HWY_ALIGN float buf_inout[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
|
||||
// Ensure the second vector is zeroed even if remaining <= NF.
|
||||
hn::Store(hn::Zero(df), df, buf_inout + NF);
|
||||
hn::Store(hn::Zero(df), df, buf1 + NF);
|
||||
DecompressAndZeroPad(df, packed_inout, i, buf_inout, remaining);
|
||||
DecompressAndZeroPad(df, packed1, i, buf1, remaining);
|
||||
const VF v0 = hn::Load(df, buf_inout);
|
||||
const VF v1 = hn::Load(df, buf_inout + NF);
|
||||
const VF v10 = hn::Load(df, buf1);
|
||||
const VF v11 = hn::Load(df, buf1 + NF);
|
||||
const VF out0 = func(df, v0, v10);
|
||||
const VF out1 = func(df, v1, v11);
|
||||
Compress2(df, out0, out1, MakeSpan(buf_inout, 2 * NF), 0);
|
||||
// Clang generates incorrect code for CopyBytes if num = 2.
|
||||
for (size_t j = 0; j < remaining; ++j) {
|
||||
inout[i + j] = hwy::ConvertScalarTo<T>(buf_inout[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Single input, separate output. `DF` is the decompressed type, typically
|
||||
// `float`.
|
||||
template <class DF, typename T, typename T1, class Func>
|
||||
HWY_INLINE void Decompress1AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
|
||||
const T1* HWY_RESTRICT p1,
|
||||
Func&& func) {
|
||||
const auto packed_out = MakeSpan(out, num);
|
||||
const auto packed1 = MakeSpan(p1, num);
|
||||
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
|
||||
size_t i = 0;
|
||||
if (num >= 2 * NF) {
|
||||
for (; i <= num - 2 * NF; i += 2 * NF) {
|
||||
VF v10, v11;
|
||||
Decompress2(df, packed1, i, v10, v11);
|
||||
const VF out0 = func(df, v10);
|
||||
const VF out1 = func(df, v11);
|
||||
Compress2(df, out0, out1, packed_out, i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
HWY_DASSERT(remaining < 2 * NF);
|
||||
if (HWY_UNLIKELY(remaining != 0)) {
|
||||
HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
|
||||
// Ensure the second vector is zeroed even if remaining <= NF.
|
||||
hn::Store(hn::Zero(df), df, buf1 + NF);
|
||||
DecompressAndZeroPad(df, packed1, i, buf1, remaining);
|
||||
const VF v10 = hn::Load(df, buf1);
|
||||
const VF v11 = hn::Load(df, buf1 + NF);
|
||||
const VF out0 = func(df, v10);
|
||||
const VF out1 = func(df, v11);
|
||||
Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
|
||||
// Clang generates incorrect code for CopyBytes if num = 2.
|
||||
for (size_t j = 0; j < remaining; ++j) {
|
||||
out[i + j] = hwy::ConvertScalarTo<T>(buf_out[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Two inputs. `DF` is the decompressed type, typically `float`.
|
||||
template <class DF, typename T, typename T1, typename T2, class Func>
|
||||
HWY_INLINE void Decompress2AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
|
||||
const T1* HWY_RESTRICT p1,
|
||||
const T2* HWY_RESTRICT p2,
|
||||
Func&& func) {
|
||||
const auto packed_out = MakeSpan(out, num);
|
||||
const auto packed1 = MakeSpan(p1, num);
|
||||
const auto packed2 = MakeSpan(p2, num);
|
||||
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
|
||||
size_t i = 0;
|
||||
if (num >= 2 * NF) {
|
||||
for (; i <= num - 2 * NF; i += 2 * NF) {
|
||||
VF v10, v11, v20, v21;
|
||||
Decompress2(df, packed1, i, v10, v11);
|
||||
Decompress2(df, packed2, i, v20, v21);
|
||||
const VF out0 = func(df, v10, v20);
|
||||
const VF out1 = func(df, v11, v21);
|
||||
Compress2(df, out0, out1, packed_out, i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
HWY_DASSERT(remaining < 2 * NF);
|
||||
if (HWY_UNLIKELY(remaining != 0)) {
|
||||
HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf2[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
|
||||
// Ensure the second vector is zeroed even if remaining <= NF.
|
||||
hn::Store(hn::Zero(df), df, buf1 + NF);
|
||||
hn::Store(hn::Zero(df), df, buf2 + NF);
|
||||
DecompressAndZeroPad(df, packed1, i, buf1, remaining);
|
||||
DecompressAndZeroPad(df, packed2, i, buf2, remaining);
|
||||
const VF v10 = hn::Load(df, buf1);
|
||||
const VF v11 = hn::Load(df, buf1 + NF);
|
||||
const VF v20 = hn::Load(df, buf2);
|
||||
const VF v21 = hn::Load(df, buf2 + NF);
|
||||
const VF out0 = func(df, v10, v20);
|
||||
const VF out1 = func(df, v11, v21);
|
||||
Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
|
||||
// Clang generates incorrect code for CopyBytes if num = 2.
|
||||
for (size_t j = 0; j < remaining; ++j) {
|
||||
out[i + j] = hwy::ConvertScalarTo<T>(buf_out[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Three inputs. `DF` is the decompressed type, typically `float`.
|
||||
template <class DF, typename T, typename T1, typename T2, typename T3,
|
||||
class Func>
|
||||
HWY_INLINE void Decompress3AndCompressTo(DF df, T* HWY_RESTRICT out, size_t num,
|
||||
const T1* HWY_RESTRICT p1,
|
||||
const T2* HWY_RESTRICT p2,
|
||||
const T3* HWY_RESTRICT p3,
|
||||
Func&& func) {
|
||||
const auto packed_out = MakeSpan(out, num);
|
||||
const auto packed1 = MakeSpan(p1, num);
|
||||
const auto packed2 = MakeSpan(p2, num);
|
||||
const auto packed3 = MakeSpan(p3, num);
|
||||
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
HWY_LANES_CONSTEXPR const size_t NF = hn::Lanes(df);
|
||||
size_t i = 0;
|
||||
if (num >= 2 * NF) {
|
||||
for (; i <= num - 2 * NF; i += 2 * NF) {
|
||||
VF v10, v11, v20, v21, v30, v31;
|
||||
Decompress2(df, packed1, i, v10, v11);
|
||||
Decompress2(df, packed2, i, v20, v21);
|
||||
Decompress2(df, packed3, i, v30, v31);
|
||||
const VF out0 = func(df, v10, v20, v30);
|
||||
const VF out1 = func(df, v11, v21, v31);
|
||||
Compress2(df, out0, out1, packed_out, i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = num - i;
|
||||
HWY_DASSERT(remaining < 2 * NF);
|
||||
if (HWY_UNLIKELY(remaining != 0)) {
|
||||
HWY_ALIGN float buf1[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf2[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf3[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
|
||||
// Ensure the second vector is zeroed even if remaining <= NF.
|
||||
hn::Store(hn::Zero(df), df, buf1 + NF);
|
||||
hn::Store(hn::Zero(df), df, buf2 + NF);
|
||||
hn::Store(hn::Zero(df), df, buf3 + NF);
|
||||
DecompressAndZeroPad(df, packed1, i, buf1, remaining);
|
||||
DecompressAndZeroPad(df, packed2, i, buf2, remaining);
|
||||
DecompressAndZeroPad(df, packed3, i, buf3, remaining);
|
||||
const VF v10 = hn::Load(df, buf1);
|
||||
const VF v11 = hn::Load(df, buf1 + NF);
|
||||
const VF v20 = hn::Load(df, buf2);
|
||||
const VF v21 = hn::Load(df, buf2 + NF);
|
||||
const VF v30 = hn::Load(df, buf3);
|
||||
const VF v31 = hn::Load(df, buf3 + NF);
|
||||
const VF out0 = func(df, v10, v20, v30);
|
||||
const VF out1 = func(df, v11, v21, v31);
|
||||
Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
|
||||
// Clang generates incorrect code for CopyBytes if num = 2.
|
||||
for (size_t j = 0; j < remaining; ++j) {
|
||||
out[i + j] = hwy::ConvertScalarTo<T>(buf_out[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace gcpp
|
||||
|
|
|
|||
|
|
@ -18,11 +18,10 @@
|
|||
#define HWY_DISABLED_TARGETS GEMMA_DISABLED_TARGETS
|
||||
#endif // HWY_DISABLED_TARGETS
|
||||
|
||||
#include "compression/compress.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "compression/compress.h"
|
||||
#include "compression/distortion.h"
|
||||
#include "util/test_util.h"
|
||||
#include "hwy/aligned_allocator.h"
|
||||
|
|
@ -45,7 +44,7 @@ namespace hn = hwy::HWY_NAMESPACE;
|
|||
|
||||
// Calls Compress and Decompress2 and verifies the distortion/error.
|
||||
template <typename Packed>
|
||||
struct TestDecompress2T {
|
||||
struct TestDecompress2 {
|
||||
template <typename T, class D>
|
||||
HWY_INLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = hn::Lanes(d);
|
||||
|
|
@ -120,12 +119,12 @@ struct TestDecompress2T {
|
|||
}
|
||||
};
|
||||
|
||||
void TestAllDecompress2() { ForeachPackedAndRawType<TestDecompress2T>(); }
|
||||
void TestAllDecompress2() { ForeachPackedAndRawType<TestDecompress2>(); }
|
||||
|
||||
// Calls Compress and DecompressAndZeroPad for all short lengths and verifies
|
||||
// the distortion/error.
|
||||
template <typename Packed>
|
||||
struct TestShortLengthsT {
|
||||
struct TestShortLengths {
|
||||
template <typename T, class D>
|
||||
HWY_INLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = hn::Lanes(d);
|
||||
|
|
@ -196,7 +195,82 @@ struct TestShortLengthsT {
|
|||
}
|
||||
};
|
||||
|
||||
void TestAllShortLengths() { ForeachPackedAndRawType<TestShortLengthsT>(); }
|
||||
void TestAllShortLengths() { ForeachPackedAndRawType<TestShortLengths>(); }
|
||||
|
||||
// Verifies the arguments and remainder handling of `DecompressAndCompress*`.
|
||||
class TestDecompressAndCompress {
|
||||
public:
|
||||
template <typename T, class D>
|
||||
HWY_INLINE void operator()(T /*unused*/, D d) {
|
||||
ForeachActivationType3<Test>(d);
|
||||
}
|
||||
|
||||
private:
|
||||
struct Test {
|
||||
template <typename T1, typename T2, typename T3, /*Deduced:*/ class D>
|
||||
void operator()(T1, T2, T3, D d) {
|
||||
hwy::RandomState rng;
|
||||
using DF = hn::Repartition<float, D>;
|
||||
using VF = hn::Vec<DF>;
|
||||
const DF df;
|
||||
|
||||
for (size_t num = 1; num < 7 * hn::Lanes(d); ++num) {
|
||||
auto p = hwy::AllocateAligned<T1>(num);
|
||||
auto p1 = hwy::AllocateAligned<T2>(num);
|
||||
auto p2 = hwy::AllocateAligned<T3>(num);
|
||||
auto out = hwy::AllocateAligned<T1>(num);
|
||||
auto expected1 = hwy::AllocateAligned<T1>(num);
|
||||
auto expected2 = hwy::AllocateAligned<T1>(num);
|
||||
auto expected3 = hwy::AllocateAligned<T1>(num);
|
||||
HWY_ASSERT(p && p1 && p2 && out && expected1 && expected2 && expected3);
|
||||
// Two bits each, totalling 6 bits which fit in the BF16 mantissa.
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
const size_t mod = i & 3;
|
||||
p[i] = hwy::ConvertScalarTo<T1>(mod);
|
||||
p1[i] = hwy::ConvertScalarTo<T2>(mod << 2);
|
||||
p2[i] = hwy::ConvertScalarTo<T3>(mod << 4);
|
||||
// For `Decompress1AndCompressInplace` to not overwrite `p`.
|
||||
out[i] = p[i];
|
||||
expected1[i] = hwy::ConvertScalarTo<T1>(mod);
|
||||
expected2[i] = hwy::ConvertScalarTo<T1>((mod << 2) | mod);
|
||||
expected3[i] =
|
||||
hwy::ConvertScalarTo<T1>((mod << 4) | (mod << 2) | mod);
|
||||
}
|
||||
|
||||
DecompressAndCompressInplace(df, p.get(), num,
|
||||
[](DF, VF v) HWY_ATTR -> VF { return v; });
|
||||
HWY_ASSERT_ARRAY_EQ(expected1.get(), p.get(), num);
|
||||
|
||||
// Uses `out` so as not to overwrite `p`.
|
||||
Decompress1AndCompressInplace(
|
||||
df, out.get(), num, p1.get(),
|
||||
[](DF, VF v, VF v1) HWY_ATTR -> VF { return hn::Add(v, v1); });
|
||||
HWY_ASSERT_ARRAY_EQ(expected2.get(), out.get(), num);
|
||||
|
||||
Decompress1AndCompressTo(df, out.get(), num, p.get(),
|
||||
[](DF, VF v) HWY_ATTR -> VF { return v; });
|
||||
HWY_ASSERT_ARRAY_EQ(expected1.get(), out.get(), num);
|
||||
|
||||
Decompress2AndCompressTo(df, out.get(), num, p.get(), p1.get(),
|
||||
[](DF, VF v, VF v1)
|
||||
HWY_ATTR -> VF { return hn::Add(v, v1); });
|
||||
HWY_ASSERT_ARRAY_EQ(expected2.get(), out.get(), num);
|
||||
|
||||
Decompress3AndCompressTo(
|
||||
df, out.get(), num, p.get(), p1.get(), p2.get(),
|
||||
[](DF, VF v, VF v1, VF v2)
|
||||
HWY_ATTR -> VF { return hn::Add(hn::Add(v, v1), v2); });
|
||||
HWY_ASSERT_ARRAY_EQ(expected3.get(), out.get(), num);
|
||||
}
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
void TestAllDecompressAndCompress() {
  // `hn::ForGE128Vectors` accepts only a single lane type, so pass `float`
  // here; the type combinations are enumerated internally via
  // `ForeachActivationType3`.
  hn::ForGE128Vectors<TestDecompressAndCompress>()(float());
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
|
|
@ -208,6 +282,7 @@ namespace gcpp {
|
|||
HWY_BEFORE_TEST(CompressTest);
|
||||
HWY_EXPORT_AND_TEST_P(CompressTest, TestAllDecompress2);
|
||||
HWY_EXPORT_AND_TEST_P(CompressTest, TestAllShortLengths);
|
||||
HWY_EXPORT_AND_TEST_P(CompressTest, TestAllDecompressAndCompress);
|
||||
HWY_AFTER_TEST();
|
||||
} // namespace gcpp
|
||||
#endif // HWY_ONCE
|
||||
|
|
|
|||
|
|
@ -67,6 +67,35 @@ void ForeachPackedAndRawType() {
|
|||
}
|
||||
}
|
||||
|
||||
template <class Test, class D>
|
||||
void ForeachActivationType1(D d) {
|
||||
Test test;
|
||||
test(float(), d);
|
||||
test(BF16(), d);
|
||||
}
|
||||
|
||||
template <class Test, class D>
|
||||
void ForeachActivationType2(D d) {
|
||||
Test test;
|
||||
test(float(), float(), d);
|
||||
test(float(), BF16(), d);
|
||||
test(BF16(), float(), d);
|
||||
test(BF16(), BF16(), d);
|
||||
}
|
||||
|
||||
template <class Test, class D>
|
||||
void ForeachActivationType3(D d) {
|
||||
Test test;
|
||||
test(float(), float(), float(), d);
|
||||
test(float(), float(), BF16(), d);
|
||||
test(float(), BF16(), float(), d);
|
||||
test(float(), BF16(), BF16(), d);
|
||||
test(BF16(), float(), float(), d);
|
||||
test(BF16(), float(), BF16(), d);
|
||||
test(BF16(), BF16(), float(), d);
|
||||
test(BF16(), BF16(), BF16(), d);
|
||||
}
|
||||
|
||||
// Generates inputs: deterministic, within max SfpStream range.
|
||||
template <typename MatT>
|
||||
MatStorageT<MatT> GenerateMat(const Extents2D& extents,
|
||||
|
|
|
|||
|
|
@ -186,6 +186,11 @@ constexpr bool IsNuqStream() {
|
|||
return hwy::IsSame<hwy::RemoveCvRef<Packed>, NuqStream>();
|
||||
}
|
||||
|
||||
template <typename Packed>
|
||||
constexpr bool SupportsPointerArithmetic() {
|
||||
return !IsNuqStream<Packed>();
|
||||
}
|
||||
|
||||
// Tensor types for loading weights.
|
||||
enum class Type { kUnknown, kF32, kBF16, kSFP, kNUQ, kF64 };
|
||||
// These are used in `ModelConfig.Specifier`, hence the strings will not
|
||||
|
|
|
|||
|
|
@ -206,9 +206,8 @@ struct Activations {
|
|||
|
||||
// Gated FFW
|
||||
MatStorageT<BF16> pre_ffw_rms_out;
|
||||
// Norm may be large, so prefer to keep as f32.
|
||||
MatStorageT<float> C1;
|
||||
MatStorageT<float> C2;
|
||||
MatStorageT<BF16> C1;
|
||||
MatStorageT<BF16> C2;
|
||||
MatStorageT<float> ffw_out;
|
||||
|
||||
AttentionActivations attention;
|
||||
|
|
|
|||
|
|
@ -144,8 +144,8 @@ void SingleDotSoftmaxWeightedSum(
|
|||
// Apply rope and scaling to Q.
|
||||
if (layer.query_norm_scale.HasPtr()) {
|
||||
CallUpcasted(&layer.query_norm_scale, [&](const auto* weights_t) {
|
||||
RMSNormInplace(weights_t->PackedScale1(), 0, q,
|
||||
layer.layer_config.qkv_dim, p, worker);
|
||||
RMSNormInplace(weights_t->PackedScale1(), q, layer.layer_config.qkv_dim,
|
||||
p, worker);
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -307,7 +307,7 @@ static HWY_INLINE void ComputeQKV(size_t num_tokens, const size_t layer_idx,
|
|||
// Apply further processing to K.
|
||||
if (layer.key_norm_scale.HasPtr()) {
|
||||
CallUpcasted(&layer.key_norm_scale, [&](const auto* weights_t) {
|
||||
RMSNormInplace(weights_t->PackedScale1(), 0, kv_f32, qkv_dim,
|
||||
RMSNormInplace(weights_t->PackedScale1(), kv_f32, qkv_dim,
|
||||
env.ctx.profiler, thread);
|
||||
});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,14 +43,14 @@ HWY_BEFORE_NAMESPACE();
|
|||
namespace gcpp {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
template <typename T>
|
||||
void Activation(ActivationType activation, T* HWY_RESTRICT c1,
|
||||
const T* HWY_RESTRICT c2, const size_t count, hwy::Profiler& p,
|
||||
template <typename T1, typename T2>
|
||||
void Activation(ActivationType activation, T1* HWY_RESTRICT c1,
|
||||
const T2* HWY_RESTRICT c2, const size_t count, hwy::Profiler& p,
|
||||
const size_t worker) {
|
||||
static const auto zone = p.AddZone("Gen.Activation");
|
||||
PROFILER_ZONE3(p, worker, zone);
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
using DF = hn::ScalableTag<T>;
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
// ActivationType::Gelu
|
||||
if (c2 == nullptr) { // No multiplier, just Gelu.
|
||||
|
|
@ -58,9 +58,10 @@ void Activation(ActivationType activation, T* HWY_RESTRICT c1,
|
|||
return;
|
||||
};
|
||||
// Has multiplier, Gelu(c1) * c2.
|
||||
hn::Transform1(DF(), c1, count, c2, [](DF df, VF v, VF mul) HWY_ATTR {
|
||||
return hn::Mul(mul, Gelu(df, v));
|
||||
});
|
||||
Decompress1AndCompressInplace(DF(), c1, count, c2,
|
||||
[](DF df, VF v1, VF v2) HWY_ATTR -> VF {
|
||||
return hn::Mul(v2, Gelu(df, v1));
|
||||
});
|
||||
}
|
||||
|
||||
// No C2 multiplier.
|
||||
|
|
@ -75,10 +76,9 @@ void ActivationBatched(ActivationType activation, Mat& c1,
|
|||
});
|
||||
}
|
||||
|
||||
template <class Mat>
|
||||
HWY_NOINLINE void ActivationBatched(ActivationType activation, Mat& c1,
|
||||
const Mat* c2, ThreadingContext& ctx) {
|
||||
using T = typename Mat::T;
|
||||
template <class Mat1, class Mat2>
|
||||
HWY_NOINLINE void ActivationBatched(ActivationType activation, Mat1& c1,
|
||||
const Mat2* c2, ThreadingContext& ctx) {
|
||||
HWY_DASSERT(c1.SameShape(*c2));
|
||||
if (c2 && c2->HasPtr()) {
|
||||
SmallParallelFor(c1.Rows(), ctx.pools, [&](uint64_t task, size_t worker) {
|
||||
|
|
@ -87,8 +87,9 @@ HWY_NOINLINE void ActivationBatched(ActivationType activation, Mat& c1,
|
|||
});
|
||||
} else { // No multiplier
|
||||
SmallParallelFor(c1.Rows(), ctx.pools, [&](uint64_t task, size_t worker) {
|
||||
Activation(activation, c1.Row(task), static_cast<const T*>(nullptr),
|
||||
c1.Cols(), ctx.profiler, worker);
|
||||
Activation(activation, c1.Row(task),
|
||||
static_cast<const typename Mat2::T*>(nullptr), c1.Cols(),
|
||||
ctx.profiler, worker);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -335,7 +335,7 @@ void PrefillVit(const ModelConfig& model_config, const WeightsPtrs& weights,
|
|||
|
||||
// Apply soft embedding norm before input projection.
|
||||
CallUpcasted(&weights.mm_embed_norm, [&](const auto* weights_t) {
|
||||
RMSNormInplace(weights_t->PackedScale1(), 0, activations.x.Row(0),
|
||||
RMSNormInplace(weights_t->PackedScale1(), activations.x.Row(0),
|
||||
vit_model_dim, env.ctx.profiler, hwy::Profiler::Thread());
|
||||
});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -750,7 +750,7 @@ class DotStats {
|
|||
void CheckMuls() const {
|
||||
// Comp2 is between Compensated and Kahan.
|
||||
ASSERT_INSIDE(kComp2, 1.001, s_muls[kComp2].Mean(), 1.4);
|
||||
ASSERT_INSIDE(kComp2, 1.001f, s_muls[kComp2].Max(), 2.4f);
|
||||
ASSERT_INSIDE(kComp2, 1.001f, s_muls[kComp2].Max(), 6.8f);
|
||||
ASSERT_INSIDE(kComp2, 1.0, s_muls[kComp2].GeometricMean(), 1.2);
|
||||
|
||||
// Compensated and Double are very accurate.
|
||||
|
|
|
|||
|
|
@ -119,7 +119,7 @@ void AssertClose(const MatPtrT<TA>& A, const MatPtrT<TB>& B,
|
|||
const double eps_bf16 = hwy::ConvertScalarTo<double>(hwy::Epsilon<BF16>());
|
||||
const double eps_f32 = hwy::ConvertScalarTo<double>(hwy::Epsilon<float>());
|
||||
// Dot() uses double-precision summation.
|
||||
double tolerance = 12 * norm * eps_f32;
|
||||
double tolerance = 20 * norm * eps_f32;
|
||||
// If B is F32, Dot() promotes F32 or even F64, but MatMul demotes the F32 to
|
||||
// BF16, so add extra tolerance.
|
||||
if (IsF32<TB>()) {
|
||||
|
|
|
|||
336
ops/ops-inl.h
336
ops/ops-inl.h
|
|
@ -127,12 +127,13 @@ HWY_INLINE hn::Vec<D> Gelu(D d, hn::Vec<D> v) {
|
|||
}
|
||||
|
||||
// Activation already has a profiler zone.
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void Gelu(float* HWY_RESTRICT x,
|
||||
size_t size) {
|
||||
template <typename T>
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void Gelu(T* HWY_RESTRICT x, size_t size) {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
using D = hn::ScalableTag<float>;
|
||||
hn::Transform(D(), x, size,
|
||||
[](D d, hn::Vec<D> v) HWY_ATTR { return Gelu(d, v); });
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
DecompressAndCompressInplace(
|
||||
DF(), x, size, [](DF d, VF v) HWY_ATTR -> VF { return Gelu(d, v); });
|
||||
}
|
||||
|
||||
template <class D, HWY_IF_F32_D(D)>
|
||||
|
|
@ -179,13 +180,15 @@ HWY_INLINE hn::Vec<D> Sigmoid(D d, hn::Vec<D> v) {
|
|||
}
|
||||
|
||||
// Sigmoid using the logistic function 1 / (1 + exp(-x[i]))
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(float* HWY_RESTRICT x,
|
||||
template <typename T>
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(T* HWY_RESTRICT x,
|
||||
size_t size) {
|
||||
PROFILER_ZONE("ops.Sigmoid");
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
using D = hn::ScalableTag<float>;
|
||||
hn::Transform(D(), x, size,
|
||||
[](D d, hn::Vec<D> v) HWY_ATTR { return Sigmoid(d, v); });
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
DecompressAndCompressInplace(
|
||||
DF(), x, size, [](DF d, VF v) HWY_ATTR -> VF { return Sigmoid(d, v); });
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
|
|
@ -205,71 +208,53 @@ float RMSNormMul(const VT* HWY_RESTRICT x, const size_t size, hwy::Profiler& p,
|
|||
|
||||
} // namespace detail
|
||||
|
||||
// `x_ofs` is the offset within `x`, required for NuqStream.
|
||||
template <typename XT, typename WT, typename OT>
|
||||
HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm(const XT* HWY_RESTRICT x,
|
||||
const WT* HWY_RESTRICT weight,
|
||||
size_t w_ofs, OT* HWY_RESTRICT out,
|
||||
OT* HWY_RESTRICT out,
|
||||
const size_t size, hwy::Profiler& p,
|
||||
const size_t worker) {
|
||||
static const auto zone = p.AddZone("Ops.RMSNorm");
|
||||
PROFILER_ZONE3(p, worker, zone);
|
||||
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<float> df;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
|
||||
const VF mul = hn::Set(df, detail::RMSNormMul(x, size, p, worker));
|
||||
const VF mul = hn::Set(DF(), detail::RMSNormMul(x, size, p, worker));
|
||||
const VF* HWY_RESTRICT pmul = &mul;
|
||||
|
||||
const auto packed_x = MakeSpan(x, size);
|
||||
const auto packed_w = MakeSpan(weight, w_ofs + size);
|
||||
const auto packed_out = MakeSpan(out, size);
|
||||
|
||||
HWY_DASSERT(size % (2 * NF) == 0);
|
||||
for (size_t i = 0; i < size; i += 2 * NF) {
|
||||
VF x0, x1, w0, w1;
|
||||
Decompress2(df, packed_x, i, x0, x1);
|
||||
Decompress2(df, packed_w, w_ofs + i, w0, w1);
|
||||
const VF m0 = hn::Mul(mul, x0);
|
||||
const VF m1 = hn::Mul(mul, x1);
|
||||
// (1+weight) * m = m + weight*m = one FMA.
|
||||
const VF out0 = hn::MulAdd(m0, w0, m0);
|
||||
const VF out1 = hn::MulAdd(m1, w1, m1);
|
||||
Compress2(df, out0, out1, packed_out, i);
|
||||
}
|
||||
Decompress2AndCompressTo(DF(), out, size, x, weight,
|
||||
[pmul](DF /*df*/, VF vx, VF vw) HWY_ATTR -> VF {
|
||||
const VF m = hn::Mul(*pmul, vx);
|
||||
// (1+weight) * m = m + weight*m = one FMA.
|
||||
return hn::MulAdd(m, vw, m);
|
||||
});
|
||||
}
|
||||
|
||||
// Same as RMSNorm, but its HWY_RESTRICT forbids passing the same pointer.
|
||||
template <typename WT, typename XT>
|
||||
HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace(
|
||||
const WT* HWY_RESTRICT weight, size_t w_ofs, XT* HWY_RESTRICT inout,
|
||||
const size_t size, hwy::Profiler& p, const size_t worker) {
|
||||
HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace(const WT* HWY_RESTRICT weight,
|
||||
XT* HWY_RESTRICT inout,
|
||||
const size_t size,
|
||||
hwy::Profiler& p,
|
||||
const size_t worker) {
|
||||
static const auto zone = p.AddZone("Ops.RMSNormInplace");
|
||||
PROFILER_ZONE3(p, worker, zone);
|
||||
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<float> df;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
|
||||
const VF mul = hn::Set(df, detail::RMSNormMul(inout, size, p, worker));
|
||||
const VF mul = hn::Set(DF(), detail::RMSNormMul(inout, size, p, worker));
|
||||
const VF* HWY_RESTRICT pmul = &mul;
|
||||
|
||||
const auto packed_w = MakeSpan(weight, w_ofs + size);
|
||||
const auto packed_x = MakeSpan(inout, size);
|
||||
|
||||
HWY_DASSERT(size % (2 * NF) == 0);
|
||||
for (size_t i = 0; i < size; i += 2 * NF) {
|
||||
VF x0, x1, w0, w1;
|
||||
Decompress2(df, packed_x, i, x0, x1);
|
||||
Decompress2(df, packed_w, w_ofs + i, w0, w1);
|
||||
const VF m0 = hn::Mul(mul, x0);
|
||||
const VF m1 = hn::Mul(mul, x1);
|
||||
// (1+weight) * m = m + weight*m = one FMA.
|
||||
const VF out0 = hn::MulAdd(m0, w0, m0);
|
||||
const VF out1 = hn::MulAdd(m1, w1, m1);
|
||||
Compress2(df, out0, out1, packed_x, i);
|
||||
}
|
||||
Decompress1AndCompressInplace(DF(), inout, size, weight,
|
||||
[pmul](DF /*df*/, VF vx, VF vw) HWY_ATTR -> VF {
|
||||
const VF m = hn::Mul(*pmul, vx);
|
||||
// (1+weight) * m = m + weight*m = one FMA.
|
||||
return hn::MulAdd(m, vw, m);
|
||||
});
|
||||
}
|
||||
|
||||
// Computes mean mu and mean of squares mu2 of a vector. Used in LayerNorm.
|
||||
|
|
@ -301,9 +286,9 @@ HWY_NOINLINE void LayerNorm(const XT* x, const WT* HWY_RESTRICT scale,
|
|||
PROFILER_ZONE("ops.LayerNorm");
|
||||
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<float> df;
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
using DF = hn::ScalableTag<float>;
|
||||
const DF df;
|
||||
using VF = hn::Vec<DF>;
|
||||
|
||||
double mu, mu2;
|
||||
ComputeMoments(x, size, mu, mu2);
|
||||
|
|
@ -315,56 +300,13 @@ HWY_NOINLINE void LayerNorm(const XT* x, const WT* HWY_RESTRICT scale,
|
|||
const VF* HWY_RESTRICT pmu = &vmu;
|
||||
const VF* HWY_RESTRICT pvar = &vvar;
|
||||
|
||||
const auto packed_x = MakeSpan(x, size);
|
||||
const auto packed_scale = MakeSpan(scale, size);
|
||||
const auto packed_bias = MakeSpan(bias, size);
|
||||
const auto packed_out = MakeSpan(out, size);
|
||||
|
||||
// Loop body for one vector, called from main loop and remainder loop.
|
||||
const auto norm = [pmu, pvar](VF x, VF s, VF add) HWY_ATTR -> VF {
|
||||
const VF centered = hn::Sub(x, *pmu);
|
||||
const VF mul = hn::Mul(s, *pvar);
|
||||
return hn::MulAdd(centered, mul, add);
|
||||
};
|
||||
|
||||
size_t i = 0;
|
||||
if (size >= 2 * NF) {
|
||||
for (; i <= size - 2 * NF; i += 2 * NF) {
|
||||
VF x0, x1, s0, s1, add0, add1;
|
||||
Decompress2(df, packed_x, i, x0, x1);
|
||||
Decompress2(df, packed_scale, i, s0, s1);
|
||||
Decompress2(df, packed_bias, i, add0, add1);
|
||||
const VF n0 = norm(x0, s0, add0);
|
||||
const VF n1 = norm(x1, s1, add1);
|
||||
Compress2(df, n0, n1, packed_out, i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = size - i;
|
||||
HWY_DASSERT(remaining < 2 * NF);
|
||||
if (HWY_UNLIKELY(remaining != 0)) {
|
||||
HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf_scale[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf_bias[2 * hn::MaxLanes(df)];
|
||||
// Ensure the second vectors are zeroed even if remaining <= NF.
|
||||
hn::Store(hn::Zero(df), df, buf_x + NF);
|
||||
hn::Store(hn::Zero(df), df, buf_scale + NF);
|
||||
hn::Store(hn::Zero(df), df, buf_bias + NF);
|
||||
HWY_ALIGN OT buf_out[2 * hn::MaxLanes(df)];
|
||||
DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
|
||||
DecompressAndZeroPad(df, packed_scale, i, buf_scale, remaining);
|
||||
DecompressAndZeroPad(df, packed_bias, i, buf_bias, remaining);
|
||||
const VF x0 = hn::Load(df, buf_x);
|
||||
const VF x1 = hn::Load(df, buf_x + NF);
|
||||
const VF s0 = hn::Load(df, buf_scale);
|
||||
const VF s1 = hn::Load(df, buf_scale + NF);
|
||||
const VF add0 = hn::Load(df, buf_bias);
|
||||
const VF add1 = hn::Load(df, buf_bias + NF);
|
||||
const VF n0 = norm(x0, s0, add0);
|
||||
const VF n1 = norm(x1, s1, add1);
|
||||
Compress2(df, n0, n1, MakeSpan(buf_out, 2 * NF), 0);
|
||||
hwy::CopyBytes(buf_out, out + i, remaining * sizeof(OT));
|
||||
}
|
||||
Decompress3AndCompressTo(DF(), out, size, x, scale, bias,
|
||||
[pmu, pvar](DF /*df*/, VF x, VF s, VF add)
|
||||
HWY_ATTR -> VF {
|
||||
const VF centered = hn::Sub(x, *pmu);
|
||||
const VF mul = hn::Mul(s, *pvar);
|
||||
return hn::MulAdd(centered, mul, add);
|
||||
});
|
||||
}
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void AddAbsolutePositionalEmbeddings(
|
||||
|
|
@ -541,40 +483,11 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void AddFrom(const XT* HWY_RESTRICT x,
|
|||
PROFILER_ZONE3(p, worker, zone);
|
||||
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<float> df;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
|
||||
const auto packed_x = MakeSpan(x, size);
|
||||
|
||||
size_t i = 0;
|
||||
if (size >= 2 * NF) {
|
||||
for (; i <= size - 2 * NF; i += 2 * NF) {
|
||||
VF x0, x1;
|
||||
Decompress2(df, packed_x, i, x0, x1);
|
||||
VF out0 = hn::Load(df, out + i);
|
||||
VF out1 = hn::Load(df, out + i + NF);
|
||||
hn::Store(hn::Add(x0, out0), df, out + i);
|
||||
hn::Store(hn::Add(x1, out1), df, out + i + NF);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = size - i;
|
||||
const size_t remaining1 = remaining - HWY_MIN(remaining, NF);
|
||||
HWY_DASSERT(remaining < 2 * NF);
|
||||
HWY_DASSERT(remaining1 < NF);
|
||||
if (HWY_UNLIKELY(remaining != 0)) {
|
||||
HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
|
||||
// Ensure the second vector is zeroed even if remaining <= NF.
|
||||
hn::Store(hn::Zero(df), df, buf_x + NF);
|
||||
DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
|
||||
const VF x0 = hn::Load(df, buf_x);
|
||||
const VF x1 = hn::Load(df, buf_x + NF);
|
||||
const VF out0 = hn::LoadN(df, out + i, remaining);
|
||||
const VF out1 = hn::LoadN(df, out + i + NF, remaining1);
|
||||
hn::StoreN(hn::Add(x0, out0), df, out + i, remaining);
|
||||
hn::StoreN(hn::Add(x1, out1), df, out + i + NF, remaining1);
|
||||
}
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
Decompress1AndCompressInplace(DF(), out, size, x,
|
||||
[&](DF /*df*/, VF out, VF x)
|
||||
HWY_ATTR -> VF { return hn::Add(x, out); });
|
||||
}
|
||||
|
||||
// Simple loops unless/until batch sizes are large enough to parallelize.
|
||||
|
|
@ -588,7 +501,7 @@ void RMSNormBatched(const MatPtrT<XT>& activations, const MatPtr& weights,
|
|||
CallUpcasted(&weights, [&](const auto* weights_t) {
|
||||
SmallParallelFor(
|
||||
activations.Rows(), ctx.pools, [&](uint64_t token_idx, size_t worker) {
|
||||
RMSNorm(activations.Row(token_idx), weights_t->PackedScale1(), 0,
|
||||
RMSNorm(activations.Row(token_idx), weights_t->PackedScale1(),
|
||||
out.Row(token_idx), activations.Cols(), ctx.profiler, worker);
|
||||
});
|
||||
});
|
||||
|
|
@ -603,7 +516,7 @@ void RMSNormInplaceBatched(const MatPtr& weights, MatPtrT<XT>& inout,
|
|||
CallUpcasted(&weights, [&](const auto* weights_t) {
|
||||
SmallParallelFor(
|
||||
inout.Rows(), ctx.pools, [&](uint64_t token_idx, size_t worker) {
|
||||
RMSNormInplace(weights_t->PackedScale1(), 0, inout.Row(token_idx),
|
||||
RMSNormInplace(weights_t->PackedScale1(), inout.Row(token_idx),
|
||||
inout.Cols(), ctx.profiler, worker);
|
||||
});
|
||||
});
|
||||
|
|
@ -645,38 +558,15 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConst(const float c, XT* HWY_RESTRICT x,
|
|||
static const auto zone = p.AddZone("Ops.MulByConst");
|
||||
PROFILER_ZONE3(p, worker, zone);
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<float> df;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
|
||||
const VF v_c = hn::Set(df, c);
|
||||
const auto packed_x = MakeSpan(x, size);
|
||||
const VF vc = hn::Set(DF(), c);
|
||||
const VF* HWY_RESTRICT pc = &vc;
|
||||
|
||||
size_t i = 0;
|
||||
if (size >= 2 * NF) {
|
||||
for (; i <= size - 2 * NF; i += 2 * NF) {
|
||||
VF x0, x1;
|
||||
Decompress2(df, packed_x, i, x0, x1);
|
||||
x0 = hn::Mul(x0, v_c);
|
||||
x1 = hn::Mul(x1, v_c);
|
||||
Compress2(df, x0, x1, packed_x, i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = size - i;
|
||||
HWY_DASSERT(remaining < 2 * NF);
|
||||
if (HWY_UNLIKELY(remaining != 0)) {
|
||||
HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
|
||||
// Ensure the second vector is zeroed even if remaining <= NF.
|
||||
hn::Store(hn::Zero(df), df, buf_x + NF);
|
||||
DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
|
||||
VF x0 = hn::Load(df, buf_x);
|
||||
VF x1 = hn::Load(df, buf_x + NF);
|
||||
x0 = hn::Mul(x0, v_c);
|
||||
x1 = hn::Mul(x1, v_c);
|
||||
Compress2(df, x0, x1, MakeSpan(buf_x, 2 * NF), 0);
|
||||
hwy::CopyBytes(buf_x, x + i, remaining * sizeof(XT));
|
||||
}
|
||||
DecompressAndCompressInplace(DF(), x, size,
|
||||
[pc](DF /*df*/, VF x)
|
||||
HWY_ATTR -> VF { return hn::Mul(x, *pc); });
|
||||
}
|
||||
|
||||
// Same as above, but with a separate output. Same as below without the add.
|
||||
|
|
@ -687,42 +577,18 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstTo(
|
|||
static const auto zone = p.AddZone("Ops.MulByConstTo");
|
||||
PROFILER_ZONE3(p, worker, zone);
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<float> df;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
|
||||
const VF v_c = hn::Set(df, c);
|
||||
const auto packed_x = MakeSpan(x, size);
|
||||
const auto packed_out = MakeSpan(out, size);
|
||||
const VF vc = hn::Set(DF(), c);
|
||||
const VF* HWY_RESTRICT pc = &vc;
|
||||
|
||||
size_t i = 0;
|
||||
if (size >= 2 * NF) {
|
||||
for (; i <= size - 2 * NF; i += 2 * NF) {
|
||||
VF x0, x1;
|
||||
Decompress2(df, packed_x, i, x0, x1);
|
||||
const VF out0 = hn::Mul(x0, v_c);
|
||||
const VF out1 = hn::Mul(x1, v_c);
|
||||
Compress2(df, out0, out1, packed_out, i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = size - i;
|
||||
HWY_DASSERT(remaining < 2 * NF);
|
||||
if (HWY_UNLIKELY(remaining != 0)) {
|
||||
HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
|
||||
// Ensure the second vector is zeroed even if remaining <= NF.
|
||||
hn::Store(hn::Zero(df), df, buf_x + NF);
|
||||
DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
|
||||
const VF x0 = hn::Load(df, buf_x);
|
||||
const VF x1 = hn::Load(df, buf_x + NF);
|
||||
const VF out0 = hn::Mul(x0, v_c);
|
||||
const VF out1 = hn::Mul(x1, v_c);
|
||||
Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
|
||||
hwy::CopyBytes(buf_out, out + i, remaining * sizeof(OT));
|
||||
}
|
||||
Decompress1AndCompressTo(DF(), out, size, x,
|
||||
[pc](DF /*df*/, VF x)
|
||||
HWY_ATTR -> VF { return hn::Mul(x, *pc); });
|
||||
}
|
||||
|
||||
// out[i] += x[i] * c.
|
||||
template <typename XT, typename OT>
|
||||
HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAdd(
|
||||
const float c, const XT* HWY_RESTRICT x, OT* HWY_RESTRICT out,
|
||||
|
|
@ -730,45 +596,16 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAdd(
|
|||
static const auto zone = p.AddZone("Ops.MulByConstAndAdd");
|
||||
PROFILER_ZONE3(p, worker, zone);
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
const hn::ScalableTag<float> df;
|
||||
const size_t NF = hn::Lanes(df);
|
||||
using VF = hn::Vec<decltype(df)>;
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
|
||||
const VF v_c = hn::Set(df, c);
|
||||
const auto packed_x = MakeSpan(x, size);
|
||||
const auto packed_out = MakeSpan(out, size);
|
||||
const VF vc = hn::Set(DF(), c);
|
||||
const VF* HWY_RESTRICT pc = &vc;
|
||||
|
||||
size_t i = 0;
|
||||
if (size >= 2 * NF) {
|
||||
for (; i <= size - 2 * NF; i += 2 * NF) {
|
||||
VF x0, x1, out0, out1;
|
||||
Decompress2(df, packed_x, i, x0, x1);
|
||||
Decompress2(df, packed_out, i, out0, out1);
|
||||
out0 = hn::MulAdd(x0, v_c, out0);
|
||||
out1 = hn::MulAdd(x1, v_c, out1);
|
||||
Compress2(df, out0, out1, packed_out, i);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t remaining = size - i;
|
||||
HWY_DASSERT(remaining < 2 * NF);
|
||||
if (HWY_UNLIKELY(remaining != 0)) {
|
||||
HWY_ALIGN float buf_x[2 * hn::MaxLanes(df)];
|
||||
HWY_ALIGN float buf_out[2 * hn::MaxLanes(df)];
|
||||
// Ensure the second vectors are zeroed even if remaining <= NF.
|
||||
hn::Store(hn::Zero(df), df, buf_x + NF);
|
||||
hn::Store(hn::Zero(df), df, buf_out + NF);
|
||||
DecompressAndZeroPad(df, packed_x, i, buf_x, remaining);
|
||||
DecompressAndZeroPad(df, packed_out, i, buf_out, remaining);
|
||||
const VF x0 = hn::Load(df, buf_x);
|
||||
const VF x1 = hn::Load(df, buf_x + NF);
|
||||
VF out0 = hn::Load(df, buf_out);
|
||||
VF out1 = hn::Load(df, buf_out + NF);
|
||||
out0 = hn::MulAdd(x0, v_c, out0);
|
||||
out1 = hn::MulAdd(x1, v_c, out1);
|
||||
Compress2(df, out0, out1, MakeSpan(buf_out, 2 * NF), 0);
|
||||
hwy::CopyBytes(buf_out, out + i, remaining * sizeof(OT));
|
||||
}
|
||||
Decompress1AndCompressInplace(DF(), out, size, x,
|
||||
[&](DF /*df*/, VF out, VF x) HWY_ATTR -> VF {
|
||||
return hn::MulAdd(x, *pc, out);
|
||||
});
|
||||
}
|
||||
|
||||
// See below for a specialized version for top-1 sampling.
|
||||
|
|
@ -913,15 +750,18 @@ static HWY_NOINLINE void LogitsSoftCap(const float cap, float* HWY_RESTRICT x,
|
|||
PROFILER_ZONE3(p, worker, zone);
|
||||
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
using D = hn::ScalableTag<float>;
|
||||
using V = hn::Vec<D>;
|
||||
using DF = hn::ScalableTag<float>;
|
||||
using VF = hn::Vec<DF>;
|
||||
|
||||
const float inv_cap = 1.0f / cap;
|
||||
const VF vcap = hn::Set(DF(), cap);
|
||||
const VF vinv_cap = hn::Set(DF(), 1.0f / cap);
|
||||
const VF* HWY_RESTRICT pcap = &vcap;
|
||||
const VF* HWY_RESTRICT pinv_cap = &vinv_cap;
|
||||
|
||||
hn::Transform(D(), x, size, [cap, inv_cap](D d, V v) HWY_ATTR {
|
||||
return hn::Mul(hn::Set(d, cap),
|
||||
hn::Tanh(d, hn::Mul(v, hn::Set(d, inv_cap))));
|
||||
});
|
||||
DecompressAndCompressInplace(
|
||||
DF(), x, size, [pcap, pinv_cap](DF d, VF v) HWY_ATTR -> VF {
|
||||
return hn::Mul(*pcap, hn::Tanh(d, hn::Mul(v, *pinv_cap)));
|
||||
});
|
||||
}
|
||||
|
||||
// Calls LogitsSoftCap if cap != 0.0f.
|
||||
|
|
|
|||
479
ops/ops_test.cc
479
ops/ops_test.cc
|
|
@ -47,6 +47,7 @@
|
|||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
#include "hwy/highway.h"
|
||||
// After highway.h
|
||||
#include "compression/test_util-inl.h"
|
||||
#include "ops/ops-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
|
|
@ -83,48 +84,6 @@ T Random(hwy::RandomState& rng) {
|
|||
HWY_MAX(hwy::ConvertScalarTo<double>(hwy::LowestValue<T>()), val));
|
||||
}
|
||||
|
||||
HWY_NOINLINE void SimpleAddFrom(const float* HWY_RESTRICT other,
|
||||
float* HWY_RESTRICT x, size_t size) {
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
x[i] += other[i];
|
||||
}
|
||||
}
|
||||
|
||||
HWY_NOINLINE void SimpleMulBy(const float* HWY_RESTRICT other,
|
||||
float* HWY_RESTRICT x, size_t size) {
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
x[i] *= other[i];
|
||||
}
|
||||
}
|
||||
|
||||
HWY_NOINLINE void SimpleMulByConst(float c, float* HWY_RESTRICT x,
|
||||
size_t size) {
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
x[i] *= c;
|
||||
}
|
||||
}
|
||||
|
||||
HWY_NOINLINE void SimpleMulByConstAndAdd(float c, const float* HWY_RESTRICT x,
|
||||
float* HWY_RESTRICT out, size_t size) {
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
out[i] += x[i] * c;
|
||||
}
|
||||
}
|
||||
|
||||
HWY_NOINLINE void SimpleSoftmax(float* HWY_RESTRICT x, size_t size) {
|
||||
HWY_DASSERT(size != 0);
|
||||
float sum = 0.0;
|
||||
const float maxval = *std::max_element(x, x + size);
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
x[i] = std::exp(x[i] - maxval);
|
||||
sum += x[i];
|
||||
}
|
||||
const float scale = 1.0f / sum;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
x[i] *= scale;
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t k>
|
||||
HWY_NOINLINE std::discrete_distribution<int> SourceCreateDistribution(
|
||||
std::array<float, k>& top_k, float temperature) {
|
||||
|
|
@ -141,7 +100,8 @@ HWY_NOINLINE std::discrete_distribution<int> SourceCreateDistribution(
|
|||
return std::discrete_distribution<int>(std::begin(top_k), std::end(top_k));
|
||||
}
|
||||
|
||||
struct TestAddFrom {
|
||||
class TestAddFrom {
|
||||
public:
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
hwy::RandomState& rng) {
|
||||
|
|
@ -171,9 +131,24 @@ struct TestAddFrom {
|
|||
hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__,
|
||||
__LINE__);
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename T1, typename T2>
|
||||
static HWY_NOINLINE void SimpleAddFrom(const T1* HWY_RESTRICT other,
|
||||
T2* HWY_RESTRICT x, size_t size) {
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
x[i] = hwy::ConvertScalarTo<T2>(hwy::ConvertScalarTo<float>(x[i]) +
|
||||
hwy::ConvertScalarTo<float>(other[i]));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct TestMulByConstAndAdd {
|
||||
void TestAllAddFrom() {
|
||||
hn::ForPartialVectors<ForeachCountAndMisalign<TestAddFrom>>()(float());
|
||||
}
|
||||
|
||||
class TestMulByConstAndAdd {
|
||||
public:
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
hwy::RandomState& rng) {
|
||||
|
|
@ -204,9 +179,27 @@ struct TestMulByConstAndAdd {
|
|||
hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__,
|
||||
__LINE__);
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename T1, typename T2>
|
||||
static HWY_NOINLINE void SimpleMulByConstAndAdd(float c,
|
||||
const T1* HWY_RESTRICT x,
|
||||
T2* HWY_RESTRICT out,
|
||||
size_t size) {
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
out[i] = hwy::ConvertScalarTo<T2>(hwy::ConvertScalarTo<float>(out[i]) +
|
||||
hwy::ConvertScalarTo<float>(x[i]) * c);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct TestMulByConst {
|
||||
void TestAllMulByConstAndAdd() {
|
||||
hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConstAndAdd>>()(
|
||||
float());
|
||||
}
|
||||
|
||||
class TestMulByConst {
|
||||
public:
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
hwy::RandomState& rng) {
|
||||
|
|
@ -234,9 +227,61 @@ struct TestMulByConst {
|
|||
hwy::AssertArraySimilar(e, x, count, hwy::TargetName(HWY_TARGET), __FILE__,
|
||||
__LINE__);
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename T1>
|
||||
static HWY_NOINLINE void SimpleMulByConst(float c, T1* HWY_RESTRICT x,
|
||||
size_t size) {
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
x[i] = hwy::ConvertScalarTo<T1>(hwy::ConvertScalarTo<float>(x[i]) * c);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct TestSoftmax {
|
||||
void TestAllMulByConst() {
|
||||
hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConst>>()(float());
|
||||
}
|
||||
|
||||
struct TestMulByConstTo {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
hwy::RandomState& rng) {
|
||||
if (misalign_b == 0) return;
|
||||
using T = hn::TFromD<D>;
|
||||
|
||||
hwy::AlignedFreeUniquePtr<T[]> px =
|
||||
hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
|
||||
hwy::AlignedFreeUniquePtr<T[]> pe =
|
||||
hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
|
||||
hwy::AlignedFreeUniquePtr<T[]> pactual =
|
||||
hwy::AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
|
||||
HWY_ASSERT(px && pe && pactual);
|
||||
|
||||
T* x = px.get() + misalign_a;
|
||||
T* e = pe.get() + misalign_a;
|
||||
T* actual = pe.get() + misalign_a;
|
||||
|
||||
T constant = Random<T>(rng);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
x[i] = Random<T>(rng);
|
||||
e[i] = hwy::ConvertScalarTo<T>(hwy::ConvertScalarTo<float>(x[i]) *
|
||||
hwy::ConvertScalarTo<float>(constant));
|
||||
}
|
||||
|
||||
MulByConstTo(constant, x, actual, count, hwy::Profiler::Get(),
|
||||
/*worker=*/0);
|
||||
|
||||
hwy::AssertArraySimilar(e, actual, count, hwy::TargetName(HWY_TARGET),
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllMulByConstTo() {
|
||||
hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConstTo>>()(float());
|
||||
}
|
||||
|
||||
class TestSoftmax {
|
||||
public:
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
hwy::RandomState& rng) {
|
||||
|
|
@ -270,8 +315,27 @@ struct TestSoftmax {
|
|||
}
|
||||
ASSERT_NEAR(sum, 1.0, 1e-6);
|
||||
}
|
||||
|
||||
private:
|
||||
static HWY_NOINLINE void SimpleSoftmax(float* HWY_RESTRICT x, size_t size) {
|
||||
HWY_DASSERT(size != 0);
|
||||
float sum = 0.0;
|
||||
const float maxval = *std::max_element(x, x + size);
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
x[i] = std::exp(x[i] - maxval);
|
||||
sum += x[i];
|
||||
}
|
||||
const float scale = 1.0f / sum;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
x[i] *= scale;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllSoftmax() {
|
||||
hn::ForPartialVectors<ForeachCountAndMisalign<TestSoftmax>>()(float());
|
||||
}
|
||||
|
||||
template <size_t k>
|
||||
struct TestCreateDistribution {
|
||||
void operator()(hwy::RandomState& rng) {
|
||||
|
|
@ -291,43 +355,60 @@ struct TestCreateDistribution {
|
|||
}
|
||||
};
|
||||
|
||||
void TestAllAddFrom() {
|
||||
hn::ForPartialVectors<ForeachCountAndMisalign<TestAddFrom>>()(float());
|
||||
}
|
||||
|
||||
void TestAllMulByConst() {
|
||||
hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConst>>()(float());
|
||||
}
|
||||
|
||||
void TestAllMulByConstAndAdd() {
|
||||
hn::ForPartialVectors<ForeachCountAndMisalign<TestMulByConstAndAdd>>()(
|
||||
float());
|
||||
}
|
||||
|
||||
void TestAllSoftmax() {
|
||||
hn::ForPartialVectors<ForeachCountAndMisalign<TestSoftmax>>()(float());
|
||||
}
|
||||
|
||||
void TestAllCreateDistribution() {
|
||||
TestCreateDistribution<2048>();
|
||||
TestCreateDistribution<5000>();
|
||||
}
|
||||
|
||||
void TestSigmoid() {
|
||||
std::vector<float> values;
|
||||
for (int i = -150; i <= 150; ++i) {
|
||||
values.push_back(.1f * i);
|
||||
}
|
||||
std::vector<float> result = values;
|
||||
Sigmoid(result.data(), result.size());
|
||||
struct TestSigmoid {
|
||||
template <typename T, class D>
|
||||
void operator()(T, D) const {
|
||||
std::vector<T> values;
|
||||
for (int i = -150; i <= 150; ++i) {
|
||||
values.push_back(hwy::ConvertScalarTo<T>(.1f * i));
|
||||
}
|
||||
std::vector<T> result = values;
|
||||
Sigmoid(result.data(), result.size());
|
||||
|
||||
for (size_t i = 0; i < values.size(); i++) {
|
||||
const float max_error = 0.00007;
|
||||
float value = values[i];
|
||||
float approx = result[i];
|
||||
float expected = (1 / (1 + std::exp(-values[i])));
|
||||
EXPECT_NEAR(approx, expected, max_error) << "Input: " << value;
|
||||
for (size_t i = 0; i < values.size(); i++) {
|
||||
const float max_error = IsBF16<T>() ? 0.2f : 0.00007f;
|
||||
const float value = hwy::ConvertScalarTo<float>(values[i]);
|
||||
const float actual = hwy::ConvertScalarTo<float>(result[i]);
|
||||
const float expected = (1 / (1 + std::exp(-value)));
|
||||
EXPECT_NEAR(expected, actual, max_error)
|
||||
<< (IsBF16<T>() ? "bf16" : "float");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static HWY_NOINLINE void TestAllSigmoid() {
|
||||
ForeachActivationType1<TestSigmoid>(hn::ScalableTag<float>());
|
||||
}
|
||||
|
||||
struct TestGelu {
|
||||
template <typename T, class D>
|
||||
void operator()(T, D) const {
|
||||
std::vector<T> values;
|
||||
for (int i = -150; i <= 150; ++i) {
|
||||
values.push_back(hwy::ConvertScalarTo<T>(.1f * i));
|
||||
}
|
||||
std::vector<T> result = values;
|
||||
Gelu(result.data(), result.size());
|
||||
|
||||
for (size_t i = 0; i < values.size(); i++) {
|
||||
const float max_error = IsBF16<T>() ? 0.2f : 0.00007f;
|
||||
const float x = hwy::ConvertScalarTo<float>(values[i]);
|
||||
const float actual = hwy::ConvertScalarTo<float>(result[i]);
|
||||
const float expected =
|
||||
x * (0.5f + 0.5f * tanh(x * (0.79788f + 0.035677f * x * x)));
|
||||
EXPECT_NEAR(expected, actual, max_error)
|
||||
<< (IsBF16<T>() ? "bf16" : "float");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static HWY_NOINLINE void TestAllGelu() {
|
||||
ForeachActivationType1<TestGelu>(hn::ScalableTag<float>());
|
||||
}
|
||||
|
||||
static HWY_NOINLINE HWY_MAYBE_UNUSED void ScalarRopeAndMulBy(
|
||||
|
|
@ -421,7 +502,8 @@ void TestRopeAndMulBy() {
|
|||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_NOINLINE float ScalarSquaredL2(const T* HWY_RESTRICT a, size_t size) {
|
||||
static HWY_NOINLINE float ScalarSquaredL2(const T* HWY_RESTRICT a,
|
||||
size_t size) {
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
const float f = hwy::ConvertScalarTo<float>(a[i]);
|
||||
|
|
@ -431,9 +513,11 @@ HWY_NOINLINE float ScalarSquaredL2(const T* HWY_RESTRICT a, size_t size) {
|
|||
}
|
||||
|
||||
// Supports bf16 and f32 inputs/outputs, which can be in-place.
|
||||
// Shared between TestRMSNorm and TestRMSNormInplace.
|
||||
template <typename XT, typename WT, typename OT>
|
||||
HWY_NOINLINE void ScalarRMSNorm(const XT* x, const WT* HWY_RESTRICT weight,
|
||||
OT* out, size_t size) {
|
||||
static HWY_NOINLINE void ScalarRMSNorm(const XT* x,
|
||||
const WT* HWY_RESTRICT weight, OT* out,
|
||||
size_t size) {
|
||||
constexpr float kEps = 1e-6f;
|
||||
float ss = ScalarSquaredL2(x, size);
|
||||
ss = 1.0f / sqrtf(ss / StaticCast<float>(size) + kEps);
|
||||
|
|
@ -445,42 +529,73 @@ HWY_NOINLINE void ScalarRMSNorm(const XT* x, const WT* HWY_RESTRICT weight,
|
|||
}
|
||||
}
|
||||
|
||||
template <typename XT, typename WT, typename OT>
|
||||
void TestRMSNorm(hwy::RandomState& rng) {
|
||||
constexpr size_t kSize = 128;
|
||||
HWY_ALIGN XT vec[kSize];
|
||||
HWY_ALIGN WT weight[kSize];
|
||||
HWY_ALIGN OT expected[kSize];
|
||||
HWY_ALIGN OT actual[kSize];
|
||||
struct TestRMSNorm {
|
||||
template <typename XT, typename WT, typename OT, class D>
|
||||
void operator()(XT, WT, OT, D) const {
|
||||
hwy::RandomState rng;
|
||||
|
||||
for (size_t i = 0; i < kSize; ++i) {
|
||||
vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
|
||||
weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
|
||||
}
|
||||
constexpr size_t kSize = 128;
|
||||
HWY_ALIGN XT vec[kSize];
|
||||
HWY_ALIGN WT weight[kSize];
|
||||
HWY_ALIGN OT expected[kSize];
|
||||
HWY_ALIGN OT actual[kSize];
|
||||
|
||||
ScalarRMSNorm(vec, weight, expected, kSize);
|
||||
RMSNorm(vec, weight, 0, actual, kSize, hwy::Profiler::Get(), /*worker=*/0);
|
||||
for (size_t i = 0; i < kSize; ++i) {
|
||||
vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
|
||||
weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < kSize; i++) {
|
||||
const float e = hwy::ConvertScalarTo<float>(expected[i]);
|
||||
const float a = hwy::ConvertScalarTo<float>(actual[i]);
|
||||
if (!IsNear(e, a, 1e-5f)) {
|
||||
HWY_ABORT("RMSNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
|
||||
TypeName<WT>(), TypeName<OT>(), i, e, a);
|
||||
ScalarRMSNorm(vec, weight, expected, kSize);
|
||||
RMSNorm(vec, weight, actual, kSize, hwy::Profiler::Get(), /*worker=*/0);
|
||||
|
||||
for (size_t i = 0; i < kSize; i++) {
|
||||
const float e = hwy::ConvertScalarTo<float>(expected[i]);
|
||||
const float a = hwy::ConvertScalarTo<float>(actual[i]);
|
||||
if (!IsNear(e, a, 1e-5f)) {
|
||||
HWY_ABORT("RMSNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
|
||||
TypeName<WT>(), TypeName<OT>(), i, e, a);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllRMSNorm() {
|
||||
hwy::RandomState rng;
|
||||
TestRMSNorm<float, float, float>(rng);
|
||||
TestRMSNorm<float, float, BF16>(rng);
|
||||
TestRMSNorm<float, BF16, float>(rng);
|
||||
TestRMSNorm<float, BF16, BF16>(rng);
|
||||
TestRMSNorm<BF16, float, float>(rng);
|
||||
TestRMSNorm<BF16, float, BF16>(rng);
|
||||
TestRMSNorm<BF16, BF16, float>(rng);
|
||||
TestRMSNorm<BF16, BF16, BF16>(rng);
|
||||
ForeachActivationType3<TestRMSNorm>(hn::ScalableTag<float>());
|
||||
}
|
||||
|
||||
struct TestRMSNormInplace {
|
||||
template <typename XT, typename WT, class D>
|
||||
void operator()(XT, WT, D) const {
|
||||
hwy::RandomState rng;
|
||||
|
||||
constexpr size_t kSize = 128;
|
||||
HWY_ALIGN XT expected[kSize];
|
||||
HWY_ALIGN XT actual[kSize];
|
||||
HWY_ALIGN WT weight[kSize];
|
||||
|
||||
for (size_t i = 0; i < kSize; ++i) {
|
||||
expected[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
|
||||
actual[i] = expected[i];
|
||||
weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
|
||||
}
|
||||
|
||||
ScalarRMSNorm(expected, weight, expected, kSize);
|
||||
RMSNormInplace(weight, actual, kSize, hwy::Profiler::Get(),
|
||||
/*worker=*/0);
|
||||
|
||||
for (size_t i = 0; i < kSize; i++) {
|
||||
const float e = hwy::ConvertScalarTo<float>(expected[i]);
|
||||
const float a = hwy::ConvertScalarTo<float>(actual[i]);
|
||||
if (!IsNear(e, a, 1e-5f)) {
|
||||
HWY_ABORT("RMSNormInplace %s %s mismatch at %zu: %E %E\n",
|
||||
TypeName<XT>(), TypeName<WT>(), i, e, a);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllRMSNormInplace() {
|
||||
ForeachActivationType2<TestRMSNormInplace>(hn::ScalableTag<float>());
|
||||
}
|
||||
|
||||
void TestLayerNormSimple() {
|
||||
|
|
@ -497,91 +612,92 @@ void TestLayerNormSimple() {
|
|||
|
||||
for (size_t i = 0; i < kSize; i++) {
|
||||
const float max_error = 1e-6f;
|
||||
float value = values[i];
|
||||
float res = result[i];
|
||||
// out = (x - 0.0) * 1.2 * 0.9999995 + 0.1 = 1.2999994 / -1.0999994;
|
||||
float expected = (i % 2 == 0) ? 1.2999994f : -1.0999994f;
|
||||
EXPECT_NEAR(res, expected, max_error) << "Input: " << value;
|
||||
EXPECT_NEAR(res, expected, max_error);
|
||||
}
|
||||
}
|
||||
|
||||
// Computes mean mu and mean of squares mu2 of a vector. Used in
|
||||
// ScalarLayerNorm.
|
||||
template <typename T>
|
||||
HWY_NOINLINE void ScalarMus(const T* HWY_RESTRICT a, size_t size, double& mu,
|
||||
double& mu2) {
|
||||
HWY_ASSERT(size > 0);
|
||||
double sum = 0.0;
|
||||
double sum2 = 0.0;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
const float f = hwy::ConvertScalarTo<float>(a[i]);
|
||||
sum += f;
|
||||
sum2 += f * f;
|
||||
}
|
||||
mu = sum / size;
|
||||
mu2 = sum2 / size;
|
||||
}
|
||||
class TestLayerNorm {
|
||||
public:
|
||||
template <typename XT, typename WT, typename OT, class D>
|
||||
void operator()(XT, WT, OT, D) const {
|
||||
hwy::RandomState rng;
|
||||
constexpr size_t kSize = 128;
|
||||
XT vec[kSize];
|
||||
WT weight[kSize];
|
||||
WT bias[kSize];
|
||||
OT expected[kSize];
|
||||
OT actual[kSize];
|
||||
|
||||
// Compare py/flax/linen/normalization.py.
|
||||
// out = (x - mean) * scale * rsqrt(var + epsilon) + bias
|
||||
template <typename XT, typename WT, typename OT>
|
||||
HWY_NOINLINE void ScalarLayerNorm(const XT* x, const WT* HWY_RESTRICT scale,
|
||||
const WT* HWY_RESTRICT bias, OT* out,
|
||||
size_t size) {
|
||||
constexpr double kEps = 1e-6;
|
||||
double mu, mu2;
|
||||
ScalarMus(x, size, mu, mu2);
|
||||
double var = mu2 - mu * mu;
|
||||
constexpr double kZero = 0.0;
|
||||
var = HWY_MAX(var, kZero);
|
||||
var = 1.0 / sqrt(var + kEps);
|
||||
for (size_t j = 0; j < size; j++) {
|
||||
const float v = hwy::ConvertScalarTo<float>(x[j]);
|
||||
const float s = hwy::ConvertScalarTo<float>(scale[j]);
|
||||
const float b = hwy::ConvertScalarTo<float>(bias[j]);
|
||||
out[j] = hwy::ConvertScalarTo<OT>((v - mu) * s * var + b);
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < kSize; ++i) {
|
||||
vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
|
||||
weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
|
||||
bias[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
|
||||
}
|
||||
|
||||
template <typename XT, typename WT, typename OT>
|
||||
void TestLayerNorm(hwy::RandomState& rng) {
|
||||
constexpr size_t kSize = 128;
|
||||
XT vec[kSize];
|
||||
WT weight[kSize];
|
||||
WT bias[kSize];
|
||||
OT expected[kSize];
|
||||
OT actual[kSize];
|
||||
double expected_mu, expected_mu2;
|
||||
ScalarMus(vec, kSize, expected_mu, expected_mu2);
|
||||
double actual_mu, actual_mu2;
|
||||
ComputeMoments(vec, kSize, actual_mu, actual_mu2);
|
||||
|
||||
for (size_t i = 0; i < kSize; ++i) {
|
||||
vec[i] = hwy::ConvertScalarTo<XT>(RandomGaussian(rng));
|
||||
weight[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
|
||||
bias[i] = hwy::ConvertScalarTo<WT>(RandomGaussian(rng));
|
||||
}
|
||||
ScalarLayerNorm(vec, weight, bias, expected, kSize);
|
||||
LayerNorm(vec, weight, bias, actual, kSize);
|
||||
|
||||
double expected_mu, expected_mu2;
|
||||
ScalarMus(vec, kSize, expected_mu, expected_mu2);
|
||||
double actual_mu, actual_mu2;
|
||||
ComputeMoments(vec, kSize, actual_mu, actual_mu2);
|
||||
|
||||
ScalarLayerNorm(vec, weight, bias, expected, kSize);
|
||||
LayerNorm(vec, weight, bias, actual, kSize);
|
||||
|
||||
for (size_t i = 0; i < kSize; i++) {
|
||||
const float e = hwy::ConvertScalarTo<float>(expected[i]);
|
||||
const float a = hwy::ConvertScalarTo<float>(actual[i]);
|
||||
if (!IsNear(e, a, 1e-5f)) {
|
||||
HWY_ABORT("LayerNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
|
||||
TypeName<WT>(), TypeName<OT>(), i, e, a);
|
||||
for (size_t i = 0; i < kSize; i++) {
|
||||
const float e = hwy::ConvertScalarTo<float>(expected[i]);
|
||||
const float a = hwy::ConvertScalarTo<float>(actual[i]);
|
||||
if (!IsNear(e, a, 1e-5f)) {
|
||||
HWY_ABORT("LayerNorm %s %s %s mismatch at %zu: %E %E\n", TypeName<XT>(),
|
||||
TypeName<WT>(), TypeName<OT>(), i, e, a);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
// Computes mean mu and mean of squares mu2 of a vector. Used in
|
||||
// ScalarLayerNorm.
|
||||
template <typename T>
|
||||
static HWY_NOINLINE void ScalarMus(const T* HWY_RESTRICT a, size_t size,
|
||||
double& mu, double& mu2) {
|
||||
HWY_ASSERT(size > 0);
|
||||
double sum = 0.0;
|
||||
double sum2 = 0.0;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
const float f = hwy::ConvertScalarTo<float>(a[i]);
|
||||
sum += f;
|
||||
sum2 += f * f;
|
||||
}
|
||||
mu = sum / size;
|
||||
mu2 = sum2 / size;
|
||||
}
|
||||
|
||||
// Compare py/flax/linen/normalization.py.
|
||||
// out = (x - mean) * scale * rsqrt(var + epsilon) + bias
|
||||
template <typename XT, typename WT, typename OT>
|
||||
static HWY_NOINLINE void ScalarLayerNorm(const XT* x,
|
||||
const WT* HWY_RESTRICT scale,
|
||||
const WT* HWY_RESTRICT bias, OT* out,
|
||||
size_t size) {
|
||||
constexpr double kEps = 1e-6;
|
||||
double mu, mu2;
|
||||
ScalarMus(x, size, mu, mu2);
|
||||
double var = mu2 - mu * mu;
|
||||
constexpr double kZero = 0.0;
|
||||
var = HWY_MAX(var, kZero);
|
||||
var = 1.0 / sqrt(var + kEps);
|
||||
for (size_t j = 0; j < size; j++) {
|
||||
const float v = hwy::ConvertScalarTo<float>(x[j]);
|
||||
const float s = hwy::ConvertScalarTo<float>(scale[j]);
|
||||
const float b = hwy::ConvertScalarTo<float>(bias[j]);
|
||||
out[j] = hwy::ConvertScalarTo<OT>((v - mu) * s * var + b);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllLayerNorm() {
|
||||
hwy::RandomState rng;
|
||||
TestLayerNorm<float, float, float>(rng);
|
||||
TestLayerNorm<float, float, BF16>(rng);
|
||||
TestLayerNorm<float, BF16, float>(rng);
|
||||
TestLayerNorm<float, BF16, BF16>(rng);
|
||||
ForeachActivationType3<TestLayerNorm>(hn::ScalableTag<float>());
|
||||
}
|
||||
|
||||
void TestSampleTopK() {
|
||||
|
|
@ -646,12 +762,15 @@ namespace gcpp {
|
|||
HWY_BEFORE_TEST(OpsTest);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllAddFrom);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConst);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstTo);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstAndAdd);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSoftmax);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllCreateDistribution);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestSigmoid);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSigmoid);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllGelu);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestRopeAndMulBy);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllRMSNorm);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllRMSNormInplace);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllLayerNorm);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestLayerNormSimple);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestSampleTopK);
|
||||
|
|
|
|||
Loading…
Reference in New Issue