From 507d64e3e635fd434d03b50d552570a68afd0d6c Mon Sep 17 00:00:00 2001 From: enum-class Date: Tue, 5 Mar 2024 17:37:09 +0800 Subject: [PATCH 1/9] use hwy/simd for SquaredL2 calculation --- ops.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/ops.h b/ops.h index 8f92d82..d7a3ee1 100644 --- a/ops.h +++ b/ops.h @@ -340,11 +340,21 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED float Dot(const float* HWY_RESTRICT a, // = Dot(a, a, size), but that is not allowed due to HWY_RESTRICT. static HWY_NOINLINE HWY_MAYBE_UNUSED float SquaredL2( const float* HWY_RESTRICT a, size_t size) { - float total = 0.f; - for (size_t i = 0; i < size; ++i) { - total += a[i] * a[i]; + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + HWY_DASSERT(size >= N); + HWY_DASSERT(size % (2 * N) == 0); + + auto sum0 = hn::Zero(d); + auto sum1 = hn::Zero(d); + for (size_t i = 0; i + 2 * N <= size; i += 2 * N) { + const auto a0 = LoadU(d, a + i); + sum0 = MulAdd(a0, a0, sum0); + const auto a1 = LoadU(d, a + i + N); + sum1 = MulAdd(a1, a1, sum1); } - return total; + + return ReduceSum(d, Add(sum0, sum1)); } static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm( From 5f016fb433ff9ee9fbdc9834697de9256a05b84c Mon Sep 17 00:00:00 2001 From: enum-class Date: Tue, 5 Mar 2024 17:53:52 +0800 Subject: [PATCH 2/9] use hwy/simd for RMSNorm(f, bf, f) calculation --- ops.h | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/ops.h b/ops.h index 8f92d82..1919ac9 100644 --- a/ops.h +++ b/ops.h @@ -362,12 +362,30 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm( static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm( const float* HWY_RESTRICT x, const hwy::bfloat16_t* HWY_RESTRICT weight, float* HWY_RESTRICT out, size_t size) { + namespace hn = hwy::HWY_NAMESPACE; + constexpr float eps = 1e-6f; - float ss = SquaredL2(x, size); - ss = 1.0f / sqrtf(ss / StaticCast(size) + eps); - for (size_t j = 0; j < size; j++) { - // Note 1.0f centering here - out[j] = (1.0f + hwy::F32FromBF16(weight[j])) * (ss * x[j]); + constexpr size_t unroll_size = 2; + + const hn::ScalableTag dbf; + const hn::Repartition df32; + const size_t N32 = hn::Lanes(df32); + + const float ss = SquaredL2(x, size); + const auto vss = + hn::Set(df32, 1.0f / sqrtf(ss / StaticCast(size) + eps)); + + HWY_DASSERT(size % (unroll_size * MaxLanes(df32)) == 0); + for (size_t i = 0; i < size; i += unroll_size * N32) { + const hn::Vec w16 = hn::LoadU(dbf, weight + i); + const auto w0 = hn::PromoteLowerTo(df32, w16); + const auto w1 = hn::PromoteUpperTo(df32, w16); + const auto m0 = hn::Mul(vss, hn::LoadU(df32, x + i)); + const auto m1 = hn::Mul(vss, hn::LoadU(df32, x + i + N32)); + + // (1+weight) * m = m + weight*m = one FMA. + hn::StoreU(hn::MulAdd(m0, w0, m0), df32, out + i); + hn::StoreU(hn::MulAdd(m1, w1, m1), df32, out + i + N32); } } From bc845515b7261284fd001fd0f481a1593e29ff97 Mon Sep 17 00:00:00 2001 From: enum-class Date: Tue, 5 Mar 2024 20:45:30 +0800 Subject: [PATCH 3/9] fix style, add kCamelCase style for constexpr in clang-tidy --- .clang-tidy | 4 ++++ ops.h | 10 +++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index abcd9d7..497c2e3 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,4 +1,5 @@ FormatStyle: file +WarningsAsErrors: "*" Checks: "-*,\ abseil-*,\ -abseil-string-find-startswith,\ @@ -204,3 +205,6 @@ Checks: "-*,\ -readability-uppercase-literal-suffix,\ -readability-use-anyofallof " +CheckOptions: + - { key: readability-identifier-naming.ConstexprVariableCase, value: CamelCase } + - { key: readability-identifier-naming.ConstexprVariablePrefix, value: k } diff --git a/ops.h b/ops.h index 1919ac9..5d34b3a 100644 --- a/ops.h +++ b/ops.h @@ -364,8 +364,8 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm( float* HWY_RESTRICT out, size_t size) { namespace hn = hwy::HWY_NAMESPACE; - constexpr float eps = 1e-6f; - constexpr size_t unroll_size = 2; + constexpr float kEps = 1e-6f; + constexpr size_t kUnrollSize = 2; const hn::ScalableTag dbf; const hn::Repartition df32; @@ -373,10 +373,10 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm( const float ss = SquaredL2(x, size); const auto vss = - hn::Set(df32, 1.0f / sqrtf(ss / StaticCast(size) + eps)); + hn::Set(df32, 1.0f / sqrtf(ss / StaticCast(size) + kEps)); - HWY_DASSERT(size % (unroll_size * MaxLanes(df32)) == 0); - for (size_t i = 0; i < size; i += unroll_size * N32) { + HWY_DASSERT(size % (kUnrollSize * MaxLanes(df32)) == 0); + for (size_t i = 0; i < size; i += kUnrollSize * N32) { const hn::Vec w16 = hn::LoadU(dbf, weight + i); const auto w0 = hn::PromoteLowerTo(df32, w16); const auto w1 = hn::PromoteUpperTo(df32, w16); From 843d9b0e1fc14729f7a5b19c2f172aaa0379734b Mon Sep 17 00:00:00 2001 From: enum-class Date: Wed, 6 Mar 2024 08:25:50 +0800 Subject: [PATCH 4/9] fix for-loop bounderies --- ops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ops.h b/ops.h index d7a3ee1..179001c 100644 --- a/ops.h +++ b/ops.h @@ -342,12 +342,12 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED float SquaredL2( const float* HWY_RESTRICT a, size_t size) { const hn::ScalableTag d; const size_t N = hn::Lanes(d); - HWY_DASSERT(size >= N); + HWY_DASSERT(size >= 2 * N); HWY_DASSERT(size % (2 * N) == 0); auto sum0 = hn::Zero(d); auto sum1 = hn::Zero(d); - for (size_t i = 0; i + 2 * N <= size; i += 2 * N) { + for (size_t i = 0; i <= size - 2 * N; i += 2 * N) { const auto a0 = LoadU(d, a + i); sum0 = MulAdd(a0, a0, sum0); const auto a1 = LoadU(d, a + i + N); From fce5c8c967212f7cedd591633e0532a51d146601 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Thu, 7 Mar 2024 22:18:46 -0800 Subject: [PATCH 5/9] Avoid fadvise on older Android. Fixes #84 PiperOrigin-RevId: 613815953 --- compression/blob_store.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compression/blob_store.cc b/compression/blob_store.cc index e088fc6..050dfbd 100644 --- a/compression/blob_store.cc +++ b/compression/blob_store.cc @@ -341,7 +341,7 @@ BlobError BlobReader::Open(const char* filename) { #endif if (fd_ < 0) return __LINE__; -#if HWY_OS_LINUX +#if HWY_OS_LINUX && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 21) // Doubles the readahead window, which seems slightly faster when cached. (void)posix_fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); #endif From 415464b047829d4fd64a6b7fde2b1c0ba843b793 Mon Sep 17 00:00:00 2001 From: austinvhuang Date: Sun, 10 Mar 2024 15:41:17 -0400 Subject: [PATCH 6/9] fix CMakeLists typo --- examples/hello_world/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/hello_world/CMakeLists.txt b/examples/hello_world/CMakeLists.txt index 397b957..9d44f04 100644 --- a/examples/hello_world/CMakeLists.txt +++ b/examples/hello_world/CMakeLists.txt @@ -35,7 +35,7 @@ if (BUILD_MODE STREQUAL "local") else() FetchContent_Declare(gemma GIT_REPOSITORY https://github.com/google/gemma.cpp.git GIT_TAG 8c7b2cf61b9794b806de091685dc6739dd3db837) endif() -FetchContent_MakeAvailabl(gemma) +FetchContent_MakeAvailable(gemma) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") From 0d406061c0e78b60c151ff0ba6af2d24ffc40d5e Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Mon, 11 Mar 2024 21:57:35 -0700 Subject: [PATCH 7/9] Detect and print build type. Refs #88 PiperOrigin-RevId: 614906000 --- run.cc | 4 ++-- util/app.h | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/run.cc b/run.cc index 507979d..610b824 100644 --- a/run.cc +++ b/run.cc @@ -66,8 +66,8 @@ void ShowConfig(LoaderArgs& loader, InferenceArgs& inference, AppArgs& app) { << std::thread::hardware_concurrency() << std::endl << "Instruction set : " << hwy::TargetName(hwy::DispatchedTarget()) << " (" - << hwy::VectorBytes() * 8 << " bits)" - << "\n" + << hwy::VectorBytes() * 8 << " bits)" << "\n" + << "Compiled config : " << CompiledConfig() << "\n" << "Weight Type : " << gcpp::TypeName(gcpp::WeightT()) << "\n" << "EmbedderInput Type : " diff --git a/util/app.h b/util/app.h index 7f926a5..79956be 100644 --- a/util/app.h +++ b/util/app.h @@ -36,6 +36,24 @@ namespace gcpp { +static inline const char* CompiledConfig() { + if (HWY_IS_ASAN) { + return "asan"; + } else if (HWY_IS_MSAN) { + return "msan"; + } else if (HWY_IS_TSAN) { + return "tsan"; +#if defined(HWY_IS_UBSAN) + } else if (HWY_IS_UBSAN) { + return "ubsan"; +#endif + } else if (HWY_IS_DEBUG_BUILD) { + return "dbg"; + } else { + return "opt"; + } +} + static inline void PinThreadToCore(size_t cpu_index) { #if HWY_OS_LINUX // Forces the thread to run on the logical processor with the same number. From 9345b0aed5df1afa41120b0feacd4e827d6139a5 Mon Sep 17 00:00:00 2001 From: Austin Huang Date: Tue, 12 Mar 2024 09:40:34 -0700 Subject: [PATCH 8/9] Fix bazel build failure: https://github.com/google/gemma.cpp/actions/runs/8252325981/job/22571549312 PiperOrigin-RevId: 615073081 --- BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/BUILD.bazel b/BUILD.bazel index cc5104c..d2b96bc 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -89,7 +89,9 @@ cc_binary( ":app", ":args", ":gemma_lib", + # copybara:strip_begin "//base", + # copybara:strip_end "//compression:compress", "@hwy//:hwy", "@hwy//:nanobenchmark", From a9aa63fd2ea6b786ed0706d619588bfe2d43370e Mon Sep 17 00:00:00 2001 From: Austin Huang Date: Tue, 12 Mar 2024 10:42:50 -0700 Subject: [PATCH 9/9] Fix bazel build attempt #2 (remove //base through automation change). https://github.com/google/gemma.cpp/actions/runs/8252325981/job/22571549312 PiperOrigin-RevId: 615097246 --- BUILD.bazel | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/BUILD.bazel b/BUILD.bazel index d2b96bc..885aff5 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -69,7 +69,7 @@ cc_library( deps = [ ":args", ":transformer_ops", - "//base", + # "//base", "//compression:compress", "@hwy//:hwy", "@hwy//:matvec", @@ -89,9 +89,7 @@ cc_binary( ":app", ":args", ":gemma_lib", - # copybara:strip_begin - "//base", - # copybara:strip_end + # "//base", "//compression:compress", "@hwy//:hwy", "@hwy//:nanobenchmark",