mirror of https://github.com/google/gemma.cpp.git
use hwy/simd for RMSNorm(f, bf, f) calculation
This commit is contained in:
parent
bb9b023502
commit
5f016fb433
28
ops.h
28
ops.h
|
|
@ -362,12 +362,30 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm(
|
|||
static HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm(
|
||||
const float* HWY_RESTRICT x, const hwy::bfloat16_t* HWY_RESTRICT weight,
|
||||
float* HWY_RESTRICT out, size_t size) {
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
|
||||
constexpr float eps = 1e-6f;
|
||||
float ss = SquaredL2(x, size);
|
||||
ss = 1.0f / sqrtf(ss / StaticCast<float>(size) + eps);
|
||||
for (size_t j = 0; j < size; j++) {
|
||||
// Note 1.0f centering here
|
||||
out[j] = (1.0f + hwy::F32FromBF16(weight[j])) * (ss * x[j]);
|
||||
constexpr size_t unroll_size = 2;
|
||||
|
||||
const hn::ScalableTag<hwy::bfloat16_t> dbf;
|
||||
const hn::Repartition<float, decltype(dbf)> df32;
|
||||
const size_t N32 = hn::Lanes(df32);
|
||||
|
||||
const float ss = SquaredL2(x, size);
|
||||
const auto vss =
|
||||
hn::Set(df32, 1.0f / sqrtf(ss / StaticCast<float>(size) + eps));
|
||||
|
||||
HWY_DASSERT(size % (unroll_size * MaxLanes(df32)) == 0);
|
||||
for (size_t i = 0; i < size; i += unroll_size * N32) {
|
||||
const hn::Vec<decltype(dbf)> w16 = hn::LoadU(dbf, weight + i);
|
||||
const auto w0 = hn::PromoteLowerTo(df32, w16);
|
||||
const auto w1 = hn::PromoteUpperTo(df32, w16);
|
||||
const auto m0 = hn::Mul(vss, hn::LoadU(df32, x + i));
|
||||
const auto m1 = hn::Mul(vss, hn::LoadU(df32, x + i + N32));
|
||||
|
||||
// (1+weight) * m = m + weight*m = one FMA.
|
||||
hn::StoreU(hn::MulAdd(m0, w0, m0), df32, out + i);
|
||||
hn::StoreU(hn::MulAdd(m1, w1, m1), df32, out + i + N32);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue