// Copyright 2024 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Include guard for non-SIMD code.
#ifndef THIRD_PARTY_GEMMA_CPP_OPS_OPS_INL_H_
#define THIRD_PARTY_GEMMA_CPP_OPS_OPS_INL_H_

// NOTE(review): the nine system-header names below were destroyed by
// angle-bracket stripping; this set is reconstructed from the identifiers
// used in this file (sqrtf/logf/expf, int32_t, size_t, std::enable_if_t) —
// confirm against upstream.
#include <math.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include <algorithm>
#include <array>
#include <cmath>
#include <type_traits>  // std::enable_if_t
#include <vector>

#include "ops/matmul.h"
#include "ops/ops.h"
#include "util/allocator.h"
#include "util/basics.h"  // TokenAndProb, RngStream
#include "util/mat.h"
#include "util/threading_context.h"
#include "util/zones.h"
#include "hwy/base.h"
#include "hwy/bit_set.h"
#include "hwy/contrib/sort/order.h"
#include "hwy/contrib/sort/vqsort.h"
#include "hwy/detect_targets.h"
#include "hwy/profiler.h"

#endif  // THIRD_PARTY_GEMMA_CPP_OPS_OPS_INL_H_

// Include guard for (potentially) SIMD code.
#if defined(THIRD_PARTY_GEMMA_CPP_OPS_TOGGLE) == defined(HWY_TARGET_TOGGLE)
#ifdef THIRD_PARTY_GEMMA_CPP_OPS_TOGGLE
#undef THIRD_PARTY_GEMMA_CPP_OPS_TOGGLE
#else
#define THIRD_PARTY_GEMMA_CPP_OPS_TOGGLE
#endif

#include "compression/compress-inl.h"
#include "ops/dot-inl.h"
#include "ops/matmul_static.h"  // includes highway.h
#include "ops/sum-inl.h"
#include "hwy/contrib/algo/transform-inl.h"
#include "hwy/contrib/math/math-inl.h"

HWY_BEFORE_NAMESPACE();
namespace gcpp {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;

// Computes C = A * B + add via MatMulStatic.
// This function uses CallUpcasted to dispatch to the correct MatMulStatic // instantiation based on the runtime type of B. template MMPerKey* CallMatMul(const MatPtrT& A, const MatPtr& B, const float* HWY_RESTRICT add, MatMulEnv& env, MatPtrT& C, const MMOptions& options = MMOptions()) { return CallUpcasted(&B, [&](const auto* B_t) { return MatMulStatic(A, *B_t, add, env, C, options); }); } static inline void CallTwoMatMul(const MatPtrT& A, const MatPtr& B1, const MatPtr& B2, MatMulEnv& env, MatPtrT& C, const MMOptions& options) { return CallUpcastedSame(&B1, &B2, [&](const auto* B1_t, const auto* B2_t) { return TwoMatMulStatic(A, *B1_t, *B2_t, env, C, options); }); } HWY_INLINE double PackTokenAndProb(int32_t token, float prob) { // casting prob from float to double just makes some changes to the // exponent bias and pads zeros in the mantissa. double packed = static_cast(prob); int64_t packed_int64; hwy::CopySameSize(&packed, &packed_int64); // stuff the token into the lower 32 bits of packed_int64. (it is an int32_t // anyway) packed_int64 &= 0xFFFFFFFF00000000; packed_int64 |= token; // copy bytes back into packed. hwy::CopySameSize(&packed_int64, &packed); return packed; } HWY_INLINE TokenAndProb UnpackTokenAndProb(double packed) { TokenAndProb tp; int64_t packed_int64; hwy::CopySameSize(&packed, &packed_int64); tp.token = static_cast(packed_int64 & 0xFFFFFFFFULL); // clear the lower 32 bits of packed_int64 before copying back into packed. packed_int64 &= 0xFFFFFFFF00000000ULL; hwy::CopySameSize(&packed_int64, &packed); tp.prob = static_cast(packed); return tp; } template HWY_INLINE constexpr std::enable_if_t< std::is_arithmetic_v && std::is_arithmetic_v, To> StaticCast(From from) noexcept { if constexpr (std::is_unsigned_v && std::is_floating_point_v) { return static_cast( static_cast>(from)); } else { return static_cast(from); } } // We use the tanh approximation for gelu (also used in training). 
// gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3))) // = 0.5 * x * (1 + tanh(x * (sqrt(2/π) + sqrt(2/π) * 0.044715 * x^2))) // = 0.5 * x * (1 + tanh(x * (0.79788 + 0.035677 * x^2))) // = x * (0.5 + 0.5 * tanh(x * (0.79788 + 0.035677 * x^2)))) template HWY_INLINE hn::Vec Gelu(D d, hn::Vec v) { const hn::Vec kMul = hn::Set(d, 0.03567740813636141f); const hn::Vec kSqrt2OverPi = hn::Set(d, 0.797884560804236f); const hn::Vec kHalf = hn::Set(d, 0.5f); const hn::Vec v2 = hn::Mul(v, v); const hn::Vec arg = hn::Mul(v, hn::MulAdd(kMul, v2, kSqrt2OverPi)); const hn::Vec cdf = hn::MulAdd(kHalf, hn::Tanh(d, arg), kHalf); return hn::Mul(v, cdf); } // Activation already has a profiler zone. template static HWY_NOINLINE HWY_MAYBE_UNUSED void Gelu(T* HWY_RESTRICT x, size_t size) { namespace hn = hwy::HWY_NAMESPACE; using DF = hn::ScalableTag; using VF = hn::Vec; DecompressAndCompressInplace( DF(), x, size, [](DF d, VF v) HWY_ATTR -> VF { return Gelu(d, v); }); } template HWY_INLINE hn::Vec Sigmoid(D d, hn::Vec v) { using VF = hn::Vec; // Chebyshev polynomial coefficients for rational approximation const VF c0 = hn::Set(d, 0.00949107017368078f); const VF c1 = hn::Set(d, 0.0654858946800232f); const VF c2 = hn::Set(d, 0.231547489762306f - 0.00949107017368078f); const VF c3 = hn::Set(d, 0.530778527259827f); const VF c4 = hn::Set(d, 0.855334937572479f); const VF c5 = hn::Set(d, 0.500000894069672f); const VF d0 = hn::Set(d, 0.130970627069473f); const VF d1 = hn::Set(d, 3.99615288415589e-07f); const VF d2 = hn::Set(d, 1.06155431270599f - 0.130970627069473f); const VF d3 = hn::Set(d, 1.35144250634767e-06f); const VF d4 = hn::Set(d, 1); // The approximation works in range -12..12, but the input value is clamped // in -11.5..11.5 since the approximation slightly overshoots after that. // The function is nearly 0 for input values below -11.5 and nearly 1 for // input values above 11.5. 
const VF invtwelve = hn::Set(d, 1.0f / 12.0f); const VF lo = hn::Set(d, -11.5f); const VF hi = hn::Set(d, 11.5f); VF f = hn::Clamp(v, lo, hi); f = hn::Mul(f, invtwelve); VF f2 = hn::Add(f, f); VF a1 = hn::MulAdd(f2, c0, c1); VF a2 = hn::MulAdd(f2, a1, c2); VF a3 = hn::Sub(hn::MulAdd(f2, a2, c3), a1); VF a4 = hn::Sub(hn::MulAdd(f2, a3, c4), a2); VF f0 = hn::Sub(hn::MulAdd(f, a4, c5), a3); VF b1 = hn::MulAdd(f2, d0, d1); VF b2 = hn::MulAdd(f2, b1, d2); VF b3 = hn::Sub(hn::MulAdd(f2, b2, d3), b1); VF f1 = hn::Sub(hn::MulAdd(f, b3, d4), b2); return hn::Div(f0, f1); } // Sigmoid using the logistic function 1 / (1 + exp(-x[i])) template static HWY_NOINLINE HWY_MAYBE_UNUSED void Sigmoid(T* HWY_RESTRICT x, size_t size) { PROFILER_ZONE("ops.Sigmoid"); namespace hn = hwy::HWY_NAMESPACE; using DF = hn::ScalableTag; using VF = hn::Vec; DecompressAndCompressInplace( DF(), x, size, [](DF d, VF v) HWY_ATTR -> VF { return Sigmoid(d, v); }); } namespace detail { // Shared by RMSNorm and RMSNormInplace. template float RMSNormMul(const VT* HWY_RESTRICT x, const size_t size, ThreadingContext& ctx, const size_t worker) { GCPP_ZONE(ctx, worker, Zones::kOpsRmsNormMul); const hn::ScalableTag d; const float l2 = DecompressAndCall(d, MakeSpan(x, size), DotKernelDefault()); constexpr float kEps = 1e-6f; // avoid divide by zero return 1.0f / sqrtf(l2 / StaticCast(size) + kEps); } } // namespace detail template HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNorm( const XT* HWY_RESTRICT x, const WT* HWY_RESTRICT weight, const size_t w_ofs, OT* HWY_RESTRICT out, const size_t size, ThreadingContext& ctx, const size_t worker) { GCPP_ZONE(ctx, worker, Zones::kOpsRmsNorm); namespace hn = hwy::HWY_NAMESPACE; using DF = hn::ScalableTag; using VF = hn::Vec; const VF mul = hn::Set(DF(), detail::RMSNormMul(x, size, ctx, worker)); const VF* HWY_RESTRICT pmul = &mul; Decompress2AndCompressTo(DF(), out, size, x, weight, w_ofs, [pmul](DF /*df*/, VF vx, VF vw) HWY_ATTR -> VF { const VF m = hn::Mul(*pmul, vx); // 
(1+weight) * m = m + weight*m = one FMA. return hn::MulAdd(m, vw, m); }); } // Same as RMSNorm, but its HWY_RESTRICT forbids passing the same pointer. template HWY_NOINLINE HWY_MAYBE_UNUSED void RMSNormInplace( const WT* HWY_RESTRICT weight, const size_t w_ofs, XT* HWY_RESTRICT inout, const size_t size, ThreadingContext& ctx, const size_t worker) { GCPP_ZONE(ctx, worker, Zones::kOpsRmsNormInplace); namespace hn = hwy::HWY_NAMESPACE; using DF = hn::ScalableTag; using VF = hn::Vec; const VF mul = hn::Set(DF(), detail::RMSNormMul(inout, size, ctx, worker)); const VF* HWY_RESTRICT pmul = &mul; Decompress1AndCompressInplace(DF(), inout, size, weight, w_ofs, [pmul](DF /*df*/, VF vx, VF vw) HWY_ATTR -> VF { const VF m = hn::Mul(*pmul, vx); // (1+weight) * m = m + weight*m = one FMA. return hn::MulAdd(m, vw, m); }); } // Computes mean mu and mean of squares mu2 of a vector. Used in LayerNorm. template HWY_NOINLINE void ComputeMoments(const XT* HWY_RESTRICT x, size_t size, double& mu, double& mu2) { HWY_ASSERT(size > 0); const hn::ScalableTag df; // Use the existing Sum and Dot kernels for simplicity. The second pass // is likely not too expensive because it will be in L1. const double sum = Sum(df, x, size); // We only have one array, so calling `DecompressAndCall` instead of `Dot`` // avoids loading the 'second' vector again. const double sum2 = DecompressAndCall(df, MakeSpan(x, size), DotKernelDouble()); const double inv_size = 1.0 / static_cast(size); mu = sum * inv_size; mu2 = sum2 * inv_size; } // Compare py/flax/linen/normalization.py. // out = (x - mean) * scale * rsqrt(var + epsilon) + bias // x and out may be the same. 
// NOTE(review): template parameter lists and vector types in this block are
// reconstructed after angle-bracket stripping.
template <typename XT, typename WT, typename OT>
HWY_NOINLINE void LayerNorm(const XT* x, const WT* HWY_RESTRICT scale,
                            const WT* HWY_RESTRICT bias, OT* out,
                            size_t size) {
  PROFILER_ZONE("ops.LayerNorm");
  namespace hn = hwy::HWY_NAMESPACE;
  using DF = hn::ScalableTag<float>;
  const DF df;
  using VF = hn::Vec<DF>;

  double mu, mu2;
  ComputeMoments(x, size, mu, mu2);
  double var = mu2 - mu * mu;
  var = HWY_MAX(var, 0.0);            // guard against negative from rounding
  var = 1.0 / sqrt(var + 1E-6);       // now the rsqrt(var + eps) factor

  const VF vmu = hn::Set(df, static_cast<float>(mu));
  const VF vvar = hn::Set(df, static_cast<float>(var));
  const VF* HWY_RESTRICT pmu = &vmu;
  const VF* HWY_RESTRICT pvar = &vvar;
  Decompress3AndCompressTo(DF(), out, size, x, scale, bias,
                           [pmu, pvar](DF /*df*/, VF x, VF s, VF add)
                               HWY_ATTR -> VF {
                                 const VF centered = hn::Sub(x, *pmu);
                                 const VF mul = hn::Mul(s, *pvar);
                                 return hn::MulAdd(centered, mul, add);
                               });
}

// Adds sinusoidal position embeddings (Vaswani et al.) in place to x.
static HWY_NOINLINE HWY_MAYBE_UNUSED void AddAbsolutePositionalEmbeddings(
    float* HWY_RESTRICT x, size_t dim_model, size_t pos) {
  PROFILER_ZONE("ops.AddAbsolutePositionalEmbeddings");
  const size_t num_timescales = dim_model / 2;
  const float log_timescale_increment =
      logf(10000.0f) /
      (num_timescales != 0 ? StaticCast<float>(num_timescales - 1) : 1.0f);
  for (size_t dim = 0; dim < num_timescales; ++dim) {
    const float inv_timescale =
        expf(StaticCast<float>(dim) * -log_timescale_increment);
    x[dim] += sinf(StaticCast<float>(pos) * inv_timescale);
    x[num_timescales + dim] += cosf(StaticCast<float>(pos) * inv_timescale);
  }
}

/* RoPE as in Rotary Position Embeddings from the `RoFormer` paper
   (https://arxiv.org/abs/2104.09864v5). The query and key vectors are rotated
   as a function of their absolute position using the rotation matrix R before
   the self-attention operation. R is a d x d matrix.

   R = cos(m*theta_1) -sin(m*theta_1) ...  0              0
       sin(m*theta_1)  cos(m*theta_1)      0              0
       ...
       0               0 ... cos(m*theta_{d/2}) sin(m*theta_{d/2})
       0               0 ... sin(m*theta_{d/2}) cos(m*theta_{d/2})

   Here theta_i = 10000^(-2(i-1)/d), where d is the dimension of the vector
   and i is the ith index of the vector.

   Applying the rotation matrix R to a vector v is equivalent to rotating every
   consecutive pair of dimensions of v i.e. v_{2i} and v_{2i+1} by an angle
   m*theta_i. However in the Gemma implementation we choose to rotate the pairs
   of dimensions v_{i} and v_{i + d//2} instead.

   pos parameter is deliberately an int because in the backward pass we
   call this with negative values (for the VJP calculation we need the transpose
   of this rotation matrix which is simply the same matrix with -pos parameter)
*/
// `inv_timescale[dim_qkv / 2]` is precomputed in AttentionActivations.
// This overload is called if `post_qk == PostQKType::HalfRope`.
static HWY_NOINLINE HWY_MAYBE_UNUSED void Rope(
    float* HWY_RESTRICT x, const size_t dim_qkv,
    const float* HWY_RESTRICT inv_timescale, const int pos,
    ThreadingContext& ctx, const size_t worker) {
  GCPP_ZONE(ctx, worker, Zones::kOpsRope);

  HWY_DASSERT(dim_qkv % 2 == 0);
  const size_t half_dim_qkv = dim_qkv / 2;

  const hn::ScalableTag<float> df;
  const size_t NF = hn::Lanes(df);
  using VF = hn::Vec<decltype(df)>;
  const VF vpos = hn::Set(df, static_cast<float>(pos));

  // Vectorize computation for half_dim_qkv - (half_dim_qkv % Lanes)
  const size_t vectorizable_dims = hwy::RoundDownTo(half_dim_qkv, NF);
  size_t dim = 0;
  for (; dim < vectorizable_dims; dim += NF) {
    const VF vinv_time_scale = hn::LoadU(df, inv_timescale + dim);
    const VF vtheta = hn::Mul(vpos, vinv_time_scale);

    // Compute rotations.
    VF vcos_theta;
    VF vsin_theta;
    hn::SinCos(df, vtheta, vsin_theta, vcos_theta);

    // Scale input with rotations.
    const VF vx0 = hn::LoadU(df, x + dim);
    const VF vx1 = hn::LoadU(df, x + dim + half_dim_qkv);
    const VF vout0 = hn::MulSub(vx0, vcos_theta, hn::Mul(vx1, vsin_theta));
    const VF vout1 = hn::MulAdd(vx0, vsin_theta, hn::Mul(vx1, vcos_theta));

    hn::StoreU(vout0, df, x + dim);
    hn::StoreU(vout1, df, x + dim + half_dim_qkv);
  }

  // Vectorize computation for remaining dims - same as above, but with LoadN.
  const size_t remaining_dims = half_dim_qkv - dim;
  HWY_DASSERT(remaining_dims < NF);  // at most one iteration
  if (remaining_dims != 0) {
    VF vinv_time_scale = hn::LoadN(df, inv_timescale + dim, remaining_dims);
    VF vtheta = hn::Mul(vpos, vinv_time_scale);

    // Compute rotations.
    VF vcos_theta;
    VF vsin_theta;
    hn::SinCos(df, vtheta, vsin_theta, vcos_theta);

    // Scale input with rotations.
    const VF vx0 = hn::LoadN(df, x + dim, remaining_dims);
    const VF vx1 = hn::LoadN(df, x + dim + half_dim_qkv, remaining_dims);
    const VF vout0 = hn::MulSub(vx0, vcos_theta, hn::Mul(vx1, vsin_theta));
    const VF vout1 = hn::MulAdd(vx0, vsin_theta, hn::Mul(vx1, vcos_theta));

    hn::StoreN(vout0, df, x + dim, remaining_dims);
    hn::StoreN(vout1, df, x + dim + half_dim_qkv, remaining_dims);
  }
}

// `inv_timescale[dim_qkv / 2]` is precomputed in AttentionActivations.
// Same as Rope, but fuses a pre-scaling of x by `mul` into the rotation.
static HWY_NOINLINE HWY_MAYBE_UNUSED void RopeAndMulBy(
    const float mul, float* HWY_RESTRICT x, const size_t dim_qkv,
    const float* HWY_RESTRICT inv_timescale, const int pos,
    ThreadingContext& ctx, const size_t worker) {
  GCPP_ZONE(ctx, worker, Zones::kOpsRopeAndMulBy);

  HWY_DASSERT(dim_qkv % 2 == 0);
  const size_t half_dim_qkv = dim_qkv / 2;

  const hn::ScalableTag<float> df;
  const size_t NF = hn::Lanes(df);
  using VF = hn::Vec<decltype(df)>;
  const VF vmul = hn::Set(df, mul);
  const VF vpos = hn::Set(df, static_cast<float>(pos));

  // Vectorize computation for half_dim_qkv - (half_dim_qkv % Lanes)
  const size_t vectorizable_dims = hwy::RoundDownTo(half_dim_qkv, NF);
  size_t dim = 0;
  for (; dim < vectorizable_dims; dim += NF) {
    const VF vinv_time_scale = hn::LoadU(df, inv_timescale + dim);
    const VF vtheta = hn::Mul(vpos, vinv_time_scale);

    // Compute rotations.
    VF vcos_theta;
    VF vsin_theta;
    hn::SinCos(df, vtheta, vsin_theta, vcos_theta);

    // Scale input with rotations and multiply with constant.
    const VF vx0 = hn::Mul(vmul, hn::LoadU(df, x + dim));
    const VF vx1 = hn::Mul(vmul, hn::LoadU(df, x + dim + half_dim_qkv));
    const VF vout0 = hn::MulSub(vx0, vcos_theta, hn::Mul(vx1, vsin_theta));
    const VF vout1 = hn::MulAdd(vx0, vsin_theta, hn::Mul(vx1, vcos_theta));

    hn::StoreU(vout0, df, x + dim);
    hn::StoreU(vout1, df, x + dim + half_dim_qkv);
  }

  // Vectorize computation for remaining dims - same as above, but with LoadN.
  const size_t remaining_dims = half_dim_qkv - dim;
  HWY_DASSERT(remaining_dims < NF);  // at most one iteration
  if (remaining_dims != 0) {
    VF vinv_time_scale = hn::LoadN(df, inv_timescale + dim, remaining_dims);
    VF vtheta = hn::Mul(vpos, vinv_time_scale);

    // Compute rotations.
    VF vcos_theta;
    VF vsin_theta;
    hn::SinCos(df, vtheta, vsin_theta, vcos_theta);

    // Scale input with rotations and multiply with constant.
    const VF vx0 = hn::Mul(vmul, hn::LoadN(df, x + dim, remaining_dims));
    const VF vx1 =
        hn::Mul(vmul, hn::LoadN(df, x + dim + half_dim_qkv, remaining_dims));
    const VF vout0 = hn::MulSub(vx0, vcos_theta, hn::Mul(vx1, vsin_theta));
    const VF vout1 = hn::MulAdd(vx0, vsin_theta, hn::Mul(vx1, vcos_theta));

    hn::StoreN(vout0, df, x + dim, remaining_dims);
    hn::StoreN(vout1, df, x + dim + half_dim_qkv, remaining_dims);
  }
}

// out[i] += x[i], decompressing x on the fly.
template <typename XT>
static HWY_NOINLINE HWY_MAYBE_UNUSED void AddFrom(const XT* HWY_RESTRICT x,
                                                  float* HWY_RESTRICT out,
                                                  const size_t size,
                                                  ThreadingContext& ctx,
                                                  const size_t worker) {
  GCPP_ZONE(ctx, worker, Zones::kOpsAddFrom);

  namespace hn = hwy::HWY_NAMESPACE;
  using DF = hn::ScalableTag<float>;
  using VF = hn::Vec<DF>;
  Decompress1AndCompressInplace(DF(), out, size, x, /*p1_ofs=*/0,
                                [&](DF /*df*/, VF out, VF x) HWY_ATTR -> VF {
                                  return hn::Add(x, out);
                                });
}

// Simple loops unless/until batch sizes are large enough to parallelize.
template void RMSNormBatched(const MatPtrT& activations, const MatPtr& weights, MatPtrT& out, ThreadingContext& ctx, size_t cluster_idx = 0) { HWY_DASSERT(weights.Rows() == 1); HWY_DASSERT(weights.Cols() == activations.Cols()); activations.DebugCheckSameShape(out); CallUpcasted(&weights, [&](const auto* weights_t) { ParallelFor(Parallelism::kFlat, activations.Rows(), ctx, cluster_idx, Callers::kOpsRMSNormBatched, [&](uint64_t token_idx, size_t worker) { RMSNorm(activations.Row(token_idx), weights_t->PackedScale1(), /*w_ofs=*/0, out.Row(token_idx), activations.Cols(), ctx, worker); }); }); } template void RMSNormInplaceBatched(const MatPtr& weights, MatPtrT& inout, ThreadingContext& ctx, size_t cluster_idx = 0) { HWY_DASSERT(weights.Rows() == 1); HWY_DASSERT(weights.Cols() == inout.Cols()); CallUpcasted(&weights, [&](const auto* weights_t) { ParallelFor(Parallelism::kFlat, inout.Rows(), ctx, cluster_idx, Callers::kOpsRMSNormInplaceBatched, [&](uint64_t token_idx, size_t worker) { RMSNormInplace(weights_t->PackedScale1(), /*w_ofs=*/0, inout.Row(token_idx), inout.Cols(), ctx, worker); }); }); } // x and out may be the same. 
template void LayerNormBatched(const MatPtrT& x, const MatPtr& weight, const MatPtr& bias, MatPtrT& out) { HWY_DASSERT(weight.Cols() == bias.Cols()); HWY_DASSERT(weight.Cols() == x.Cols()); HWY_DASSERT(x.SameShape(out)); CallUpcastedSame( &weight, &bias, [&](const auto* weight_t, const auto* bias_t) { for (size_t token_idx = 0; token_idx < x.Rows(); ++token_idx) { LayerNorm(x.Row(token_idx), weight_t->PackedScale1(), bias_t->PackedScale1(), out.Row(token_idx), x.Cols()); } }); } template static HWY_INLINE void AddFromBatched(const MatPtrT& x, MatPtrT& out, ThreadingContext& ctx, size_t cluster_idx = 0) { HWY_DASSERT(out.SameShape(x)); ParallelFor( Parallelism::kFlat, out.Rows(), ctx, cluster_idx, Callers::kOpsAddFromBatched, [&](uint64_t token_idx, size_t worker) { AddFrom(x.Row(token_idx), out.Row(token_idx), x.Cols(), ctx, worker); }); } // No profiler zone because this is short and frequently called. template HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConst(const float c, XT* HWY_RESTRICT x, const size_t size) { namespace hn = hwy::HWY_NAMESPACE; using DF = hn::ScalableTag; using VF = hn::Vec; const VF vc = hn::Set(DF(), c); const VF* HWY_RESTRICT pc = &vc; DecompressAndCompressInplace(DF(), x, size, [pc](DF /*df*/, VF x) HWY_ATTR -> VF { return hn::Mul(x, *pc); }); } // Same as above, but with a separate output. Same as below without the add. template HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstTo( const float c, const XT* HWY_RESTRICT x, OT* HWY_RESTRICT out, const size_t size, ThreadingContext& ctx, const size_t worker) { GCPP_ZONE(ctx, worker, Zones::kOpsMulByConstTo); namespace hn = hwy::HWY_NAMESPACE; using DF = hn::ScalableTag; using VF = hn::Vec; const VF vc = hn::Set(DF(), c); const VF* HWY_RESTRICT pc = &vc; Decompress1AndCompressTo(DF(), out, size, x, [pc](DF /*df*/, VF x) HWY_ATTR -> VF { return hn::Mul(x, *pc); }); } // out[i] += x[i] * c. 
template HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAdd(const float c, const XT* HWY_RESTRICT x, OT* HWY_RESTRICT out, const size_t size) { namespace hn = hwy::HWY_NAMESPACE; using DF = hn::ScalableTag; using VF = hn::Vec; const VF vc = hn::Set(DF(), c); const VF* HWY_RESTRICT pc = &vc; Decompress1AndCompressInplace(DF(), out, size, x, /*p1_ofs=*/0, [&](DF /*df*/, VF out, VF x) HWY_ATTR -> VF { return hn::MulAdd(x, *pc, out); }); } template , HWY_IF_V_SIZE_GT_D(DF, 63)> HWY_INLINE HWY_MAYBE_UNUSED void Mul16(DF df, const VF scale, VF& sum0, VF& sum1, VF& sum2, VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7, VF& sum8, VF& sum9, VF& sum10, VF& sum11, VF& sum12, VF& sum13, VF& sum14, VF& sum15) { sum0 = hn::Mul(sum0, hn::BroadcastLane<0>(scale)); sum1 = hn::Mul(sum1, hn::BroadcastLane<1>(scale)); sum2 = hn::Mul(sum2, hn::BroadcastLane<2>(scale)); sum3 = hn::Mul(sum3, hn::BroadcastLane<3>(scale)); sum4 = hn::Mul(sum4, hn::BroadcastLane<4>(scale)); sum5 = hn::Mul(sum5, hn::BroadcastLane<5>(scale)); sum6 = hn::Mul(sum6, hn::BroadcastLane<6>(scale)); sum7 = hn::Mul(sum7, hn::BroadcastLane<7>(scale)); sum8 = hn::Mul(sum8, hn::BroadcastLane<8>(scale)); sum9 = hn::Mul(sum9, hn::BroadcastLane<9>(scale)); sum10 = hn::Mul(sum10, hn::BroadcastLane<10>(scale)); sum11 = hn::Mul(sum11, hn::BroadcastLane<11>(scale)); sum12 = hn::Mul(sum12, hn::BroadcastLane<12>(scale)); sum13 = hn::Mul(sum13, hn::BroadcastLane<13>(scale)); sum14 = hn::Mul(sum14, hn::BroadcastLane<14>(scale)); sum15 = hn::Mul(sum15, hn::BroadcastLane<15>(scale)); } template , HWY_IF_V_SIZE_LE_D(DF, 63)> HWY_INLINE HWY_MAYBE_UNUSED void Mul16(DF df, const VF scale, VF& sum0, VF& sum1, VF& sum2, VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7, VF& sum8, VF& sum9, VF& sum10, VF& sum11, VF& sum12, VF& sum13, VF& sum14, VF& sum15) {} template , HWY_IF_V_SIZE_GT_D(DF, 31)> HWY_INLINE HWY_MAYBE_UNUSED void Mul8(DF df, const VF scale, VF& sum0, VF& sum1, VF& sum2, VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7) { 
sum0 = hn::Mul(sum0, hn::BroadcastLane<0>(scale)); sum1 = hn::Mul(sum1, hn::BroadcastLane<1>(scale)); sum2 = hn::Mul(sum2, hn::BroadcastLane<2>(scale)); sum3 = hn::Mul(sum3, hn::BroadcastLane<3>(scale)); sum4 = hn::Mul(sum4, hn::BroadcastLane<4>(scale)); sum5 = hn::Mul(sum5, hn::BroadcastLane<5>(scale)); sum6 = hn::Mul(sum6, hn::BroadcastLane<6>(scale)); sum7 = hn::Mul(sum7, hn::BroadcastLane<7>(scale)); } template , HWY_IF_V_SIZE_LE_D(DF, 31)> HWY_INLINE HWY_MAYBE_UNUSED void Mul8(DF df, const VF scale, VF& sum0, VF& sum1, VF& sum2, VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7) {} template , HWY_IF_V_SIZE_GT_D(DF, 63)> HWY_INLINE HWY_MAYBE_UNUSED void MulAdd16( DF df, const VF common, const VF split, VF& sum0, VF& sum1, VF& sum2, VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7, VF& sum8, VF& sum9, VF& sum10, VF& sum11, VF& sum12, VF& sum13, VF& sum14, VF& sum15) { sum0 = hn::MulAdd(common, hn::BroadcastLane<0>(split), sum0); sum1 = hn::MulAdd(common, hn::BroadcastLane<1>(split), sum1); sum2 = hn::MulAdd(common, hn::BroadcastLane<2>(split), sum2); sum3 = hn::MulAdd(common, hn::BroadcastLane<3>(split), sum3); sum4 = hn::MulAdd(common, hn::BroadcastLane<4>(split), sum4); sum5 = hn::MulAdd(common, hn::BroadcastLane<5>(split), sum5); sum6 = hn::MulAdd(common, hn::BroadcastLane<6>(split), sum6); sum7 = hn::MulAdd(common, hn::BroadcastLane<7>(split), sum7); sum8 = hn::MulAdd(common, hn::BroadcastLane<8>(split), sum8); sum9 = hn::MulAdd(common, hn::BroadcastLane<9>(split), sum9); sum10 = hn::MulAdd(common, hn::BroadcastLane<10>(split), sum10); sum11 = hn::MulAdd(common, hn::BroadcastLane<11>(split), sum11); sum12 = hn::MulAdd(common, hn::BroadcastLane<12>(split), sum12); sum13 = hn::MulAdd(common, hn::BroadcastLane<13>(split), sum13); sum14 = hn::MulAdd(common, hn::BroadcastLane<14>(split), sum14); sum15 = hn::MulAdd(common, hn::BroadcastLane<15>(split), sum15); } template , HWY_IF_V_SIZE_LE_D(DF, 63)> HWY_INLINE HWY_MAYBE_UNUSED void MulAdd16( DF df, const VF 
common, const VF split, VF& sum0, VF& sum1, VF& sum2, VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7, VF& sum8, VF& sum9, VF& sum10, VF& sum11, VF& sum12, VF& sum13, VF& sum14, VF& sum15) {} template , HWY_IF_V_SIZE_GT_D(DF, 31)> HWY_INLINE HWY_MAYBE_UNUSED void MulAdd8(DF df, const VF common, const VF split, VF& sum0, VF& sum1, VF& sum2, VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7) { sum0 = hn::MulAdd(common, hn::BroadcastLane<0>(split), sum0); sum1 = hn::MulAdd(common, hn::BroadcastLane<1>(split), sum1); sum2 = hn::MulAdd(common, hn::BroadcastLane<2>(split), sum2); sum3 = hn::MulAdd(common, hn::BroadcastLane<3>(split), sum3); sum4 = hn::MulAdd(common, hn::BroadcastLane<4>(split), sum4); sum5 = hn::MulAdd(common, hn::BroadcastLane<5>(split), sum5); sum6 = hn::MulAdd(common, hn::BroadcastLane<6>(split), sum6); sum7 = hn::MulAdd(common, hn::BroadcastLane<7>(split), sum7); } template , HWY_IF_V_SIZE_LE_D(DF, 31)> HWY_INLINE HWY_MAYBE_UNUSED void MulAdd8(DF df, const VF common, const VF split, VF& sum0, VF& sum1, VF& sum2, VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7) {} template > HWY_INLINE HWY_MAYBE_UNUSED void MulAdd4(DF df, const VF common, const VF split, VF& sum0, VF& sum1, VF& sum2, VF& sum3) { sum0 = hn::MulAdd(common, hn::BroadcastLane<0>(split), sum0); sum1 = hn::MulAdd(common, hn::BroadcastLane<1>(split), sum1); sum2 = hn::MulAdd(common, hn::BroadcastLane<2>(split), sum2); sum3 = hn::MulAdd(common, hn::BroadcastLane<3>(split), sum3); } // For an 8xNF tile of float values in 8xNF-lane registers, multiplies 8 rows // of V by the corresponding values in c0-c7 and adds them to NF rows of out, // after first prescaling out by scale. // The depth (size) must be a multiple of NF. 
template > HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddTile( DF df, const VF scale, const VF c0, const VF c1, const VF c2, const VF c3, const VF c4, const VF c5, const VF c6, const VF c7, const MatPtrT& v, const size_t* HWY_RESTRICT pos, float* HWY_RESTRICT out, const uint32_t* HWY_RESTRICT out_offsets, const size_t size) { namespace hn = hwy::HWY_NAMESPACE; HWY_LANES_CONSTEXPR size_t NF = hn::Lanes(df); size_t i = 0; while (i + NF <= size) { if HWY_LANES_CONSTEXPR (NF == 16) { VF out0, out1, out2, out3, out4, out5, out6, out7; VF out8, out9, out10, out11, out12, out13, out14, out15; out0 = hn::Load(df, out + i + out_offsets[0]); out1 = hn::Load(df, out + i + out_offsets[1]); out2 = hn::Load(df, out + i + out_offsets[2]); out3 = hn::Load(df, out + i + out_offsets[3]); out4 = hn::Load(df, out + i + out_offsets[4]); out5 = hn::Load(df, out + i + out_offsets[5]); out6 = hn::Load(df, out + i + out_offsets[6]); out7 = hn::Load(df, out + i + out_offsets[7]); out8 = hn::Load(df, out + i + out_offsets[8]); out9 = hn::Load(df, out + i + out_offsets[9]); out10 = hn::Load(df, out + i + out_offsets[10]); out11 = hn::Load(df, out + i + out_offsets[11]); out12 = hn::Load(df, out + i + out_offsets[12]); out13 = hn::Load(df, out + i + out_offsets[13]); out14 = hn::Load(df, out + i + out_offsets[14]); out15 = hn::Load(df, out + i + out_offsets[15]); Mul16(df, scale, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15); VF x0 = hn::Load(df, v.Row(pos[0]) + i); MulAdd16(df, x0, c0, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15); VF x1 = hn::Load(df, v.Row(pos[1]) + i); MulAdd16(df, x1, c1, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15); VF x2 = hn::Load(df, v.Row(pos[2]) + i); MulAdd16(df, x2, c2, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15); VF x3 = hn::Load(df, 
v.Row(pos[3]) + i); MulAdd16(df, x3, c3, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15); VF x4 = hn::Load(df, v.Row(pos[4]) + i); MulAdd16(df, x4, c4, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15); VF x5 = hn::Load(df, v.Row(pos[5]) + i); MulAdd16(df, x5, c5, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15); VF x6 = hn::Load(df, v.Row(pos[6]) + i); MulAdd16(df, x6, c6, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15); VF x7 = hn::Load(df, v.Row(pos[7]) + i); MulAdd16(df, x7, c7, out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15); hn::Store(out0, df, out + i + out_offsets[0]); hn::Store(out1, df, out + i + out_offsets[1]); hn::Store(out2, df, out + i + out_offsets[2]); hn::Store(out3, df, out + i + out_offsets[3]); hn::Store(out4, df, out + i + out_offsets[4]); hn::Store(out5, df, out + i + out_offsets[5]); hn::Store(out6, df, out + i + out_offsets[6]); hn::Store(out7, df, out + i + out_offsets[7]); hn::Store(out8, df, out + i + out_offsets[8]); hn::Store(out9, df, out + i + out_offsets[9]); hn::Store(out10, df, out + i + out_offsets[10]); hn::Store(out11, df, out + i + out_offsets[11]); hn::Store(out12, df, out + i + out_offsets[12]); hn::Store(out13, df, out + i + out_offsets[13]); hn::Store(out14, df, out + i + out_offsets[14]); hn::Store(out15, df, out + i + out_offsets[15]); } if HWY_LANES_CONSTEXPR (NF == 8) { VF out0, out1, out2, out3, out4, out5, out6, out7; out0 = hn::Load(df, out + i + out_offsets[0]); out1 = hn::Load(df, out + i + out_offsets[1]); out2 = hn::Load(df, out + i + out_offsets[2]); out3 = hn::Load(df, out + i + out_offsets[3]); out4 = hn::Load(df, out + i + out_offsets[4]); out5 = hn::Load(df, out + i + out_offsets[5]); out6 = hn::Load(df, out + i + out_offsets[6]); out7 = 
hn::Load(df, out + i + out_offsets[7]);
      // Prescale all eight accumulator rows by their per-row scale lanes,
      // then accumulate one row of V per coefficient c0..c7.
      Mul8(df, scale, out0, out1, out2, out3, out4, out5, out6, out7);
      VF x0 = hn::Load(df, v.Row(pos[0]) + i);
      MulAdd8(df, x0, c0, out0, out1, out2, out3, out4, out5, out6, out7);
      VF x1 = hn::Load(df, v.Row(pos[1]) + i);
      MulAdd8(df, x1, c1, out0, out1, out2, out3, out4, out5, out6, out7);
      VF x2 = hn::Load(df, v.Row(pos[2]) + i);
      MulAdd8(df, x2, c2, out0, out1, out2, out3, out4, out5, out6, out7);
      VF x3 = hn::Load(df, v.Row(pos[3]) + i);
      MulAdd8(df, x3, c3, out0, out1, out2, out3, out4, out5, out6, out7);
      VF x4 = hn::Load(df, v.Row(pos[4]) + i);
      MulAdd8(df, x4, c4, out0, out1, out2, out3, out4, out5, out6, out7);
      VF x5 = hn::Load(df, v.Row(pos[5]) + i);
      MulAdd8(df, x5, c5, out0, out1, out2, out3, out4, out5, out6, out7);
      VF x6 = hn::Load(df, v.Row(pos[6]) + i);
      MulAdd8(df, x6, c6, out0, out1, out2, out3, out4, out5, out6, out7);
      VF x7 = hn::Load(df, v.Row(pos[7]) + i);
      MulAdd8(df, x7, c7, out0, out1, out2, out3, out4, out5, out6, out7);
      // Write the updated rows back to their per-row offsets.
      hn::Store(out0, df, out + i + out_offsets[0]);
      hn::Store(out1, df, out + i + out_offsets[1]);
      hn::Store(out2, df, out + i + out_offsets[2]);
      hn::Store(out3, df, out + i + out_offsets[3]);
      hn::Store(out4, df, out + i + out_offsets[4]);
      hn::Store(out5, df, out + i + out_offsets[5]);
      hn::Store(out6, df, out + i + out_offsets[6]);
      hn::Store(out7, df, out + i + out_offsets[7]);
    }
    // 4-lane vectors: one scale lane per accumulator row via BroadcastLane.
    if HWY_LANES_CONSTEXPR (NF == 4) {
      VF out0, out1, out2, out3;
      out0 = hn::Load(df, out + i + out_offsets[0]);
      out1 = hn::Load(df, out + i + out_offsets[1]);
      out2 = hn::Load(df, out + i + out_offsets[2]);
      out3 = hn::Load(df, out + i + out_offsets[3]);
      out0 = hn::Mul(out0, hn::BroadcastLane<0>(scale));
      out1 = hn::Mul(out1, hn::BroadcastLane<1>(scale));
      out2 = hn::Mul(out2, hn::BroadcastLane<2>(scale));
      out3 = hn::Mul(out3, hn::BroadcastLane<3>(scale));
      VF x0 = hn::Load(df, v.Row(pos[0]) + i);
      MulAdd4(df, x0, c0, out0, out1, out2, out3);
      VF x1 = hn::Load(df, v.Row(pos[1]) + i);
      MulAdd4(df, x1, c1, out0, out1, out2, out3);
      VF x2 = hn::Load(df, v.Row(pos[2]) + i);
      MulAdd4(df, x2, c2, out0, out1, out2, out3);
      VF x3 = hn::Load(df, v.Row(pos[3]) + i);
      MulAdd4(df, x3, c3, out0, out1, out2, out3);
      VF x4 = hn::Load(df, v.Row(pos[4]) + i);
      MulAdd4(df, x4, c4, out0, out1, out2, out3);
      VF x5 = hn::Load(df, v.Row(pos[5]) + i);
      MulAdd4(df, x5, c5, out0, out1, out2, out3);
      VF x6 = hn::Load(df, v.Row(pos[6]) + i);
      MulAdd4(df, x6, c6, out0, out1, out2, out3);
      VF x7 = hn::Load(df, v.Row(pos[7]) + i);
      MulAdd4(df, x7, c7, out0, out1, out2, out3);
      hn::Store(out0, df, out + i + out_offsets[0]);
      hn::Store(out1, df, out + i + out_offsets[1]);
      hn::Store(out2, df, out + i + out_offsets[2]);
      hn::Store(out3, df, out + i + out_offsets[3]);
    }
    i += NF;
  }
  // The depth must be an exact multiple of the vector length.
  HWY_DASSERT(size == i);
}

// FMA of `common` with each of c0..c3 into the four running sums.
// NOTE(review): the template parameter lists throughout this snapshot read
// `template >` etc. -- angle-bracketed arguments appear to have been
// stripped during extraction (likely `<class DF, class VF = hn::Vec<DF>>`);
// confirm against the upstream file before compiling.
template >
HWY_INLINE HWY_MAYBE_UNUSED void MulAdd4(DF df, const VF common, const VF c0,
                                         const VF c1, const VF c2, const VF c3,
                                         VF& sum0, VF& sum1, VF& sum2,
                                         VF& sum3) {
  sum0 = hn::MulAdd(common, c0, sum0);
  sum1 = hn::MulAdd(common, c1, sum1);
  sum2 = hn::MulAdd(common, c2, sum2);
  sum3 = hn::MulAdd(common, c3, sum3);
}

// Accumulates rows v[pos[0..3]], each weighted by lanes 0..3 of c0..c3,
// into sum0..sum3 at column `offset`.
template >
HWY_INLINE HWY_MAYBE_UNUSED void MulAdd4Lanes(DF df, const MatPtrT& v,
                                              const size_t* HWY_RESTRICT pos,
                                              const size_t offset, const VF c0,
                                              const VF c1, const VF c2,
                                              const VF c3, VF& sum0, VF& sum1,
                                              VF& sum2, VF& sum3) {
  // TODO(rays): Check whether a transpose of c0-c3 is applicable and faster.
  VF x0 = hn::Load(df, v.Row(pos[0]) + offset);
  MulAdd4(df, x0, hn::BroadcastLane<0>(c0), hn::BroadcastLane<0>(c1),
          hn::BroadcastLane<0>(c2), hn::BroadcastLane<0>(c3), sum0, sum1,
          sum2, sum3);
  VF x1 = hn::Load(df, v.Row(pos[1]) + offset);
  MulAdd4(df, x1, hn::BroadcastLane<1>(c0), hn::BroadcastLane<1>(c1),
          hn::BroadcastLane<1>(c2), hn::BroadcastLane<1>(c3), sum0, sum1,
          sum2, sum3);
  VF x2 = hn::Load(df, v.Row(pos[2]) + offset);
  MulAdd4(df, x2, hn::BroadcastLane<2>(c0), hn::BroadcastLane<2>(c1),
          hn::BroadcastLane<2>(c2), hn::BroadcastLane<2>(c3), sum0, sum1,
          sum2, sum3);
  VF x3 = hn::Load(df, v.Row(pos[3]) + offset);
  MulAdd4(df, x3, hn::BroadcastLane<3>(c0), hn::BroadcastLane<3>(c1),
          hn::BroadcastLane<3>(c2), hn::BroadcastLane<3>(c3), sum0, sum1,
          sum2, sum3);
}

// Same but for lanes 4..7; enabled only for vectors wider than 16 bytes
// (HWY_IF_V_SIZE_GT_D(DF, 31) selects >= 32-byte vectors, i.e. NF >= 8).
template , HWY_IF_V_SIZE_GT_D(DF, 31)>
HWY_INLINE HWY_MAYBE_UNUSED void MulAddSecond4Lanes(
    DF df, const MatPtrT& v, const size_t* HWY_RESTRICT pos,
    const size_t offset, const VF c0, const VF c1, const VF c2, const VF c3,
    VF& sum0, VF& sum1, VF& sum2, VF& sum3) {
  VF x4 = hn::Load(df, v.Row(pos[4]) + offset);
  MulAdd4(df, x4, hn::BroadcastLane<4>(c0), hn::BroadcastLane<4>(c1),
          hn::BroadcastLane<4>(c2), hn::BroadcastLane<4>(c3), sum0, sum1,
          sum2, sum3);
  VF x5 = hn::Load(df, v.Row(pos[5]) + offset);
  MulAdd4(df, x5, hn::BroadcastLane<5>(c0), hn::BroadcastLane<5>(c1),
          hn::BroadcastLane<5>(c2), hn::BroadcastLane<5>(c3), sum0, sum1,
          sum2, sum3);
  VF x6 = hn::Load(df, v.Row(pos[6]) + offset);
  MulAdd4(df, x6, hn::BroadcastLane<6>(c0), hn::BroadcastLane<6>(c1),
          hn::BroadcastLane<6>(c2), hn::BroadcastLane<6>(c3), sum0, sum1,
          sum2, sum3);
  VF x7 = hn::Load(df, v.Row(pos[7]) + offset);
  MulAdd4(df, x7, hn::BroadcastLane<7>(c0), hn::BroadcastLane<7>(c1),
          hn::BroadcastLane<7>(c2), hn::BroadcastLane<7>(c3), sum0, sum1,
          sum2, sum3);
}

// No-op overload for vectors of at most 16 bytes (< 8 lanes): lanes 4..7 do
// not exist, so BroadcastLane<4..7> must not be instantiated.
template , HWY_IF_V_SIZE_LE_D(DF, 31)>
HWY_INLINE HWY_MAYBE_UNUSED void MulAddSecond4Lanes(
    DF df, const MatPtrT& v, const size_t* HWY_RESTRICT pos,
    const size_t offset, const VF c0, const VF c1, const VF c2, const VF c3,
    VF& sum0, VF& sum1, VF& sum2, VF& sum3) {}

// Lanes 8..15; enabled only for vectors wider than 32 bytes (NF >= 16).
template , HWY_IF_V_SIZE_GT_D(DF, 63)>
HWY_INLINE HWY_MAYBE_UNUSED void MulAddSecond8Lanes(
    DF df, const MatPtrT& v, const size_t* HWY_RESTRICT pos,
    const size_t offset, const VF c0, const VF c1, const VF c2, const VF c3,
    VF& sum0, VF& sum1, VF& sum2, VF& sum3) {
  VF x8 = hn::Load(df, v.Row(pos[8]) + offset);
  MulAdd4(df, x8, hn::BroadcastLane<8>(c0), hn::BroadcastLane<8>(c1),
          hn::BroadcastLane<8>(c2), hn::BroadcastLane<8>(c3), sum0, sum1,
          sum2, sum3);
  VF x9 = hn::Load(df, v.Row(pos[9]) + offset);
  MulAdd4(df, x9, hn::BroadcastLane<9>(c0), hn::BroadcastLane<9>(c1),
          hn::BroadcastLane<9>(c2), hn::BroadcastLane<9>(c3), sum0, sum1,
          sum2, sum3);
  VF x10 = hn::Load(df, v.Row(pos[10]) + offset);
  MulAdd4(df, x10, hn::BroadcastLane<10>(c0), hn::BroadcastLane<10>(c1),
          hn::BroadcastLane<10>(c2), hn::BroadcastLane<10>(c3), sum0, sum1,
          sum2, sum3);
  VF x11 = hn::Load(df, v.Row(pos[11]) + offset);
  MulAdd4(df, x11, hn::BroadcastLane<11>(c0), hn::BroadcastLane<11>(c1),
          hn::BroadcastLane<11>(c2), hn::BroadcastLane<11>(c3), sum0, sum1,
          sum2, sum3);
  VF x12 = hn::Load(df, v.Row(pos[12]) + offset);
  MulAdd4(df, x12, hn::BroadcastLane<12>(c0), hn::BroadcastLane<12>(c1),
          hn::BroadcastLane<12>(c2), hn::BroadcastLane<12>(c3), sum0, sum1,
          sum2, sum3);
  VF x13 = hn::Load(df, v.Row(pos[13]) + offset);
  MulAdd4(df, x13, hn::BroadcastLane<13>(c0), hn::BroadcastLane<13>(c1),
          hn::BroadcastLane<13>(c2), hn::BroadcastLane<13>(c3), sum0, sum1,
          sum2, sum3);
  VF x14 = hn::Load(df, v.Row(pos[14]) + offset);
  MulAdd4(df, x14, hn::BroadcastLane<14>(c0), hn::BroadcastLane<14>(c1),
          hn::BroadcastLane<14>(c2), hn::BroadcastLane<14>(c3), sum0, sum1,
          sum2, sum3);
  VF x15 = hn::Load(df, v.Row(pos[15]) + offset);
  MulAdd4(df, x15, hn::BroadcastLane<15>(c0), hn::BroadcastLane<15>(c1),
          hn::BroadcastLane<15>(c2), hn::BroadcastLane<15>(c3), sum0, sum1,
          sum2, sum3);
}

// No-op overload for vectors of at most 32 bytes (< 16 lanes).
template , HWY_IF_V_SIZE_LE_D(DF, 63)>
HWY_INLINE HWY_MAYBE_UNUSED void MulAddSecond8Lanes(
    DF df, const MatPtrT& v, const size_t* HWY_RESTRICT pos,
    const size_t offset, const VF c0, const VF c1, const VF c2, const VF c3,
    VF& sum0, VF& sum1, VF& sum2, VF& sum3) {}

// For an NFx4 tile of float values in 4xNF-lane registers, multiplies NF rows
// of V by the corresponding values in c0-c3 and adds them to NF rows of out,
// after first prescaling out by scale.
// The depth (size) must be a multiple of NF.
template >
HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddTile4(
    DF df, const float* HWY_RESTRICT scales, const VF c0, const VF c1,
    const VF c2, const VF c3, const MatPtrT& v,
    const size_t* HWY_RESTRICT pos, float* HWY_RESTRICT out,
    const uint32_t* HWY_RESTRICT out_offsets, const size_t size) {
  namespace hn = hwy::HWY_NAMESPACE;
  HWY_LANES_CONSTEXPR size_t NF = hn::Lanes(df);
  size_t i = 0;
  while (i + NF <= size) {
    VF out0, out1, out2, out3;
    out0 = hn::Load(df, out + i + out_offsets[0]);
    out1 = hn::Load(df, out + i + out_offsets[1]);
    out2 = hn::Load(df, out + i + out_offsets[2]);
    out3 = hn::Load(df, out + i + out_offsets[3]);
    // Prescale each output row by its scalar scale.
    out0 = hn::Mul(out0, hn::Set(df, scales[0]));
    out1 = hn::Mul(out1, hn::Set(df, scales[1]));
    out2 = hn::Mul(out2, hn::Set(df, scales[2]));
    out3 = hn::Mul(out3, hn::Set(df, scales[3]));
    // First 4 lanes always exist; wider vectors add lanes 4..7 and 8..15 via
    // the SFINAE-selected overloads above (no-ops on narrow vectors).
    MulAdd4Lanes(df, v, pos, i, c0, c1, c2, c3, out0, out1, out2, out3);
    if HWY_LANES_CONSTEXPR (NF >= 8) {
      MulAddSecond4Lanes(df, v, pos, i, c0, c1, c2, c3, out0, out1, out2,
                         out3);
      if HWY_LANES_CONSTEXPR (NF >= 16) {
        MulAddSecond8Lanes(df, v, pos, i, c0, c1, c2, c3, out0, out1, out2,
                           out3);
      }
    }
    hn::Store(out0, df, out + i + out_offsets[0]);
    hn::Store(out1, df, out + i + out_offsets[1]);
    hn::Store(out2, df, out + i + out_offsets[2]);
    hn::Store(out3, df, out + i + out_offsets[3]);
    i += NF;
  }
  HWY_DASSERT(size == i);
}

// Prescales NF rows of out by scale, then multiplies 1 row of V by the
// corresponding values in c0 and adds them to the NF rows of out.
// The depth (size) must be a multiple of NF.
// NOTE(review): template argument lists appear stripped in this snapshot
// (`template >`); confirm against upstream before compiling.
template >
HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddVector(
    DF df, const VF scale, const VF c0, const MatPtrT& v, const size_t pos,
    float* HWY_RESTRICT out, const uint32_t* HWY_RESTRICT out_offsets,
    const size_t size) {
  namespace hn = hwy::HWY_NAMESPACE;
  HWY_LANES_CONSTEXPR size_t NF = hn::Lanes(df);
  size_t i = 0;
  while (i + NF <= size) {
    // 16-lane vectors: 16 output rows, one per lane of `scale`/`c0`.
    if HWY_LANES_CONSTEXPR (NF == 16) {
      VF out0, out1, out2, out3, out4, out5, out6, out7;
      VF out8, out9, out10, out11, out12, out13, out14, out15;
      out0 = hn::Load(df, out + i + out_offsets[0]);
      out1 = hn::Load(df, out + i + out_offsets[1]);
      out2 = hn::Load(df, out + i + out_offsets[2]);
      out3 = hn::Load(df, out + i + out_offsets[3]);
      out4 = hn::Load(df, out + i + out_offsets[4]);
      out5 = hn::Load(df, out + i + out_offsets[5]);
      out6 = hn::Load(df, out + i + out_offsets[6]);
      out7 = hn::Load(df, out + i + out_offsets[7]);
      out8 = hn::Load(df, out + i + out_offsets[8]);
      out9 = hn::Load(df, out + i + out_offsets[9]);
      out10 = hn::Load(df, out + i + out_offsets[10]);
      out11 = hn::Load(df, out + i + out_offsets[11]);
      out12 = hn::Load(df, out + i + out_offsets[12]);
      out13 = hn::Load(df, out + i + out_offsets[13]);
      out14 = hn::Load(df, out + i + out_offsets[14]);
      out15 = hn::Load(df, out + i + out_offsets[15]);
      // Prescale all 16 rows, then accumulate the single V row weighted by
      // the per-row lanes of c0.
      Mul16(df, scale, out0, out1, out2, out3, out4, out5, out6, out7, out8,
            out9, out10, out11, out12, out13, out14, out15);
      VF x0 = hn::Load(df, v.Row(pos) + i);
      MulAdd16(df, x0, c0, out0, out1, out2, out3, out4, out5, out6, out7,
               out8, out9, out10, out11, out12, out13, out14, out15);
      hn::Store(out0, df, out + i + out_offsets[0]);
      hn::Store(out1, df, out + i + out_offsets[1]);
      hn::Store(out2, df, out + i + out_offsets[2]);
      hn::Store(out3, df, out + i + out_offsets[3]);
      hn::Store(out4, df, out + i + out_offsets[4]);
      hn::Store(out5, df, out + i + out_offsets[5]);
      hn::Store(out6, df, out + i + out_offsets[6]);
      hn::Store(out7, df, out + i + out_offsets[7]);
      hn::Store(out8, df, out + i + out_offsets[8]);
      hn::Store(out9, df, out + i + out_offsets[9]);
      hn::Store(out10, df, out + i + out_offsets[10]);
      hn::Store(out11, df, out + i + out_offsets[11]);
      hn::Store(out12, df, out + i + out_offsets[12]);
      hn::Store(out13, df, out + i + out_offsets[13]);
      hn::Store(out14, df, out + i + out_offsets[14]);
      hn::Store(out15, df, out + i + out_offsets[15]);
    }
    // 8-lane vectors: 8 output rows.
    if HWY_LANES_CONSTEXPR (NF == 8) {
      VF out0, out1, out2, out3, out4, out5, out6, out7;
      out0 = hn::Load(df, out + i + out_offsets[0]);
      out1 = hn::Load(df, out + i + out_offsets[1]);
      out2 = hn::Load(df, out + i + out_offsets[2]);
      out3 = hn::Load(df, out + i + out_offsets[3]);
      out4 = hn::Load(df, out + i + out_offsets[4]);
      out5 = hn::Load(df, out + i + out_offsets[5]);
      out6 = hn::Load(df, out + i + out_offsets[6]);
      out7 = hn::Load(df, out + i + out_offsets[7]);
      Mul8(df, scale, out0, out1, out2, out3, out4, out5, out6, out7);
      VF x0 = hn::Load(df, v.Row(pos) + i);
      MulAdd8(df, x0, c0, out0, out1, out2, out3, out4, out5, out6, out7);
      hn::Store(out0, df, out + i + out_offsets[0]);
      hn::Store(out1, df, out + i + out_offsets[1]);
      hn::Store(out2, df, out + i + out_offsets[2]);
      hn::Store(out3, df, out + i + out_offsets[3]);
      hn::Store(out4, df, out + i + out_offsets[4]);
      hn::Store(out5, df, out + i + out_offsets[5]);
      hn::Store(out6, df, out + i + out_offsets[6]);
      hn::Store(out7, df, out + i + out_offsets[7]);
    }
    // 4-lane vectors: 4 output rows; per-row scales come from scale lanes.
    if HWY_LANES_CONSTEXPR (NF == 4) {
      VF out0, out1, out2, out3;
      out0 = hn::Load(df, out + i + out_offsets[0]);
      out1 = hn::Load(df, out + i + out_offsets[1]);
      out2 = hn::Load(df, out + i + out_offsets[2]);
      out3 = hn::Load(df, out + i + out_offsets[3]);
      out0 = hn::Mul(out0, hn::BroadcastLane<0>(scale));
      out1 = hn::Mul(out1, hn::BroadcastLane<1>(scale));
      out2 = hn::Mul(out2, hn::BroadcastLane<2>(scale));
      out3 = hn::Mul(out3, hn::BroadcastLane<3>(scale));
      VF x0 = hn::Load(df, v.Row(pos) + i);
      MulAdd4(df, x0, c0, out0, out1, out2, out3);
      hn::Store(out0, df, out + i + out_offsets[0]);
      hn::Store(out1, df, out + i + out_offsets[1]);
      hn::Store(out2, df, out + i + out_offsets[2]);
      hn::Store(out3, df, out + i + out_offsets[3]);
    }
    i += NF;
  }
  HWY_DASSERT(size == i);
}

// See below for a specialized version for top-1 sampling.
// TODO: support bf16 logits using Decompress2.
// Computes softmax probabilities for the given logits, normalizing in-place.
// The calculation is numerically stable, using the max-subtraction trick to
// compute exp(logits[i] - max(logits)) before normalizing by the sum.
// If temperature is provided and not 1.0, each intermediate exp() result is
// divided by temperature before normalization; however, this division by
// temperature cancels out during the final normalization step, meaning
// temperature currently has no effect on the output probabilities.
// @param logits In-out: on input, contains logits; on output, overwritten with
// probabilities.
// @param ctx Input: threading context for parallelism and profiling.
// @param worker Input: worker thread index.
// @param temperature Input: softmax temperature.
// @param sm_options Input: if sm_options.max_out is not null, it receives the
// max logit value and sm_options.d_out (which must then also be non-null)
// receives the sum of exp(logit - max).
static HWY_NOINLINE void Softmax(Logits logits, ThreadingContext& ctx,
                                 const size_t worker, float temperature = 1.0f,
                                 const SMOptions& sm_options = {}) {
  GCPP_ZONE(ctx, worker, Zones::kOpsSoftmax);
  HWY_DASSERT(logits.size() != 0);
  namespace hn = hwy::HWY_NAMESPACE;
  using D = hn::ScalableTag;
  using V = hn::Vec;
  const D d;
  const V vmin = hn::Set(d, hwy::LowestValue());
  V vmax = vmin;
  V* pmax = &vmax;  // workaround for SVE: cannot capture &vector directly
  // Pass 1: per-lane running max, then horizontal reduction.
  hn::Foreach(d, logits.data(), logits.size(), vmin,
              [pmax](const auto d, const V value)
                  HWY_ATTR { *pmax = hn::Max(*pmax, value); });
  vmax = hn::MaxOfLanes(d, vmax);
  // Subtract max (avoid precision loss for large exponents) and exponentiate.
  hn::Transform(d, logits.data(), logits.size(),
                [pmax](const auto d, const V value) HWY_ATTR {
                  if constexpr (HWY_TARGET & HWY_ALL_SVE) {
                    // Workaround for buggy SVE codegen: avoid inlined Exp().
                    return hn::CallExp(d, hn::Sub(value, *pmax));
                  } else {
                    return hn::Exp(d, hn::Sub(value, *pmax));
                  }
                });
  if (temperature != 1.0f) {
    const float temperature_inv = 1.0f / temperature;
    hn::Transform(d, logits.data(), logits.size(),
                  [temperature_inv](const auto d, const V value) HWY_ATTR {
                    return hn::Mul(value, hn::Set(d, temperature_inv));
                  });
  }
  // Normalize to probability distribution. The exact sum seems like it should
  // not make a huge difference. It halves the standard deviation of the sum of
  // the normalized probabilities from 1E-7 to 5E-8, but actually also changes
  // the generated text after a few hundred tokens.
  const float sum_exp = Sum(d, logits.data(), logits.size());
  // Double-precision reciprocal does not appear to affect the results.
  const float mul = 1.0f / sum_exp;
  MulByConst(mul, logits.data(), logits.size());
  if (sm_options.max_out) {
    *sm_options.max_out = hn::GetLane(vmax);
    *sm_options.d_out = sum_exp;
  }
}

// Note: https://arxiv.org/pdf/2001.04438 proposes to replace the three max /
// exp / mul passes with two passes, both of which compute Exp. This is
// reportedly only faster for very large arrays, larger even than our 256K
// vocab size. We instead fuse the subsequent sampling pass into the softmax,
// which already knows the max value which top-1 sampling would again seek.

// Returns the argmax and x[argmax].
// Single-pass SIMD argmax over `logits` using two parallel accumulators to
// hide latency. Ties are resolved toward the lowest index within the final
// combined vector. Requires logits.size() to be a multiple of 2 * Lanes.
// NOTE(review): several template/type argument lists appear stripped in this
// snapshot (e.g. `hn::ScalableTag;`, `static_cast(i)`); confirm upstream.
static HWY_INLINE TokenAndProb ArgmaxAndMax(Logits logits) {
  namespace hn = hwy::HWY_NAMESPACE;
  using D = hn::ScalableTag;
  using V = hn::Vec;
  using M = hn::Mask;
  const D d;
  const hn::RebindToSigned di;
  using TI = hn::TFromD;
  using VI = hn::Vec;
  const size_t N = hn::Lanes(d);
  HWY_ASSERT(logits.size() % (2 * N) == 0);
  // Two independent max/argmax accumulators, merged after the loop.
  V max0 = hn::Set(d, hwy::LowestValue());
  V max1 = max0;
  VI argmax0 = hn::Zero(di);
  VI argmax1 = argmax0;
  for (size_t i = 0; i < logits.size(); i += 2 * N) {
    const V v0 = hn::LoadU(d, &logits[i]);
    const V v1 = hn::LoadU(d, &logits[i + N]);
    // Lane indices corresponding to the loaded elements.
    const VI vi0 = hn::Iota(di, static_cast(i));
    const VI vi1 = hn::Iota(di, static_cast(i + N));
    const M gt0 = hn::Gt(v0, max0);
    const M gt1 = hn::Gt(v1, max1);
    max0 = hn::IfThenElse(gt0, v0, max0);
    max1 = hn::IfThenElse(gt1, v1, max1);
    argmax0 = hn::IfThenElse(hn::RebindMask(di, gt0), vi0, argmax0);
    argmax1 = hn::IfThenElse(hn::RebindMask(di, gt1), vi1, argmax1);
  }
  // Combine the two vectors
  const M gt0 = hn::Gt(max0, max1);
  max0 = hn::IfThenElse(gt0, max0, max1);
  argmax0 = hn::IfThenElse(hn::RebindMask(di, gt0), argmax0, argmax1);
  // Reduce to the global max
  const V max = hn::MaxOfLanes(d, max0);  // broadcasts
  // Argmax = lowest-indexed lane equal to the global max
  const size_t lane = hn::FindKnownFirstTrue(d, hn::Eq(max, max0));
  const TI argmax = hn::ExtractLane(argmax0, lane);
  return TokenAndProb{.token = argmax, .prob = hn::GetLane(max)};
}

// Returns argmax of softmax and its probability. This overwrites `logits`, but
// not with normalized probabilities. Only equivalent to `Softmax` +
// `sample_func` if `kTopK` == 1. This is worthwhile because `logits.size()` is
// typically `kVocabSize == 256K`, and this avoids writing and then scanning
// again for the max.
static HWY_MAYBE_UNUSED TokenAndProb Top1OfSoftmax(Logits logits) {
  namespace hn = hwy::HWY_NAMESPACE;
  const hn::ScalableTag d;
  using V = hn::Vec;
  const TokenAndProb argmax = ArgmaxAndMax(logits);

  // Subtract max (avoid precision loss for large exponents) and exponentiate.
  // argmax.prob holds the raw max logit here, not yet a probability.
  const V max = hn::Set(d, argmax.prob);
  const V* pmax = &max;  // workaround for SVE: cannot capture &vector directly
  hn::Transform(d, logits.data(), logits.size(),
                [pmax](const auto d, const V value) HWY_ATTR {
                  if constexpr (HWY_TARGET & HWY_ALL_SVE) {
                    // Temporary workaround for buggy SVE codegen: avoid
                    // inlined Exp().
                    return hn::CallExp(d, hn::Sub(value, *pmax));
                  } else {
                    return hn::Exp(d, hn::Sub(value, *pmax));
                  }
                });

  // Normalize to a single probability. The exact sum seems like it should not
  // make a huge difference. It halves the standard deviation of the sum of the
  // normalized probabilities from 1E-7 to 5E-8, but actually also changes the
  // generated text after a few hundred tokens.
  const float sum_exp = Sum(d, logits.data(), logits.size());
  const float prob = logits[argmax.token] / sum_exp;
  return TokenAndProb{.token = argmax.token, .prob = prob};
}

// Applies soft-capping in place: logits = cap * tanh(logits / cap).
// `cap` must be nonzero (callers gate on that via MaybeLogitsSoftCap).
static HWY_NOINLINE void LogitsSoftCap(const float cap, Logits logits,
                                       ThreadingContext& ctx,
                                       const size_t worker) {
  GCPP_ZONE(ctx, worker, Zones::kOpsLogitsSoftCap);
  namespace hn = hwy::HWY_NAMESPACE;
  using DF = hn::ScalableTag;
  using VF = hn::Vec;

  const VF vcap = hn::Set(DF(), cap);
  const VF vinv_cap = hn::Set(DF(), 1.0f / cap);
  // Pointers rather than captured vectors: SVE cannot capture &vector.
  const VF* HWY_RESTRICT pcap = &vcap;
  const VF* HWY_RESTRICT pinv_cap = &vinv_cap;

  DecompressAndCompressInplace(DF(), logits.data(), logits.size(),
                               [pcap, pinv_cap](DF d, VF v) HWY_ATTR -> VF {
                                 return hn::Mul(
                                     *pcap, hn::Tanh(d, hn::Mul(v, *pinv_cap)));
                               });
}

// Calls LogitsSoftCap if cap != 0.0f.
static HWY_INLINE HWY_MAYBE_UNUSED void MaybeLogitsSoftCap(
    const float cap, Logits logits, ThreadingContext& ctx,
    const size_t worker) {
  if (cap != 0.0f) {
    LogitsSoftCap(cap, logits, ctx, worker);
  }
}

// Applies soft-capping to each non-EOS row of `x` in parallel; no-op when
// cap == 0.
static HWY_INLINE HWY_MAYBE_UNUSED void MaybeLogitsSoftCapBatched(
    const float cap, MatPtrT& x, const hwy::BitSet4096<>& non_eos,
    ThreadingContext& ctx, size_t cluster_idx = 0) {
  if (cap == 0.0f) return;
  ParallelFor(Parallelism::kFlat, x.Rows(), ctx, cluster_idx,
              Callers::kOpsMaybeLogitsSoftCapBatched,
              [&](uint64_t task, size_t worker) {
                // Skip rows whose sequence already emitted EOS.
                if (non_eos.Get(task)) {
                  LogitsSoftCap(cap, x.RowSpan(task), ctx, worker);
                }
              });
}

// Returns the index of the largest logit (scalar linear scan; ties resolve to
// the lowest index).
static HWY_NOINLINE HWY_MAYBE_UNUSED size_t SampleArgmax(Logits logits) {
  size_t max_index = 0;
  float max_prob = logits[0];
  for (size_t i = 1; i < logits.size(); ++i) {
    if (logits[i] > max_prob) {
      max_index = i;
      max_prob = logits[i];
    }
  }
  return max_index;
}

// Builds a discrete distribution over `top_k` probabilities, sharpened by
// raising each to the power 1/temperature (via exp(log(p) / temperature)).
// Mutates `top_k` in place.
// NOTE(review): the return/parameter template arguments appear stripped in
// this snapshot (`std::discrete_distribution`, `std::vector&`); confirm
// upstream.
HWY_INLINE HWY_MAYBE_UNUSED std::discrete_distribution create_distribution(
    std::vector& top_k, float temperature) {
  HWY_ASSERT(temperature >= 0.0f);
  if (temperature == 0.0f) {
    // Temperature == 0 is a special case which always returns the argmax (0).
    // We also want to avoid dividing by zero in the code below.
    return std::discrete_distribution();
  }
  namespace hn = hwy::HWY_NAMESPACE;
  using D = hn::ScalableTag;

  // re-normalize distribution
  const float temperature_inv = 1.0f / temperature;
  hn::Transform(D(), top_k.data(), top_k.size(),
                [temperature_inv](D d, hn::Vec v) HWY_ATTR {
                  return hn::Exp(
                      d, hn::Mul(hn::Log(d, v), hn::Set(d, temperature_inv)));
                });

  return std::discrete_distribution(std::begin(top_k), std::end(top_k));
}

// Returns the k largest (token, logit) pairs among tokens accepted by
// `accept_token` (if set), in descending order of logit.
// NOTE(review): if `accept_token` filters out enough tokens that fewer than k
// remain, the VQSelect/VQSort calls below operate on k elements of a shorter
// buffer and the final loop reads past the filtered count -- looks like a
// potential out-of-bounds; confirm callers guarantee >= k accepted tokens.
template HWY_NOINLINE HWY_MAYBE_UNUSED std::vector TopK(
    Logits logits, size_t k, TAcceptToken& accept_token) {
  HWY_ASSERT(k != 0);
  HWY_ASSERT(k <= logits.size());
  // Pack (token, prob) into a double so VQSelect/VQSort order by prob first.
  std::vector packed_token_probs;
  for (int32_t i = 0; i < static_cast(logits.size()); ++i) {
    if (accept_token && !accept_token(i, logits[i])) {
      continue;
    }
    packed_token_probs.push_back(PackTokenAndProb(i, logits[i]));
  }

  // Partial selection of the top k, then sort just those k descending.
  hwy::VQSelect(packed_token_probs.data(), packed_token_probs.size(), k,
                hwy::SortDescending());
  hwy::VQSort(packed_token_probs.data(), k, hwy::SortDescending());

  std::vector token_probs;
  token_probs.reserve(k);
  for (int32_t i = 0; i < static_cast(k); ++i) {
    token_probs.push_back(UnpackTokenAndProb(packed_token_probs[i]));
  }
  return token_probs;
}

// Samples a token from the top-k probabilities at the given temperature.
// Assumes `logits` already hold probabilities (see FusedSoftmaxAndSampleTopK
// for the fused logits variant).
template HWY_NOINLINE HWY_MAYBE_UNUSED int SampleTopK(Logits logits, size_t k,
                                                      RngStream& gen,
                                                      float temperature,
                                                      TAcceptToken& accept_token) {
  std::vector token_probs = TopK(logits, k, accept_token);
  std::vector topk_indices(k);
  std::vector topk_probs(k);
  for (size_t i = 0; i < k; ++i) {
    topk_indices[i] = token_probs[i].token;
    topk_probs[i] = token_probs[i].prob;
  }
  return topk_indices[create_distribution(topk_probs, temperature)(gen)];
}

// Takes the top-k logits, softmaxes only those, then samples -- equivalent to
// softmax-then-top-k but avoids normalizing the full (e.g. 256K) vocabulary.
template HWY_NOINLINE HWY_MAYBE_UNUSED TokenAndProb FusedSoftmaxAndSampleTopK(
    Logits logits, size_t k, RngStream& gen, float temperature,
    TAcceptToken& accept_token, ThreadingContext& ctx, size_t worker) {
  // Softmax and sample top-K is equivalent to taking the top-K logits and
  // sampling from the softmax of the top-K logits. The latter is faster as it
  // avoids computing the softmax of all logits.
  std::vector token_logits = TopK(logits, k, accept_token);
  std::vector topk_indices(k);
  std::vector topk_logits(k);
  for (size_t i = 0; i < token_logits.size(); ++i) {
    topk_indices[i] = token_logits[i].token;
    topk_logits[i] = token_logits[i].prob;
  }

  // Only the first `mask` entries are valid when accept_token filtered tokens.
  const size_t mask = token_logits.size();
  Softmax(Logits(topk_logits.data(), mask), ctx, worker, temperature);
  auto distribution = std::discrete_distribution(
      std::begin(topk_logits), std::begin(topk_logits) + mask);
  int topk_sampled_index = distribution(gen);
  int sampled_index = topk_indices[topk_sampled_index];
  return TokenAndProb{.token = sampled_index,
                      .prob = topk_logits[topk_sampled_index]};
}

// Performs 4x4 average pooling across row vectors
// Input has 4096 (64*64) rows, output has 256 (16*16) rows
// Each output row is the average of a 4x4 block of input rows
// NOTE(review): template parameter/argument lists appear stripped in this
// snapshot (`template MatStorageT AvgPool4x4(...)`); confirm upstream.
template MatStorageT AvgPool4x4(MatStorageT& input, const Allocator& allocator) {
  const Extents2D extents = input.Extents();

  // Input validation
  HWY_DASSERT(extents.rows == 4096);  // 64 * 64 = 4096 input rows

  // Create output with 256 rows and same number of columns
  const size_t out_rows = 256;  // 16 * 16 = 256 output rows
  MatStorageT result("pool4x4", Extents2D(out_rows, extents.cols), allocator,
                     MatPadding::kOdd);

  const size_t input_dim = 64;   // Input is 64×64
  const size_t output_dim = 16;  // Output is 16×16

  for (size_t out_row_idx = 0; out_row_idx < output_dim; ++out_row_idx) {
    for (size_t out_col_idx = 0; out_col_idx < output_dim; ++out_col_idx) {
      size_t out_idx = out_row_idx * output_dim + out_col_idx;
      T* output_row = result.Row(out_idx);

      // Initialize output row to zeros
      std::fill(output_row, output_row + extents.cols, 0);

      // Average 16 row vectors from a 4x4 block
      for (size_t i = 0; i < 4; ++i) {
        for (size_t j = 0; j < 4; ++j) {
          size_t in_row_idx = out_row_idx * 4 + i;
          size_t in_col_idx = out_col_idx * 4 + j;
          size_t in_idx = in_row_idx * input_dim + in_col_idx;
          const T* input_row = input.Row(in_idx);

          // Add each input row to the output
          // TODO(philculliton): use AddFrom in `ops-inl` for a vectorized
          // loop.
          for (size_t col = 0; col < extents.cols; ++col) {
            output_row[col] += input_row[col];
          }
        }
      }

      // Divide by 16 to get the average
      for (size_t col = 0; col < extents.cols; ++col) {
        output_row[col] *= T{0.0625};
      }
    }
  }

  return result;
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace gcpp
HWY_AFTER_NAMESPACE();

#endif  // NOLINT