mirror of https://github.com/google/gemma.cpp.git
parent 1014ae9e2a
commit 14a9ecf21d
@@ -43,6 +43,7 @@
 // After highway.h
 #include "compression/compress-inl.h"
 #include "gemma/flash_attention.h"
+#include "gemma/gemma-inl.h"
 #include "ops/ops-inl.h"
 
 HWY_BEFORE_NAMESPACE();
@@ -319,25 +320,6 @@ static HWY_INLINE void ComputeQKV(size_t num_tokens, const size_t layer_idx,
   });
 }
 
-// Sums encoded (`att_out`) over num_heads (`layer_config.heads`) and
-// head_dim (`qkv_dim`) into output (`layer_out`).
-static HWY_INLINE void SumHeads(const LayerWeightsPtrs& layer,
-                                AttentionActivationsPtrs& activations,
-                                MatMulEnv& env) {
-  GCPP_ZONE(env.ctx, hwy::Profiler::GlobalIdx(), Zones::kGenAttentionSumHeads);
-  const LayerConfig& layer_config = layer.layer_config;
-  (void)layer_config;  // For HWY_DASSERT
-  // att_weights and att_out are concatenated heads, each of length
-  // layer_config.qkv_dim. Thus the [num_interleaved,
-  // layer_config.model_dim] matmul output is the sum over heads. Compare
-  // gemma/modules.py: attn_output = self.attn_vec_einsum('BTNH,NHD->BTD',
-  // encoded)
-  HWY_DASSERT(layer_config.model_dim != 0 && layer_config.heads != 0 &&
-              layer_config.qkv_dim != 0);
-  CallMatMul(activations.att_out, layer.att_weights, /*add=*/nullptr, env,
-             activations.att_sums);
-}
-
 void GemmaAttention(size_t num_tokens, const size_t layer_idx,
                     const LayerWeightsPtrs& layer,
                     AttentionActivationsPtrs& activations, QBatch& qbatch,
@@ -183,6 +183,25 @@ static inline void FFWNoVit(const LayerWeightsPtrs& layer,
   CallMatMul(activations.C1, layer.linear_w, nullptr, env, activations.ffw_out);
 }
 
+// Sums encoded (`att_out`) over num_heads (`layer_config.heads`) and
+// head_dim (`qkv_dim`) into output (`layer_out`).
+static HWY_INLINE void SumHeads(const LayerWeightsPtrs& layer,
+                                AttentionActivationsPtrs& activations,
+                                MatMulEnv& env) {
+  GCPP_ZONE(env.ctx, hwy::Profiler::GlobalIdx(), Zones::kGenAttentionSumHeads);
+  const LayerConfig& layer_config = layer.layer_config;
+  (void)layer_config;  // For HWY_DASSERT
+  // att_weights and att_out are concatenated heads, each of length
+  // layer_config.qkv_dim. Thus the [num_interleaved,
+  // layer_config.model_dim] matmul output is the sum over heads. Compare
+  // gemma/modules.py: attn_output = self.attn_vec_einsum('BTNH,NHD->BTD',
+  // encoded)
+  HWY_DASSERT(layer_config.model_dim != 0 && layer_config.heads != 0 &&
+              layer_config.qkv_dim != 0);
+  CallMatMul(activations.att_out, layer.att_weights, /*add=*/nullptr, env,
+             activations.att_sums);
+}
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace gcpp
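
The comment carried along with `SumHeads` is the heart of the moved code: `att_out` stores all heads contiguously (each of length `layer_config.qkv_dim`) and `layer.att_weights` is laid out to match, so a single matmul over the concatenated `heads * qkv_dim` axis already performs the sum over heads that gemma/modules.py writes as `attn_vec_einsum('BTNH,NHD->BTD', encoded)`. Below is a standalone sketch of that equivalence for one token, using illustrative sizes and plain loops rather than the library's `CallMatMul`:

// Standalone sketch: summing per-head projections equals one matmul over the
// concatenated heads axis. Sizes and names here are illustrative only.
#include <array>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  constexpr size_t kHeads = 2, kQkvDim = 3, kModelDim = 4;

  // encoded[n*kQkvDim + h]: output of head n (one token), like `att_out`.
  // w[(n*kQkvDim + h)*kModelDim + d]: per-head projection, like `att_weights`.
  std::vector<float> encoded(kHeads * kQkvDim), w(kHeads * kQkvDim * kModelDim);
  for (size_t i = 0; i < encoded.size(); ++i) encoded[i] = 0.1f * (i + 1);
  for (size_t i = 0; i < w.size(); ++i) w[i] = 0.01f * (i + 1);

  // Reference: 'BTNH,NHD->BTD' written out as an explicit sum over heads.
  std::array<float, kModelDim> einsum{};
  for (size_t n = 0; n < kHeads; ++n) {
    for (size_t h = 0; h < kQkvDim; ++h) {
      for (size_t d = 0; d < kModelDim; ++d) {
        einsum[d] +=
            encoded[n * kQkvDim + h] * w[(n * kQkvDim + h) * kModelDim + d];
      }
    }
  }

  // Concatenated form: one [heads*qkv_dim] x [heads*qkv_dim, model_dim]
  // matmul; the sum over heads falls out of the shared inner axis.
  std::array<float, kModelDim> matmul{};
  for (size_t nh = 0; nh < kHeads * kQkvDim; ++nh) {
    for (size_t d = 0; d < kModelDim; ++d) {
      matmul[d] += encoded[nh] * w[nh * kModelDim + d];
    }
  }

  // Both columns print identical values.
  for (size_t d = 0; d < kModelDim; ++d) {
    std::printf("d=%zu  einsum=%.6f  matmul=%.6f\n", d, einsum[d], matmul[d]);
  }
  return 0;
}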
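
Regarding the first hunk, the new include lands in the block marked `// After highway.h` because, under Highway's usual -inl.h convention, such headers hold target-dependent code and may only be included after hwy/highway.h has established the active SIMD target; the `HWY_BEFORE_NAMESPACE();` line and the `}  // namespace HWY_NAMESPACE` closer visible in the hunks bracket the per-target code of this translation unit. A rough skeleton of that layout, assuming the standard Highway pattern rather than quoting the actual file:

// Rough skeleton of the file layout the first hunk edits (standard Highway
// -inl.h convention; an illustration, not a quote of the source file).
#include "hwy/highway.h"
// After highway.h: -inl.h headers are target-dependent, so they must come
// after the target has been set up above.
#include "gemma/gemma-inl.h"

HWY_BEFORE_NAMESPACE();  // Enables target-specific codegen attributes.
namespace gcpp {
namespace HWY_NAMESPACE {  // Expands to a per-target namespace.

// Target-dependent functions (GemmaAttention, SumHeads, ...) live here.

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace gcpp
HWY_AFTER_NAMESPACE();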