Merge 61dedf73ed into 5a6895c609

2025-12-09 09:55:42 +00:00 · 2025-12-09 09:55:42 +00:00 · 1a12c4d1a6
parent 5a6895c609 61dedf73ed
commit 1a12c4d1a6
5 changed files with 17 additions and 8 deletions
--- a/gemma/activations.h
+++ b/gemma/activations.h
@ -25,6 +25,7 @@

 #include "gemma/configs.h"  // ModelConfig
 #include "gemma/gemma_args.h"  // AttentionImpl
+#include "gemma/kv_cache.h"
 #include "ops/ops.h"        // CreateInvTimescale
 #include "util/basics.h"    // BF16
 #include "util/mat.h"       // MatStorageT
--- a/gemma/attention.cc
+++ b/gemma/attention.cc
@ -321,9 +321,8 @@ static HWY_INLINE void ComputeQKV(size_t num_tokens, const size_t layer_idx,

 // Sums encoded (`att_out`) over num_heads (`layer_config.heads`) and
 // head_dim (`qkv_dim`) into output (`layer_out`).
-static HWY_INLINE void SumHeads(const LayerWeightsPtrs& layer,
-                                AttentionActivationsPtrs& activations,
-                                MatMulEnv& env) {
+void SumHeads(const LayerWeightsPtrs& layer,
+              AttentionActivationsPtrs& activations, MatMulEnv& env) {
  GCPP_ZONE(env.ctx, hwy::Profiler::GlobalIdx(), Zones::kGenAttentionSumHeads);
  const LayerConfig& layer_config = layer.layer_config;
  (void)layer_config;  // For HWY_DASSERT
--- a/gemma/attention.h
+++ b/gemma/attention.h
@ -51,6 +51,8 @@ namespace gcpp {
                      const LayerWeightsPtrs& layer,                          \
                      AttentionActivationsPtrs& activations, QBatch& qbatch,  \
                      MatMulEnv& env, int flags);                             \
+  void SumHeads(const LayerWeightsPtrs& layer,                                \
+                AttentionActivationsPtrs& activations, MatMulEnv& env);       \
  /* NOLINTNEXTLINE(google-readability-namespace-comments) */                 \
  }  // namespace NAMESPACE

--- a/gemma/flash_attention.cc
+++ b/gemma/flash_attention.cc
@ -425,9 +425,14 @@ float HWY_INLINE SingleFlashAttentionRowVector(DF df, VF& x, float& old_max,
  float scale = old_d * std::exp(old_max - m);
  old_d = hn::ReduceSum(df, x) + scale;
  old_max = m;
-  float one_over_d = 1.0f / old_d;
-  scale *= one_over_d;
-  x = hn::Mul(x, hn::Set(df, one_over_d));
+  if (old_d > 0.0f) {
+    const float one_over_d = 1.0f / old_d;
+    scale *= one_over_d;
+    x = hn::Mul(x, hn::Set(df, one_over_d));
+  } else {
+    scale = 0.0f;
+    x = hn::Zero(df);
+  }
  return scale;
 }

--- a/gemma/gemma.cc
+++ b/gemma/gemma.cc
@ -519,8 +519,10 @@ static size_t PrefillTBatchOrQBatch(const ModelConfig& config,
    HWY_ASSERT(qbatch.KV(qi).SeqLen() == seq_len);
  }
  if (max_prompt_size > seq_len) {
-    HWY_ABORT("max_prompt_size = %zu, increase --seq_len to at least that.",
-              max_prompt_size);
+    HWY_ABORT(
+        "max_prompt_size = %zu, seq_len = %zu, increase --seq_len to at least "
+        "that.",
+        max_prompt_size, seq_len);
  }
  HWY_ASSERT(activations.attention.div_seq_len.GetDivisor() == seq_len);