mirror of https://github.com/google/gemma.cpp.git
Introduce attention implementation configurability.
PiperOrigin-RevId: 828971705
commit 35e9f9f05f
parent 091b4567c9

@@ -524,6 +524,7 @@ cc_library(
     deps = [
+        ":args",
         ":basics",
         ":configs",
         ":mat",
         "//io",
         "@highway//:hwy",

@@ -24,6 +24,7 @@
 #include <vector>
 
 #include "gemma/configs.h"  // ModelConfig
+#include "gemma/gemma_args.h"  // AttentionImpl
 #include "ops/ops.h"  // CreateInvTimescale
 #include "util/basics.h"  // BF16
 #include "util/mat.h"  // MatStorageT

@@ -179,8 +180,8 @@ struct AttentionActivationsPtrs {
 };
 
 struct Activations {
-  Activations(const ModelConfig& config, size_t batch_size, size_t seq_len,
-              ThreadingContext& ctx,
+  Activations(const RuntimeConfig& runtime_config, const ModelConfig& config,
+              size_t batch_size, size_t seq_len, ThreadingContext& ctx,
               std::vector<hwy::AlignedFreeUniquePtr<uint8_t*[]>>& row_ptrs)
       : layer_config(config.layer_configs[0]),

@@ -199,6 +200,7 @@ struct Activations {
         ffw_out(
             MatFactory("ffw_out", batch_size, config.model_dim, ctx.allocator)),
 
+        attention_impl(runtime_config.attention_impl),
         attention_storage(config, layer_config, batch_size, seq_len,
                           ctx.allocator, row_ptrs),
         attention(config, seq_len, attention_storage) {

@@ -248,6 +250,8 @@ struct Activations {
   MatStorageT<BF16> C2;
   MatStorageT<float> ffw_out;
 
+  AttentionImpl attention_impl;
+
   AttentionActivations attention_storage;
   AttentionActivationsPtrs attention;
 };

@@ -80,6 +80,34 @@ static inline bool EnumValid(LayerAttentionType type) {
   return type == LayerAttentionType::kGemma || type == LayerAttentionType::kVit;
 }
 
+enum class AttentionImpl {
+  kOld,
+  kFlash,
+};
+
+/*
+ * Returns a bitmask of flags to pass to attention functions based on the
+ * attention implementation selected.
+ *
+ * If `hwy_native_dot_bf16` is true, the function will use the old attention
+ * implementation, ignoring `impl`.
+ *
+ * `hwy_native_dot_bf16` needs to be passed in because the HWY_NATIVE_DOT_BF16
+ * macro is not available outside of Highway-instrumented translation units and
+ * cannot be made accessible from .h files.
+ */
+static inline int AttentionImplToFlags(AttentionImpl impl,
+                                       int hwy_native_dot_bf16) {
+  if (hwy_native_dot_bf16) return kAttentionUseOld;
+
+  switch (impl) {
+    case AttentionImpl::kOld:
+      return kAttentionUseOld;
+    case AttentionImpl::kFlash:
+      return 0;
+  }
+}
+
 // Post attention and ffw normalization type.
 enum class PostNormType {
   None,
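
For reference, the mapping above can be summarized with a few example calls. This is a sketch only, not part of the diff; it assumes the header introduced above is included so that AttentionImpl, AttentionImplToFlags and the pre-existing kAttentionUseOld flag are in scope.

  // Sketch only (not part of the commit): expected flag values for each combination.
  const int use_flash = AttentionImplToFlags(AttentionImpl::kFlash, /*hwy_native_dot_bf16=*/0);  // 0: FlashAttention path
  const int use_old = AttentionImplToFlags(AttentionImpl::kOld, /*hwy_native_dot_bf16=*/0);      // kAttentionUseOld
  const int bf16_override = AttentionImplToFlags(AttentionImpl::kFlash, /*hwy_native_dot_bf16=*/1);  // kAttentionUseOld; `impl` is ignored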

@@ -112,8 +112,9 @@ void TestFlashAttention(size_t target_parallelism) {
   RuntimeConfig runtime_config;
   KVCache kv_cache(config, inference_args, ctx.allocator);
   MatMulEnv env(ctx);
-  Activations activations(config, runtime_config.prefill_tbatch_size,
-                          kv_cache.SeqLen(), env.ctx, env.row_ptrs);
+  Activations activations(runtime_config, config,
+                          runtime_config.prefill_tbatch_size, kv_cache.SeqLen(),
+                          env.ctx, env.row_ptrs);
   std::vector<int> tokens(kOuter);
   std::iota(tokens.begin(), tokens.end(), 1);
   PromptTokens prompt(tokens);

@@ -73,10 +73,12 @@ namespace HWY_NAMESPACE {
 void Attention(LayerAttentionType type, const size_t num_tokens,
                const size_t layer_idx, const LayerWeightsPtrs& layer,
                Activations& activations, QBatch& qbatch, MatMulEnv& env) {
 
   if (type == LayerAttentionType::kGemma) {
     // TODO: remove flag to enable FlashAttention.
-    GemmaAttention(num_tokens, layer_idx, layer, activations.attention, qbatch,
-                   env, HWY_NATIVE_DOT_BF16 ? kAttentionUseOld : 0);
+    GemmaAttention(
+        num_tokens, layer_idx, layer, activations.attention, qbatch, env,
+        AttentionImplToFlags(activations.attention_impl, HWY_NATIVE_DOT_BF16));
   }
 }
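
For reference, combined with AttentionImplToFlags the dispatch above reduces to the hand-expanded selection below. This is a sketch only, not code from the commit; it reuses the names already visible in this hunk.

    // Sketch only: equivalent selection logic, expanded for clarity.
    int flags = 0;
    if (HWY_NATIVE_DOT_BF16 || activations.attention_impl == AttentionImpl::kOld) {
      flags = kAttentionUseOld;  // pre-FlashAttention code path
    }
    GemmaAttention(num_tokens, layer_idx, layer, activations.attention, qbatch,
                   env, flags);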

@@ -573,8 +575,9 @@ void GenerateSingleT(const PromptTokens& prompt, size_t pos, size_t prefix_end,
                      const AesCtrEngine& engine, const WeightsPtrs& weights,
                      KVCache& kv_cache, MatMulEnv& env,
                      TimingInfo& timing_info) {
-  Activations activations(config, runtime_config.prefill_tbatch_size,
-                          kv_cache.SeqLen(), env.ctx, env.row_ptrs);
+  Activations activations(runtime_config, config,
+                          runtime_config.prefill_tbatch_size, kv_cache.SeqLen(),
+                          env.ctx, env.row_ptrs);
 
   AllQueries all_queries(prompt, pos, prefix_end,
                          hwy::Span<KVCache>(&kv_cache, 1));

@@ -592,7 +595,7 @@ void GenerateBatchT(const ModelConfig& config,
                     TimingInfo& timing_info) {
   const size_t max_batch_size = HWY_MAX(runtime_config.decode_qbatch_size,
                                         runtime_config.prefill_tbatch_size);
-  Activations activations(config, max_batch_size,
+  Activations activations(runtime_config, config, max_batch_size,
                           all_queries[0].kv_cache.SeqLen(), env.ctx,
                           env.row_ptrs);

@@ -617,8 +620,8 @@ void GenerateImageTokensT(const ModelConfig& config,
   const size_t num_tokens = vit_config.max_seq_len;
   prefill_runtime_config.prefill_tbatch_size =
       num_tokens / (vit_config.pool_dim * vit_config.pool_dim);
-  Activations prefill_activations(vit_config, num_tokens, num_tokens, env.ctx,
-                                  env.row_ptrs);
+  Activations prefill_activations(runtime_config, vit_config, num_tokens,
+                                  num_tokens, env.ctx, env.row_ptrs);
   // Weights are for the full PaliGemma model, not just the ViT part.
   PrefillVit(config, weights, prefill_runtime_config, image, image_tokens,
              prefill_activations, env);

@@ -24,6 +24,7 @@
 #include <functional>
 #include <string>
 
+#include "gemma/configs.h"
 #include "io/io.h"  // Path
 #include "util/args.h"
 #include "util/basics.h"  // Tristate

@@ -139,6 +140,9 @@ struct RuntimeConfig {
 
   int verbosity;  // Controls verbosity of printed messages.
 
+  // Which attention implementation to use.
+  AttentionImpl attention_impl = AttentionImpl::kFlash;
+
   // Functions operating on the generated tokens.
   StreamFunc stream_token;
   BatchStreamFunc batch_stream_token;
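
For reference, a minimal caller-side sketch of the new knob, assembled from the hunks above. `config`, `kv_cache` and `env` stand in for objects already constructed at the existing call sites (as in the test hunk) and are assumptions here, not new API.

  // Sketch only: opt into the old attention kernel; the default remains kFlash.
  RuntimeConfig runtime_config;
  runtime_config.attention_impl = AttentionImpl::kOld;
  Activations activations(runtime_config, config, runtime_config.prefill_tbatch_size,
                          kv_cache.SeqLen(), env.ctx, env.row_ptrs);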