From 7b55d41f467145be11ed80312037ab3cb9169c7c Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 10 Feb 2026 07:55:17 -0800 Subject: [PATCH] Rewrote flash attention to use BF16, transpose k and v, rewrote the task distribution, increase parallelism on decode, and use double the registers for the core of flash attention. PiperOrigin-RevId: 868146247 --- BUILD.bazel | 40 +- evals/benchmark_helper.cc | 6 +- evals/benchmark_helper.h | 2 + evals/gemma_batch_bench.cc | 24 +- evals/testdata/holiday_story.txt | 10 + evals/testdata/quark_1.txt | 21 + evals/testdata/quark_2.txt | 90 + evals/testdata/special_relativity.txt | 3984 +++++++++++++++++++++++++ evals/testdata/standard_model.txt | 851 ++++++ evals/wheat_from_chaff_test.cc | 179 ++ gemma/activations.h | 60 +- gemma/attention.cc | 71 +- gemma/attention.h | 10 +- gemma/attention_test.cc | 118 +- gemma/configs.cc | 16 +- gemma/configs.h | 5 +- gemma/flash_attention.cc | 1595 ++++++---- gemma/flash_attention.h | 11 +- gemma/flash_attention_test.cc | 48 +- gemma/flash_structs.h | 48 +- gemma/gemma.cc | 8 +- gemma/kv_cache.cc | 13 + gemma/kv_cache.h | 27 +- ops/ops-inl.h | 587 +--- util/mat.h | 11 + util/test_util.h | 3 +- util/zones.cc | 12 +- util/zones.h | 6 +- 28 files changed, 6662 insertions(+), 1194 deletions(-) create mode 100644 evals/testdata/holiday_story.txt create mode 100644 evals/testdata/quark_1.txt create mode 100644 evals/testdata/quark_2.txt create mode 100644 evals/testdata/special_relativity.txt create mode 100644 evals/testdata/standard_model.txt create mode 100644 evals/wheat_from_chaff_test.cc diff --git a/BUILD.bazel b/BUILD.bazel index 9eb60a4..6979bdc 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -547,6 +547,7 @@ cc_library( deps = [ ":basics", ":configs", + ":flash_structs", ":gemma_args", ":kv_cache", ":mat", @@ -594,6 +595,11 @@ cc_test( INTERNAL_DEPS = [] +cc_library( + name = "flash_structs", + hdrs = ["gemma/flash_structs.h"], +) + cc_library( name = "attention", srcs = [ @@ -603,7 +609,6 
@@ cc_library( hdrs = [ "gemma/attention.h", "gemma/flash_attention.h", - "gemma/flash_structs.h", ], textual_hdrs = [ "gemma/gemma-inl.h", @@ -612,6 +617,7 @@ cc_library( ":activations", ":basics", ":configs", + ":flash_structs", ":kv_cache", ":mat", ":matmul", @@ -822,6 +828,38 @@ cc_test( ], ) +cc_test( + name = "wheat_from_chaff_test", + srcs = ["evals/wheat_from_chaff_test.cc"], + data = [ + "evals/testdata/google/big_bang_theory.txt", + "evals/testdata/google/black_hole.txt", + "evals/testdata/google/general_relativity.txt", + "evals/testdata/google/qed.txt", + "evals/testdata/holiday_story.txt", + "evals/testdata/quark_1.txt", + "evals/testdata/quark_2.txt", + "evals/testdata/special_relativity.txt", + "evals/testdata/standard_model.txt", + ], + linkstatic = True, + # Requires model files + tags = [ + "local", + "manual", + "no_tap", + ], + deps = [ + ":benchmark_helper", + ":configs", + ":gemma_lib", + "@googletest//:gtest_main", # buildcleaner: keep + "//io", + "@highway//:abort_header_only", + "@highway//:hwy_test_util", + ], +) + cc_binary( name = "gemma", srcs = ["gemma/run.cc"], diff --git a/evals/benchmark_helper.cc b/evals/benchmark_helper.cc index 30d364f..ad6bd08 100644 --- a/evals/benchmark_helper.cc +++ b/evals/benchmark_helper.cc @@ -150,7 +150,11 @@ std::vector GemmaEnv::BatchQueryModel( QueryResult GemmaEnv::QueryModel(const std::string& input) { const std::vector prompt = WrapAndTokenize(input); - return QueryModel(prompt); + auto result = QueryModel(prompt); + fprintf(stderr, "prompt size: %zu, response size: %zu, total tokens: %zu\n", + prompt.size(), result.tokens_generated - prompt.size(), + result.tokens_generated); + return result; } QueryResultAndMetrics GemmaEnv::BatchQueryModelWithMetrics( diff --git a/evals/benchmark_helper.h b/evals/benchmark_helper.h index 203174c..85f0d21 100644 --- a/evals/benchmark_helper.h +++ b/evals/benchmark_helper.h @@ -62,6 +62,8 @@ class GemmaEnv { static_cast(max_generated_tokens); } + void 
PrintProfileResults() { ctx_.profiler.PrintResults(); } + std::vector Tokenize(const std::string& input) const { std::vector tokens; HWY_ASSERT(gemma_.Tokenizer().Encode(input, &tokens)); diff --git a/evals/gemma_batch_bench.cc b/evals/gemma_batch_bench.cc index dd9cb45..a780aa4 100644 --- a/evals/gemma_batch_bench.cc +++ b/evals/gemma_batch_bench.cc @@ -37,7 +37,8 @@ GemmaEnv* s_env = nullptr; class GemmaBatchBench : public ::testing::Test { protected: std::vector BatchGemmaReply( - const std::vector& inputs) { + const std::vector& inputs, AttentionImpl attention_impl) { + s_env->MutableConfig().attention_impl = attention_impl; s_env->MutableConfig().temperature = 0.0f; // deterministic s_env->MutableConfig().verbosity = 2; std::vector replies; @@ -128,16 +129,19 @@ std::vector GenerateInputs() { TEST_F(GemmaBatchBench, RandomQuestionsBatched) { s_env->SetMaxGeneratedTokens(12); const std::vector inputs = GenerateInputs(); - - // Run multiple times so that auto-tuning is closer to complete. - for (size_t rep = 0; rep < 4; ++rep) { - std::vector responses = BatchGemmaReply(inputs); - for (size_t i = 0; i < HWY_MIN(hwy::Unpredictable1() * 3, responses.size()); - ++i) { - fprintf(stderr, "Rep %zu batch answer %zu '%s'\n\n", rep, i, - responses[i].c_str()); + const AttentionImpl modes[] = {AttentionImpl::kOld, AttentionImpl::kFlash}; + for (const AttentionImpl mode : modes) { + // Run multiple times so that auto-tuning is closer to complete. 
+ fprintf(stderr, "Testing mode %s\n", GetAttentionImplName(mode).c_str()); + for (size_t rep = 0; rep < 4; ++rep) { + std::vector responses = BatchGemmaReply(inputs, mode); + for (size_t i = 0; + i < HWY_MIN(hwy::Unpredictable1() * 3, responses.size()); ++i) { + fprintf(stderr, "Rep %zu batch answer %zu '%s'\n\n", rep, i, + responses[i].c_str()); + } + PROFILER_PRINT_RESULTS(); + } - PROFILER_PRINT_RESULTS(); } } diff --git a/evals/testdata/holiday_story.txt b/evals/testdata/holiday_story.txt new file mode 100644 index 0000000..997cbed --- /dev/null +++ b/evals/testdata/holiday_story.txt @@ -0,0 +1,10 @@ +Albert and Marcia were on holiday. Their parents had brought them to the beach. +Albert was generally unimpressed with beaches, as he would rather explore a dark forest and see the variety of mosses and fungi that grow in the damp conditions. +On the other hand, Marcia loved to build enormous sand castles. +Albert enjoyed collecting limpet shells to decorate the outer walls of the turrets, which he secretly thought made them look like daleks. +Whilst digging sand for building, Marcia always liked to dig deep, to see if she could get to water coming through the sand from the sea. +When the castle was nearly complete, and Marcia needed more sand, she hit a large piece of rusty metal. +Curious as to what it was, Marcia kept digging to try to expose all of it, but it was very big and hard to get at as it was so deep in the sand. +Excited by the prospect of finding something unusual in the sand, Albert joined in to help dig out the entire object. +Almost an hour later, they had exposed most of a ship’s anchor. +During the excavation a crowd of onlookers had formed around them, who then proceeded to take selfies in front of the unusual piece of beach litter. 
diff --git a/evals/testdata/quark_1.txt b/evals/testdata/quark_1.txt new file mode 100644 index 0000000..7706631 --- /dev/null +++ b/evals/testdata/quark_1.txt @@ -0,0 +1,21 @@ +Text from https://en.wikipedia.org/wiki/Quark is licensed under Creative Commons Attribution-ShareAlike 4.0 License; (https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License) + +Quark +From Wikipedia, the free encyclopedia +(Redirected from Quarks) +This article is about the elementary particle and its antiparticle. For other uses, see Quark (disambiguation). +⁠ +A quark (/ˈkwɔːrk, ˈkwɑːrk/ ⓘ) is a type of elementary particle and a fundamental constituent of matter. Quarks combine to form composite particles called hadrons, the most stable of which are protons and neutrons, the components of atomic nuclei.[1] All commonly observable matter is composed of up quarks, down quarks and electrons. Owing to a phenomenon known as color confinement, quarks are never found in isolation; they can be found only within hadrons, which include baryons (such as protons and neutrons) and mesons, or in quark–gluon plasmas.[2][3][nb 1] For this reason, much of what is known about quarks has been drawn from observations of hadrons. + +Quarks have various intrinsic properties, including electric charge, mass, color charge, and spin. They are the only elementary particles in the Standard Model of particle physics to experience all four fundamental interactions, also known as fundamental forces (electromagnetism, gravitation, strong interaction, and weak interaction), as well as the only known particles whose electric charges are not integer multiples of the elementary charge. + +There are six types, known as flavors, of quarks: up, down, charm, strange, top, and bottom.[4] Up and down quarks have the lowest masses of all quarks. 
The heavier quarks rapidly change into up and down quarks through a process of particle decay: the transformation from a higher mass state to a lower mass state. Because of this, up and down quarks are generally stable and the most common in the universe, whereas strange, charm, bottom, and top quarks can only be produced in high energy collisions (such as those involving cosmic rays and in particle accelerators). For every quark flavor there is a corresponding type of antiparticle, known as an antiquark, that differs from the quark only in that some of its properties (such as the electric charge) have equal magnitude but opposite sign. + +The quark model was independently proposed by physicists Murray Gell-Mann and George Zweig in 1964.[5] Quarks were introduced as parts of an ordering scheme for hadrons, and there was little evidence for their physical existence until deep inelastic scattering experiments at the Stanford Linear Accelerator Center in 1968.[6][7] Accelerator program experiments have provided evidence for all six flavors. The top quark, first observed at Fermilab in 1995, was the last to be discovered.[5] + +Classification +See also: Standard Model +A four-by-four table of particles. Columns are three generations of matter (fermions) and one of forces (bosons). In the first three columns, two rows contain quarks and two leptons. The top two rows' columns contain up (u) and down (d) quarks, charm (c) and strange (s) quarks, top (t) and bottom (b) quarks, and photon (γ) and gluon (g), respectively. The bottom two rows' columns contain electron neutrino (ν sub e) and electron (e), muon neutrino (ν sub μ) and muon (μ), and tau neutrino (ν sub τ) and tau (τ), and Z sup 0 and W sup ± weak force. Mass, charge, and spin are listed for each particle. +Six of the particles in the Standard Model are quarks (shown in purple). Each of the first three columns forms a generation of matter. 
+The Standard Model is the theoretical framework describing all the known elementary particles. This model contains six flavors of quarks (q), named up (u), down (d), strange (s), charm (c), bottom (b), and top (t).[4] Antiparticles of quarks are called antiquarks, and are denoted by a bar over the symbol for the corresponding quark, such as u for an up antiquark. As with antimatter in general, antiquarks have the same mass, mean lifetime, and spin as their respective quarks, but the electric charge and other charges have the opposite sign.[8] + diff --git a/evals/testdata/quark_2.txt b/evals/testdata/quark_2.txt new file mode 100644 index 0000000..adab5b0 --- /dev/null +++ b/evals/testdata/quark_2.txt @@ -0,0 +1,90 @@ +Text from https://en.wikipedia.org/wiki/Quark is licensed under Creative Commons Attribution-ShareAlike 4.0 License; (https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License) + +Quark +From Wikipedia, the free encyclopedia +(Redirected from Quarks) +This article is about the elementary particle and its antiparticle. For other uses, see Quark (disambiguation). +Quark +Three colored balls (symbolizing quarks) connected pairwise by springs (symbolizing gluons), all inside a gray circle (symbolizing a proton). The colors of the balls are red, green, and blue, to parallel each quark's color charge. The red and blue balls are labeled "u" (for "up" quark) and the green one is labeled "d" (for "down" quark). +A proton is composed of two up quarks, one down quark, and the gluons that mediate the forces "binding" them together. The color assignment of individual quarks is arbitrary, but all three colors must be present; red, blue and green are used as an analogy to the primary colors that together produce a white color. 
+Composition elementary particle +Statistics fermionic +Generation 1st, 2nd, 3rd +Interactions strong, weak, electromagnetic, gravitation +Symbol q +Antiparticle antiquark (q) +Theorized +Murray Gell-Mann (1964) +George Zweig (1964) +Discovered SLAC (c. 1968) +Types 6 (up, down, strange, charm, bottom, and top) +⁠ +A quark (/ˈkwɔːrk, ˈkwɑːrk/ ⓘ) is a type of elementary particle and a fundamental constituent of matter. Quarks combine to form composite particles called hadrons, the most stable of which are protons and neutrons, the components of atomic nuclei.[1] All commonly observable matter is composed of up quarks, down quarks and electrons. Owing to a phenomenon known as color confinement, quarks are never found in isolation; they can be found only within hadrons, which include baryons (such as protons and neutrons) and mesons, or in quark–gluon plasmas.[2][3][nb 1] For this reason, much of what is known about quarks has been drawn from observations of hadrons. + +Quarks have various intrinsic properties, including electric charge, mass, color charge, and spin. They are the only elementary particles in the Standard Model of particle physics to experience all four fundamental interactions, also known as fundamental forces (electromagnetism, gravitation, strong interaction, and weak interaction), as well as the only known particles whose electric charges are not integer multiples of the elementary charge. + +There are six types, known as flavors, of quarks: up, down, charm, strange, top, and bottom.[4] Up and down quarks have the lowest masses of all quarks. The heavier quarks rapidly change into up and down quarks through a process of particle decay: the transformation from a higher mass state to a lower mass state. 
Because of this, up and down quarks are generally stable and the most common in the universe, whereas strange, charm, bottom, and top quarks can only be produced in high energy collisions (such as those involving cosmic rays and in particle accelerators). For every quark flavor there is a corresponding type of antiparticle, known as an antiquark, that differs from the quark only in that some of its properties (such as the electric charge) have equal magnitude but opposite sign. + +The quark model was independently proposed by physicists Murray Gell-Mann and George Zweig in 1964.[5] Quarks were introduced as parts of an ordering scheme for hadrons, and there was little evidence for their physical existence until deep inelastic scattering experiments at the Stanford Linear Accelerator Center in 1968.[6][7] Accelerator program experiments have provided evidence for all six flavors. The top quark, first observed at Fermilab in 1995, was the last to be discovered.[5] + +Classification +See also: Standard Model +A four-by-four table of particles. Columns are three generations of matter (fermions) and one of forces (bosons). In the first three columns, two rows contain quarks and two leptons. The top two rows' columns contain up (u) and down (d) quarks, charm (c) and strange (s) quarks, top (t) and bottom (b) quarks, and photon (γ) and gluon (g), respectively. The bottom two rows' columns contain electron neutrino (ν sub e) and electron (e), muon neutrino (ν sub μ) and muon (μ), and tau neutrino (ν sub τ) and tau (τ), and Z sup 0 and W sup ± weak force. Mass, charge, and spin are listed for each particle. +Six of the particles in the Standard Model are quarks (shown in purple). Each of the first three columns forms a generation of matter. +The Standard Model is the theoretical framework describing all the known elementary particles. 
This model contains six flavors of quarks (q), named up (u), down (d), strange (s), charm (c), bottom (b), and top (t).[4] Antiparticles of quarks are called antiquarks, and are denoted by a bar over the symbol for the corresponding quark, such as u for an up antiquark. As with antimatter in general, antiquarks have the same mass, mean lifetime, and spin as their respective quarks, but the electric charge and other charges have the opposite sign.[8] + +Quarks are spin-⁠ +1 +/ +2 +⁠ particles, which means they are fermions according to the spin–statistics theorem. They are subject to the Pauli exclusion principle, which states that no two identical fermions can simultaneously occupy the same quantum state. This is in contrast to bosons (particles with integer spin), of which any number can be in the same state.[9] Unlike leptons, quarks possess color charge, which causes them to engage in the strong interaction. The resulting attraction between different quarks causes the formation of composite particles known as hadrons (see § Strong interaction and color charge below). + +The quarks that determine the quantum numbers of hadrons are called valence quarks; apart from these, any hadron may contain an indefinite number of virtual "sea" quarks, antiquarks, and gluons, which do not influence its quantum numbers.[10] There are two families of hadrons: baryons, with three valence quarks, and mesons, with a valence quark and an antiquark.[11] The most common baryons are the proton and the neutron, the building blocks of the atomic nucleus.[12] A great number of hadrons are known (see list of baryons and list of mesons), most of them differentiated by their quark content and the properties these constituent quarks confer. 
The existence of "exotic" hadrons with more valence quarks, such as tetraquarks (qqqq) and pentaquarks (qqqqq), was conjectured from the beginnings of the quark model[13] but not discovered until the early 21st century.[14][15][16][17] + +Elementary fermions are grouped into three generations, each comprising two leptons and two quarks. The first generation includes up and down quarks, the second strange and charm quarks, and the third bottom and top quarks. All searches for a fourth generation of quarks and other elementary fermions have failed,[18][19] and there is strong indirect evidence that no more than three generations exist.[nb 2][20][21][22] Particles in higher generations generally have greater mass and less stability, causing them to decay into lower-generation particles by means of weak interactions. Only first-generation (up and down) quarks occur commonly in nature. Heavier quarks can only be created in high-energy collisions (such as in those involving cosmic rays), and decay quickly; however, they are thought to have been present during the first fractions of a second after the Big Bang, when the universe was in an extremely hot and dense phase (the quark epoch). Studies of heavier quarks are conducted in artificially created conditions, such as in particle accelerators.[23] + +Having electric charge, mass, color charge, and flavor, quarks are the only known elementary particles that engage in all four fundamental interactions of contemporary physics: electromagnetism, gravitation, strong interaction, and weak interaction.[12] Gravitation is too weak to be relevant to individual particle interactions except at extremes of energy (Planck energy) and distance scales (Planck distance). However, since no successful quantum theory of gravity exists, gravitation is not described by the Standard Model. + +See the table of properties below for a more complete overview of the six quark flavors' properties. 
+ +History + +Murray Gell-Mann (2007) + +George Zweig (2015) +The quark model was independently proposed by physicists Murray Gell-Mann[24] and George Zweig[25][26] in 1964.[5] The proposal came shortly after Gell-Mann's 1961 formulation of a particle classification system known as the Eightfold Way – or, in more technical terms, SU(3) flavor symmetry, streamlining its structure.[27] Physicist Yuval Ne'eman had independently developed a scheme similar to the Eightfold Way in the same year.[28][29] An early attempt at constituent organization was available in the Sakata model. + +At the time of the quark theory's inception, the "particle zoo" included a multitude of hadrons, among other particles. Gell-Mann and Zweig posited that they were not elementary particles, but were instead composed of combinations of quarks and antiquarks. Their model involved three flavors of quarks, up, down, and strange, to which they ascribed properties such as spin and electric charge.[24][25][26] The initial reaction of the physics community to the proposal was mixed. There was particular contention about whether the quark was a physical entity or a mere abstraction used to explain concepts that were not fully understood at the time.[30] + +In less than a year, extensions to the Gell-Mann–Zweig model were proposed. Sheldon Glashow and James Bjorken predicted the existence of a fourth flavor of quark, which they called charm. 
The addition was proposed because it allowed for a better description of the weak interaction (the mechanism that allows quarks to decay), equalized the number of known quarks with the number of known leptons, and implied a mass formula that correctly reproduced the masses of the known mesons.[31] + +Deep inelastic scattering experiments conducted in 1968 at the Stanford Linear Accelerator Center (SLAC) and published on October 20, 1969, showed that the proton contained much smaller, point-like objects and was therefore not an elementary particle.[6][7][32] Physicists were reluctant to firmly identify these objects with quarks at the time, instead calling them "partons" – a term coined by Richard Feynman.[33][34][35] The objects that were observed at SLAC would later be identified as up and down quarks as the other flavors were discovered.[36] Nevertheless, "parton" remains in use as a collective term for the constituents of hadrons (quarks, antiquarks, and gluons). Richard Taylor, Henry Kendall and Jerome Friedman received the 1990 Nobel Prize in physics for their work at SLAC. + +Photo of bubble chamber tracks next to diagram of same tracks. A neutrino (unseen in photo) enters from below and collides with a proton, producing a negatively charged muon, three positively charged pions, and one negatively charged pion, as well as a neutral lambda baryon (unseen in photograph). The lambda baryon then decays into a proton and a negative pion, producing a "V" pattern. 
+Photograph of the event that led to the discovery of the Σ++ +c baryon, at the Brookhaven National Laboratory in 1974 +The strange quark's existence was indirectly validated by SLAC's scattering experiments: not only was it a necessary component of Gell-Mann and Zweig's three-quark model, but it provided an explanation for the kaon (K) and pion (π) hadrons discovered in cosmic rays in 1947.[37] + +In a 1970 paper, Glashow, John Iliopoulos and Luciano Maiani presented the GIM mechanism (named from their initials) to explain the experimental non-observation of flavor-changing neutral currents. This theoretical model required the existence of the as-yet undiscovered charm quark.[38][39] The number of supposed quark flavors grew to the current six in 1973, when Makoto Kobayashi and Toshihide Maskawa noted that the experimental observation of CP violation[nb 3][40] could be explained if there were another pair of quarks. + +Charm quarks were produced almost simultaneously by two teams in November 1974 (see November Revolution) – one at SLAC under Burton Richter, and one at Brookhaven National Laboratory under Samuel Ting. The charm quarks were observed bound with charm antiquarks in mesons. The two parties had assigned the discovered meson two different symbols, J and ψ; thus, it became formally known as the J/ψ meson. The discovery finally convinced the physics community of the quark model's validity.[35] + +In the following years a number of suggestions appeared for extending the quark model to six quarks. Of these, the 1975 paper by Haim Harari[41] was the first to coin the terms top and bottom for the additional quarks.[42] + +In 1977, the bottom quark was observed by a team at Fermilab led by Leon Lederman.[43][44] This was a strong indicator of the top quark's existence: without the top quark, the bottom quark would have been without a partner. 
It was not until 1995 that the top quark was finally observed, also by the CDF[45] and DØ[46] teams at Fermilab.[5] It had a mass much larger than expected,[47] almost as large as that of a gold atom.[48] + +Etymology +For some time, Gell-Mann was undecided on an actual spelling for the term he intended to coin, until he found the word quark in James Joyce's 1939 book Finnegans Wake:[49] + +– Three quarks for Muster Mark! +Sure he hasn't got much of a bark +And sure any he has it's all beside the mark. + +The word quark is an old English word meaning to croak[50] and the above-quoted lines are about a bird choir mocking king Mark of Cornwall in the legend of Tristan and Iseult.[51] Especially in the German-speaking parts of the world there is a widespread legend, however, that Joyce had taken it from the word Quark,[52] a German word of Slavic origin which denotes a curd cheese,[53] but is also a colloquial term for "trivial nonsense".[54] In the legend it is said that he had heard it on a journey to Germany at a farmers' market in Freiburg.[55][56] Some authors, however, defend a possible German origin of Joyce's word quark.[57] Gell-Mann went into further detail regarding the name of the quark in his 1994 book The Quark and the Jaguar:[58] + +In 1963, when I assigned the name "quark" to the fundamental constituents of the nucleon, I had the sound first, without the spelling, which could have been "kwork". Then, in one of my occasional perusals of Finnegans Wake, by James Joyce, I came across the word "quark" in the phrase "Three quarks for Muster Mark". Since "quark" (meaning, for one thing, the cry of the gull) was clearly intended to rhyme with "Mark", as well as "bark" and other such words, I had to find an excuse to pronounce it as "kwork". But the book represents the dream of a publican named Humphrey Chimpden Earwicker. Words in the text are typically drawn from several sources at once, like the "portmanteau" words in Through the Looking-Glass. 
From time to time, phrases occur in the book that are partially determined by calls for drinks at the bar. I argued, therefore, that perhaps one of the multiple sources of the cry "Three quarks for Muster Mark" might be "Three quarts for Mister Mark", in which case the pronunciation "kwork" would not be totally unjustified. In any case, the number three fitted perfectly the way quarks occur in nature. + +Zweig preferred the name ace for the particle he had theorized, but Gell-Mann's terminology came to prominence once the quark model had been commonly accepted.[59] + +The quark flavors were given their names for several reasons. The up and down quarks are named after the up and down components of isospin, which they carry.[60] Strange quarks were given their name because they were discovered to be components of the strange particles discovered in cosmic rays years before the quark model was proposed; these particles were deemed "strange" because they had unusually long lifetimes.[61] Glashow, who co-proposed the charm quark with Bjorken, is quoted as saying, "We called our construct the 'charmed quark', for we were fascinated and pleased by the symmetry it brought to the subnuclear world."[62] The names "top" and "bottom", coined by Harari, were chosen because they are "logical partners for up and down quarks".[41][42][61] Alternative names for top and bottom quarks are "truth" and "beauty" respectively,[nb 4] but these names have somewhat fallen out of use.[66] While "truth" never did catch on, accelerator complexes devoted to massive production of bottom quarks are sometimes called "beauty factories".[67] + diff --git a/evals/testdata/special_relativity.txt b/evals/testdata/special_relativity.txt new file mode 100644 index 0000000..aabffe3 --- /dev/null +++ b/evals/testdata/special_relativity.txt @@ -0,0 +1,3984 @@ +Text from https://en.wikipedia.org/wiki/Special_relativity is licensed under Creative Commons Attribution-ShareAlike 4.0 License; 
(https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License) + +Special relativity + +Article +Talk +Read +Edit +View history + +Tools +Appearance hide +Text + +Small + +Standard + +Large +Width + +Standard + +Wide +Color (beta) + +Automatic + +Light + +Dark +From Wikipedia, the free encyclopedia + +Albert Einstein around 1905, the year his paper on special relativity was published +Special relativity +The world line: a diagrammatic representation of spacetime +Principle of relativityTheory of relativityFormulations +Foundations +Consequences +Spacetime +Dynamics +HistoryPrecursors +People +icon Physics portal Category +vte +In physics, the special theory of relativity, or special relativity for short, is a scientific theory of the relationship between space and time. In Albert Einstein's 1905 paper, "On the Electrodynamics of Moving Bodies", the theory is presented as being based on just two postulates:[p 1][1][2] + +The laws of physics are invariant (identical) in all inertial frames of reference (that is, frames of reference with no acceleration). This is known as the principle of relativity. +The speed of light in vacuum is the same for all observers, regardless of the motion of light source or observer. This is known as the principle of light constancy, or the principle of light speed invariance. +The first postulate was first formulated by Galileo Galilei (see Galilean invariance). + +Overview +Relativity is a theory that accurately describes objects moving at speeds far beyond normal experience. Relativity replaces the idea that time flows equally everywhere in the universe with a new concept that time flows differently for every independent object. The flow of time can be expressed by counting ticks on a clock. Moving clocks run slower. At speeds encountered in normal experience, the slow down cannot be observed. 
Two events measured at the same time on a stationary clock occur at different times if measured on moving clocks. Near the speed of light many physical effects can only be understood by including the effects of special relativity.[3] + +Basis +Unusual among modern topics in physics, the theory of special relativity needs only mathematics at high school level and yet it fundamentally alters our understanding, especially our understanding of the concept of time.[3]: ix  Built on just two postulates or assumptions, many interesting consequences follow. + +The two postulates both concern observers moving at a constant speed relative to each other. The first postulate, the § principle of relativity, says the laws of physics do not depend on objects being at absolute rest: for example, an observer on a train sees natural phenomena on that train that look the same whether the train is moving or not.[3]: 5  The second postulate, constant speed of light, says observers in a train station see light travel at the same speed whether they measure light from within the station or light from a moving train. A light signal from the station to the train has the same speed, no matter how fast a train goes.[3]: 25  + +In the theory of special relativity, the two postulates combine to change the definition of "relative speed". Rather than the simple concept of distance traveled divided by time spent, the new theory incorporates the speed of light as the maximum possible speed. 
In special relativity, covering ten times more distance on the ground in the same amount of time according to a moving watch does not result in a speed up as seen from the ground by a factor of ten.[3]: 28  + +Consequences +Special relativity has a wide range of consequences that have been experimentally verified.[4][5] The conceptual effects include: + +The relativity of simultaneity – events that appear simultaneous to one observer may not be simultaneous to an observer in motion[3]: 49  +§ Time dilation – time measured between two events by observers in motion differ +§ Length contraction – distances between two events by observers in motion differ +The § Lorentz transformation of velocities – velocities no longer simply add +Combined with other laws of physics, the two postulates of special relativity predict the equivalence of mass and energy, as expressed in the mass–energy equivalence formula ⁠ +E += +m +c +2 +{\displaystyle E=mc^{2}}⁠, where +c +{\displaystyle c} is the speed of light in vacuum.[6][7] Special relativity replaced the conventional notion of an absolute, universal time with the notion of a time that is local to each observer.[8]: 33  Information about distant objects can arrive no faster than the speed of light so visual observations always report events that have happened in the past. This effect makes visual descriptions of the effects of special relativity especially prone to mistakes.[9] + +Special relativity also has profound technical consequences. A defining feature of special relativity is the replacement of Euclidean geometry with Lorentzian geometry.[10]: 8  Distances in Euclidean geometry are calculated with the Pythagorean theorem and only involved spatial coordinates. In Lorentzian geometry, 'distances' become 'intervals' and include a time coordinate with a minus sign. Unlike spatial distances, the interval between two events has the same value for all observers independent of their relative velocity. 
When comparing two sets of coordinates in relative motion the Lorentz transformation replaces the Galilean transformation of Newtonian mechanics.[10]: 98  Other effects include the relativistic corrections to the Doppler effect and the Thomas precession.[1][2] It also explains how electricity and magnetism are related.[1][2] + +History +Main article: History of special relativity +The principle of relativity, forming one of the two postulates of special relativity, was described by Galileo Galilei in 1632 using a thought experiment involving observing natural phenomena on a moving ship.[11] His conclusions were summarized as Galilean relativity and used as the basis of Newtonian mechanics.[3]: 1  This principle can be expressed as a coordinate transformation, between two coordinate systems. Isaac Newton noted that many transformations, such as those involving rotation or acceleration, will not preserve the observation of physical phenomena. Newton considered only those transformations involving motion with respect to an immovable absolute space, now called transformations between inertial frames.[12]: 17  + +In 1864 James Clerk Maxwell presented a theory of electromagnetism which did not obey Galilean relativity. The theory specifically predicted a constant speed of light in vacuum, no matter the motion (velocity, acceleration, etc.) of the light emitter or receiver or its frequency, wavelength, direction, polarization, or phase. This, as yet untested theory, was thought at the time to be only valid in inertial frames fixed in an aether. 
Numerous experiments followed, attempting to measure the speed of light as Earth moved through the proposed fixed aether, culminating in the 1887 Michelson–Morley experiment which only confirmed the constant speed of light.[12]: 18  + +Several fixes to the aether theory were proposed, with those of George Francis FitzGerald, Hendrik Antoon Lorentz, and Jules Henri Poincaré all pointing in the direction of a result similar to the theory of special relativity. The final important step was taken by Albert Einstein in a paper published on 26 September 1905 titled "On the Electrodynamics of Moving Bodies".[p 1] Einstein applied the Lorentz transformations known to be compatible with Maxwell's equations for electrodynamics to the classical laws of mechanics. This changed Newton's mechanics in situations involving all motions, especially velocities close to that of light[12]: 18  (known as relativistic velocities). + +Another way to describe the advance made by the special theory is to say Einstein extended the Galilean principle so that it accounted for the constant speed of light,[10] a phenomenon that had been observed in the Michelson–Morley experiment. He also postulated that it holds for all the laws of physics, including both the laws of mechanics and of electrodynamics.[13] The theory became essentially complete in 1907, with Hermann Minkowski's papers on spacetime.[14] + +Special relativity has proven to be the most accurate model of motion at any speed when gravitational and quantum effects are negligible.[15][14] Even so, the Newtonian model remains accurate at low velocities relative to the speed of light, for example, everyday motion on Earth.
+ +When updating his 1911 book on relativity, to include general relativity in 1920, Robert Daniel Carmichael called the earlier work the "restricted theory" as a "special case" of the new general theory; he also used the phrase "special theory of relativity".[16] In comparing to the general theory in 1923 Einstein specifically called his earlier work "the special theory of relativity", saying he meant a restriction to frames in uniform motion.[17]: 111  Just as Galilean relativity is accepted as an approximation of special relativity that is valid for low speeds, special relativity is considered an approximation of general relativity that is valid for weak gravitational fields, that is, at a sufficiently small scale (e.g., when tidal forces are negligible) and in conditions of free fall. But general relativity incorporates non-Euclidean geometry to represent gravitational effects as the geometric curvature of spacetime. Special relativity is restricted to the flat spacetime known as Minkowski space. As long as the universe can be modeled as a pseudo-Riemannian manifold, a Lorentz-invariant frame that abides by special relativity can be defined for a sufficiently small neighborhood of each point in this curved spacetime. + +Terminology +Special relativity builds upon important physics ideas. Among the most basic of these are the following: + +speed or velocity, how fast an object moves relative to a reference point.[3]: 25  +speed of light, the maximum speed of information, independent of the speed of the source and receiver,[10]: 39  +clock, a device to measure differences in time; in relativity every object is imagined to have its own proper clock[10]: 3  and moving clocks run slower.[3]: 180  +event: something that happens at a definite place and time. 
For example, an explosion or a flash of light from an atom;[10]: 10  a generalization of a point in geometrical space,[3]: 43  +Two observers in relative motion receive information about two events via light signals traveling at constant speed, independent of either observer's speed. Their motion during the transit time causes them to get the information at different times on their local clock. + +The more technical background ideas include: + +spacetime: geometrical space and time considered together.[10]: 18  +spacetime interval between two events: a measure of separation between events that incorporates both the spatial distance between them and the duration of time separating them:[10]: 9  +( +interval +) +2 += +[ +event separation in time +] +2 +− +[ +event separation in space +] +2 +{\displaystyle ({\text{interval}})^{2}=\left[{\text{event separation in time}}\right]^{2}-\left[{\text{event separation in space}}\right]^{2}} + +coordinate system or reference frame: a way to locate events in spacetime. Events have coordinates x, y, z for space and t for time. The coordinates of the event are different in a different reference frame.[18]: 67  +inertial reference frame: a region of a reference frame where objects at rest with respect to the frame stay at rest, or if in uniform motion, stay in motion; also called a free-float frame.[10]: 31  +prime system, frame, or coordinate. To emphasize the relationship between two systems of coordinates, both use the same x,y,z axes but one will be marked with a prime (') symbol. +coordinate transformation: changing how an event is described from one reference frame to another.[18]: 67  +invariance: when physical laws or quantities do not change in different inertial frames.
The speed of light is invariant in special relativity: it is always the same.[18]: 67  +Traditional "two postulates" approach to special relativity +"Reflections of this type made it clear to me as long ago as shortly after 1900, i.e., shortly after Planck's trailblazing work, that neither mechanics nor electrodynamics could (except in limiting cases) claim exact validity. Gradually I despaired of the possibility of discovering the true laws by means of constructive efforts based on known facts. The longer and the more desperately I tried, the more I came to the conviction that only the discovery of a universal formal principle could lead us to assured results ... How, then, could such a universal principle be found?" + +Albert Einstein: Autobiographical Notes[p 2] +Main article: Postulates of special relativity +Einstein discerned two fundamental propositions that seemed to be the most assured, regardless of the exact validity of the (then) known laws of either mechanics or electrodynamics. These propositions were the constancy of the speed of light in vacuum and the independence of physical laws (especially the constancy of the speed of light) from the choice of inertial system. In his initial presentation of special relativity in 1905 he expressed these postulates as:[p 1][19] + +The principle of relativity – the laws by which the states of physical systems undergo change are not affected, whether these changes of state be referred to the one or the other of two systems in uniform translatory motion relative to each other.[p 1] +The principle of invariant light speed – "... light is always propagated in empty space with a definite velocity [speed] c which is independent of the state of motion of the emitting body" (from the preface).[p 1] That is, light in vacuum propagates with the speed c (a fixed constant, independent of direction) in at least one system of inertial coordinates (the "stationary system"), regardless of the state of motion of the light source. 
+The constancy of the speed of light was motivated by Maxwell's theory of electromagnetism[20] and the lack of evidence for the luminiferous ether.[21] There is conflicting evidence on the extent to which Einstein was influenced by the null result of the Michelson–Morley experiment.[22][23] In any case, the null result of the Michelson–Morley experiment helped the notion of the constancy of the speed of light gain widespread and rapid acceptance. + +The derivation of special relativity depends not only on these two explicit postulates, but also on several tacit assumptions, including the isotropy and homogeneity of space and the independence of measuring rods and clocks from their past history.[p 3] + +Principle of relativity +Main article: Principle of relativity +Reference frames and relative motion + +Figure 2–1. The primed system is in motion relative to the unprimed system with constant velocity v only along the x-axis, from the perspective of an observer stationary in the unprimed system. By the principle of relativity, an observer stationary in the primed system will view a likewise construction except that the velocity they record will be −v. The changing of the speed of propagation of interaction from infinite in non-relativistic mechanics to a finite value will require a modification of the transformation equations mapping events in one frame to another. +Reference frames play a crucial role in relativity theory. The term reference frame as used here is an observational perspective in space that is not undergoing any change in motion (acceleration), from which a position can be measured along 3 spatial axes (so, at rest or constant velocity). In addition, a reference frame has the ability to determine measurements of the time of events using a "clock" (any reference device with uniform periodicity). + +An event is an occurrence that can be assigned a single unique moment and location in space relative to a reference frame: it is a "point" in spacetime. 
Since the speed of light is constant in relativity irrespective of the reference frame, pulses of light can be used to unambiguously measure distances and refer back to the times that events occurred to the clock, even though light takes time to reach the clock after the event has transpired. + +For example, the explosion of a firecracker may be considered to be an "event". We can completely specify an event by its four spacetime coordinates: The time of occurrence and its 3-dimensional spatial location define a reference point. Let's call this reference frame S. + +In relativity theory, we often want to calculate the coordinates of an event from differing reference frames. The equations that relate measurements made in different frames are called transformation equations. + +Standard configuration +To gain insight into how the spacetime coordinates measured by observers in different reference frames compare with each other, it is useful to work with a simplified setup with frames in a standard configuration.[24]: 107  With care, this allows simplification of the math with no loss of generality in the conclusions that are reached. In Fig. 2-1, two Galilean reference frames (i.e., conventional 3-space frames) are displayed in relative motion. Frame S belongs to a first observer O, and frame S′ (pronounced "S prime" or "S dash") belongs to a second observer O′. + +The x, y, z axes of frame S are oriented parallel to the respective primed axes of frame S′. +Frame S′ moves, for simplicity, in a single direction: the x-direction of frame S with a constant velocity v as measured in frame S. +The origins of frames S and S′ are coincident when time t = 0 for frame S and t′ = 0 for frame S′. +Since there is no absolute reference frame in relativity theory, a concept of "moving" does not strictly exist, as everything may be moving with respect to some other reference frame. Instead, any two frames that move at the same speed in the same direction are said to be comoving. 
Therefore, S and S′ are not comoving. + +Lack of an absolute reference frame +The principle of relativity, which states that physical laws have the same form in each inertial reference frame, dates back to Galileo, and was incorporated into Newtonian physics. But in the late 19th century the existence of electromagnetic waves led some physicists to suggest that the universe was filled with a substance they called "aether", which, they postulated, would act as the medium through which these waves, or vibrations, propagated (in many respects similar to the way sound propagates through air). The aether was thought to be an absolute reference frame against which all speeds could be measured, and could be considered fixed and motionless relative to Earth or some other fixed reference point. The aether was supposed to be sufficiently elastic to support electromagnetic waves, while those waves could interact with matter, yet offering no resistance to bodies passing through it (its one property was that it allowed electromagnetic waves to propagate). The results of various experiments, including the Michelson–Morley experiment in 1887 (subsequently verified with more accurate and innovative experiments), led to the theory of special relativity, by showing that the aether did not exist.[25] Einstein's solution was to discard the notion of an aether and the absolute state of rest. In relativity, any reference frame moving with uniform motion will observe the same laws of physics. In particular, the speed of light in vacuum is always measured to be c, even when measured by multiple systems that are moving at different (but constant) velocities. 
+ +Relativity without the second postulate +From the principle of relativity alone without assuming the constancy of the speed of light (i.e., using the isotropy of space and the symmetry implied by the principle of special relativity) it can be shown that the spacetime transformations between inertial frames are either Euclidean, Galilean, or Lorentzian. In the Lorentzian case, one can then obtain relativistic interval conservation and a certain finite limiting speed. Experiments suggest that this speed is the speed of light in vacuum.[p 4][26]: 511  + +Lorentz transformation +Main article: Lorentz transformation +Two- vs one- postulate approaches +Main article: Derivations of the Lorentz transformations +Einstein combined the two postulates – of relativity – and of the invariance of the speed of light, into a single postulate, the Lorentz transformation: + +The insight fundamental for the special theory of relativity is this: The assumptions relativity and light speed invariance are compatible if relations of a new type ("Lorentz transformation") are postulated for the conversion of coordinates and times of events ... The universal principle of the special theory of relativity is contained in the postulate: The laws of physics are invariant with respect to Lorentz transformations (for the transition from one inertial system to any other arbitrarily chosen inertial system). This is a restricting principle for natural laws ...[p 2] + +Following Einstein's original presentation of special relativity in 1905, many different sets of postulates have been proposed in various alternative derivations,[27] but Einstein stuck to his approach throughout work.[p 5] + +Henri Poincaré provided the mathematical framework for relativity theory by proving that Lorentz transformations are a subset of his Poincaré group of symmetry transformations. Einstein later derived these transformations from his axioms. 
+ +While the traditional two-postulate approach to special relativity is presented in innumerable college textbooks and popular presentations,[28] other treatments of special relativity base it on the single postulate of universal Lorentz covariance, or, equivalently, on the single postulate of Minkowski spacetime.[p 6][p 7] Textbooks starting with the single postulate of Minkowski spacetime include those by Taylor and Wheeler[10] and by Callahan.[29] + +Lorentz transformation and its inverse +Define an event to have spacetime coordinates (t, x, y, z) in system S and (t′, x′, y′, z′) in a reference frame S′ moving at a velocity v along the x-axis. Then the Lorentz transformation specifies that these coordinates are related in the following way: +t +′ += +γ + +( +t +− +v +x +/ +c +2 +) +x +′ += +γ + +( +x +− +v +t +) +y +′ += +y +z +′ += +z +, +{\displaystyle {\begin{aligned}t'&=\gamma \ (t-vx/c^{2})\\x'&=\gamma \ (x-vt)\\y'&=y\\z'&=z,\end{aligned}}}where +γ += +1 +1 +− +v +2 +/ +c +2 +{\displaystyle \gamma ={\frac {1}{\sqrt {1-v^{2}/c^{2}}}}}is the Lorentz factor and c is the speed of light in vacuum, and the velocity v of S′, relative to S, is parallel to the x-axis. For simplicity, the y and z coordinates are unaffected; only the x and t coordinates are transformed. These Lorentz transformations form a one-parameter group of linear mappings, that parameter being called rapidity. + +Solving the four transformation equations above for the unprimed coordinates yields the inverse Lorentz transformation: +t += +γ +( +t +′ ++ +v +x +′ +/ +c +2 +) +x += +γ +( +x +′ ++ +v +t +′ +) +y += +y +′ +z += +z +′ +. +{\displaystyle {\begin{aligned}t&=\gamma (t'+vx'/c^{2})\\x&=\gamma (x'+vt')\\y&=y'\\z&=z'.\end{aligned}}} + +This shows that the unprimed frame is moving with the velocity −v, as measured in the primed frame.[30] + +There is nothing special about the x-axis. 
The transformation can apply to the y- or z-axis, or indeed in any direction parallel to the motion (which are warped by the γ factor) and perpendicular; see the article Lorentz transformation for details. + +A quantity that is invariant under Lorentz transformations is known as a Lorentz scalar. + +Writing the Lorentz transformation and its inverse in terms of coordinate differences, where one event has coordinates (x1, t1) and (x′1, t′1), another event has coordinates (x2, t2) and (x′2, t′2), and the differences are defined as + +Eq. 1: +Δ +x +′ += +x +2 +′ +− +x +1 +′ + +, + +Δ +t +′ += +t +2 +′ +− +t +1 +′ + +. +{\displaystyle \Delta x'=x'_{2}-x'_{1}\ ,\ \Delta t'=t'_{2}-t'_{1}\ .} +Eq. 2: +Δ +x += +x +2 +− +x +1 + +, + + +Δ +t += +t +2 +− +t +1 + +. +{\displaystyle \Delta x=x_{2}-x_{1}\ ,\ \ \Delta t=t_{2}-t_{1}\ .} +we get + +Eq. 3: +Δ +x +′ += +γ + +( +Δ +x +− +v +Δ +t +) + +, + + +{\displaystyle \Delta x'=\gamma \ (\Delta x-v\,\Delta t)\ ,\ \ } +Δ +t +′ += +γ + +( +Δ +t +− +v + +Δ +x +/ +c +2 +) + +. +{\displaystyle \Delta t'=\gamma \ \left(\Delta t-v\ \Delta x/c^{2}\right)\ .} +Eq. 4: +Δ +x += +γ + +( +Δ +x +′ ++ +v +Δ +t +′ +) + +, + +{\displaystyle \Delta x=\gamma \ (\Delta x'+v\,\Delta t')\ ,\ } +Δ +t += +γ + +( +Δ +t +′ ++ +v + +Δ +x +′ +/ +c +2 +) + +. +{\displaystyle \Delta t=\gamma \ \left(\Delta t'+v\ \Delta x'/c^{2}\right)\ .} +If we take differentials instead of taking differences, we get + +Eq. 5: +d +x +′ += +γ + +( +d +x +− +v +d +t +) + +, + + +{\displaystyle dx'=\gamma \ (dx-v\,dt)\ ,\ \ } +d +t +′ += +γ + +( +d +t +− +v + +d +x +/ +c +2 +) + +. +{\displaystyle dt'=\gamma \ \left(dt-v\ dx/c^{2}\right)\ .} +Eq. 6: +d +x += +γ + +( +d +x +′ ++ +v +d +t +′ +) + +, + +{\displaystyle dx=\gamma \ (dx'+v\,dt')\ ,\ } +d +t += +γ + +( +d +t +′ ++ +v + +d +x +′ +/ +c +2 +) + +. +{\displaystyle dt=\gamma \ \left(dt'+v\ dx'/c^{2}\right)\ .} +Graphical representation of the Lorentz transformation + + + + +Figure 3-1. 
Drawing a Minkowski spacetime diagram to illustrate a Lorentz transformation. +Spacetime diagrams (also called Minkowski diagrams) are an extremely useful aid to visualizing how coordinates transform between different reference frames. Although it is not as easy to perform exact computations using them as directly invoking the Lorentz transformations, their main power is their ability to provide an intuitive grasp of the results of a relativistic scenario.[26]: 536  To draw a spacetime diagram, begin by considering two Galilean reference frames, S and S′, in standard configuration, as shown in Fig. 2-1.[31]: 155–199  + +Fig. 3-1a. Draw the +x +{\displaystyle x} and +t +{\displaystyle t} axes of frame S. The +x +{\displaystyle x} axis is horizontal and the +c +t +{\displaystyle ct} (time written in units of space) axis is vertical, which is the opposite of the usual convention in kinematics. The +c +t +{\displaystyle ct} axis is scaled by a factor of +c +{\displaystyle c} so that both axes have common units of length. In the diagram shown, the gridlines are spaced one unit distance apart. The 45° diagonal lines represent the worldlines of two photons passing through the origin at time +t += +0. +{\displaystyle t=0.} The slope of these worldlines is 1 because the photons advance one unit in space per unit of time. Two events, +A +{\displaystyle {\text{A}}} and +B +, +{\displaystyle {\text{B}},} have been plotted on this graph so that their coordinates may be compared in the S and S' frames. + +Fig. 3-1b. Draw the +x +′ +{\displaystyle x'} and +c +t +′ +{\displaystyle ct'} axes of frame S'. The +c +t +′ +{\displaystyle ct'} axis represents the worldline of the origin of the S' coordinate system as measured in frame S. In this figure, +v += +c +/ +2. 
+{\displaystyle v=c/2.} Both the +c +t +′ +{\displaystyle ct'} and +x +′ +{\displaystyle x'} axes are tilted from the unprimed axes by an angle +α += +tan +− +1 +⁡ +( +β +) +, +{\displaystyle \alpha =\tan ^{-1}(\beta ),} where +β += +v +/ +c +. +{\displaystyle \beta =v/c.} The primed and unprimed axes share a common origin because frames S and S' had been set up in standard configuration, so that +t += +0 +{\displaystyle t=0} when +t +′ += +0. +{\displaystyle t'=0.} + +Fig. 3-1c. Units in the primed axes have a different scale from units in the unprimed axes. From the Lorentz transformations, it can be observed that +( +x +′ +, +c +t +′ +) +{\displaystyle (x',ct')} coordinates of +( +0 +, +1 +) +{\displaystyle (0,1)} in the primed coordinate system transform to +( +β +γ +, +γ +) +{\displaystyle (\beta \gamma ,\gamma )} in the unprimed coordinate system. Likewise, +( +x +′ +, +c +t +′ +) +{\displaystyle (x',ct')} coordinates of +( +1 +, +0 +) +{\displaystyle (1,0)} in the primed coordinate system transform to +( +γ +, +β +γ +) +{\displaystyle (\gamma ,\beta \gamma )} in the unprimed system. Draw gridlines parallel with the +c +t +′ +{\displaystyle ct'} axis through points +( +k +γ +, +k +β +γ +) +{\displaystyle (k\gamma ,k\beta \gamma )} as measured in the unprimed frame, where +k +{\displaystyle k} is an integer. Likewise, draw gridlines parallel with the +x +′ +{\displaystyle x'} axis through +( +k +β +γ +, +k +γ +) +{\displaystyle (k\beta \gamma ,k\gamma )} as measured in the unprimed frame. Using the Pythagorean theorem, we observe that the spacing between +c +t +′ +{\displaystyle ct'} units equals +( +1 ++ +β +2 +) +/ +( +1 +− +β +2 +) +{\textstyle {\sqrt {(1+\beta ^{2})/(1-\beta ^{2})}}} times the spacing between +c +t +{\displaystyle ct} units, as measured in frame S. This ratio is always greater than 1, and approaches infinity as +β +→ +1. +{\displaystyle \beta \to 1.} + +Fig. 3-1d. 
Since the speed of light is an invariant, the worldlines of two photons passing through the origin at time +t +′ += +0 +{\displaystyle t'=0} still plot as 45° diagonal lines. The primed coordinates of +A +{\displaystyle {\text{A}}} and +B +{\displaystyle {\text{B}}} are related to the unprimed coordinates through the Lorentz transformations and could be approximately measured from the graph (assuming that it has been plotted accurately enough), but the real merit of a Minkowski diagram is its granting us a geometric view of the scenario. For example, in this figure, we observe that the two timelike-separated events that had different x-coordinates in the unprimed frame are now at the same position in space. + +While the unprimed frame is drawn with space and time axes that meet at right angles, the primed frame is drawn with axes that meet at acute or obtuse angles. This asymmetry is due to unavoidable distortions in how spacetime coordinates map onto a Cartesian plane. The frames are equivalent. + +Consequences derived from the Lorentz transformation +See also: Twin paradox and Relativistic mechanics +The consequences of special relativity can be derived from the Lorentz transformation equations.[32] These transformations, and hence special relativity, lead to different physical predictions than those of Newtonian mechanics at all relative velocities, and most pronounced when relative velocities become comparable to the speed of light. The speed of light is so much larger than anything most humans encounter that some of the effects predicted by relativity are initially counterintuitive. + +Invariant interval +In Galilean relativity, the spatial separation, (⁠ +Δ +r +{\displaystyle \Delta r}⁠), and the temporal separation, (⁠ +Δ +t +{\displaystyle \Delta t}⁠), between two events are independent invariants, the values of which do not change when observed from different frames of reference. 
In special relativity, however, the interweaving of spatial and temporal coordinates generates the concept of an invariant interval, denoted as ⁠ +Δ +s +2 +{\displaystyle \Delta s^{2}}⁠: +Δ +s +2 += +def +c +2 +Δ +t +2 +− +( +Δ +x +2 ++ +Δ +y +2 ++ +Δ +z +2 +) +{\displaystyle \Delta s^{2}\;{\overset {\text{def}}{=}}\;c^{2}\Delta t^{2}-(\Delta x^{2}+\Delta y^{2}+\Delta z^{2})}In considering the physical significance of ⁠ +Δ +s +2 +{\displaystyle \Delta s^{2}}⁠, there are three cases:[26]: 533 [10]: 25–39  + +Δs2 > 0: In this case, the two events are separated by more time than space, and they are hence said to be timelike separated. This implies that ⁠ +| +Δ +x +/ +Δ +t +| +< +c +{\displaystyle \vert \Delta x/\Delta t\vert <c}⁠, and given the Lorentz transformation ⁠ +Δ +x +′ += +γ + +( +Δ +x +− +v +Δ +t +) +{\displaystyle \Delta x'=\gamma \ (\Delta x-v\Delta t)}⁠, there exists a +v +{\displaystyle v} less than +c +{\displaystyle c} for which +Δ +x +′ += +0 +{\displaystyle \Delta x'=0} (in particular, ⁠ +v += +Δ +x +/ +Δ +t +{\displaystyle v=\Delta x/\Delta t}⁠). In other words, given two events that are timelike separated, it is possible to find a frame in which the two events happen at the same place. In this frame, the separation in time, ⁠ +Δ +s +2 +/ +c +{\displaystyle \textstyle {\sqrt {\Delta s^{2}}}/c}⁠, is called the proper time. + +Δs2 < 0: In this case, the two events are separated by more space than time, and they are hence said to be spacelike separated. This implies that ⁠ +| +Δ +x +/ +Δ +t +| +> +c +{\displaystyle \vert \Delta x/\Delta t\vert >c}⁠, and given the Lorentz transformation ⁠ +Δ +t +′ += +γ + +( +Δ +t +− +v +Δ +x +/ +c +2 +) +{\displaystyle \Delta t'=\gamma \ (\Delta t-v\Delta x/c^{2})}⁠, there exists a +v +{\displaystyle v} less than +c +{\displaystyle c} for which +Δ +t +′ += +0 +{\displaystyle \Delta t'=0} (in particular, ⁠ +v += +c +2 +Δ +t +/ +Δ +x +{\displaystyle v=c^{2}\Delta t/\Delta x}⁠). In other words, given two events that are spacelike separated, it is possible to find a frame in which the two events happen at the same time. In this frame, the separation in space, ⁠ +− +Δ +s +2 +{\displaystyle \textstyle {\sqrt {-\Delta s^{2}}}}⁠, is called the proper distance, or proper length. For values of +v +{\displaystyle v} greater than and less than ⁠ +c +2 +Δ +t +/ +Δ +x +{\displaystyle c^{2}\Delta t/\Delta x}⁠, the sign of +Δ +t +′ +{\displaystyle \Delta t'} changes, meaning that the temporal order of spacelike-separated events changes depending on the frame in which the events are viewed.
But the temporal order of timelike-separated events is absolute, since the only way that +v +{\displaystyle v} could be greater than +c +2 +Δ +t +/ +Δ +x +{\displaystyle c^{2}\Delta t/\Delta x} would be if ⁠ +v +> +c +{\displaystyle v>c}⁠. +Δs2 = 0: In this case, the two events are said to be lightlike separated. This implies that ⁠ +| +Δ +x +/ +Δ +t +| += +c +{\displaystyle \vert \Delta x/\Delta t\vert =c}⁠, and this relationship is frame independent due to the invariance of ⁠ +s +2 +{\displaystyle s^{2}}⁠. From this, we observe that the speed of light is +c +{\displaystyle c} in every inertial frame. In other words, starting from the assumption of universal Lorentz covariance, the constant speed of light is a derived result, rather than a postulate as in the two-postulates formulation of the special theory. +The interweaving of space and time revokes the implicitly assumed concepts of absolute simultaneity and synchronization across non-comoving frames. + +The form of ⁠ +Δ +s +2 +{\displaystyle \Delta s^{2}}⁠, being the difference of the squared time lapse and the squared spatial distance, demonstrates a fundamental discrepancy between Euclidean and spacetime distances. The invariance of Δs2 under standard Lorentz transformation is analogous to the invariance of squared distances Δr2 under rotations in Euclidean space.[citation needed] Although space and time have an equal footing in relativity, the minus sign in front of the spatial terms marks space and time as being of essentially different character. They are not the same. Because it treats time differently than it treats the 3 spatial dimensions, Minkowski space differs from four-dimensional Euclidean space. The invariance of this interval is a property of the general Lorentz transform (also called the Poincaré transformation), making it an isometry of spacetime. 
The general Lorentz transform extends the standard Lorentz transform (which deals with translations without rotation, that is, Lorentz boosts, in the x-direction) with all other translations, reflections, and rotations between any Cartesian inertial frame.[33]: 33–34  + +In the analysis of simplified scenarios, such as spacetime diagrams, a reduced-dimensionality form of the invariant interval is often employed: +Δ +s +2 += +c +2 +Δ +t +2 +− +Δ +x +2 +{\displaystyle \Delta s^{2}\,=\,c^{2}\Delta t^{2}-\Delta x^{2}} + +Demonstrating that the interval is invariant is straightforward for the reduced-dimensionality case and with frames in standard configuration:[26] +c +2 +Δ +t +2 +− +Δ +x +2 += +c +2 +γ +2 +( +Δ +t +′ ++ +v +Δ +x +′ +c +2 +) +2 +− +γ +2 + +( +Δ +x +′ ++ +v +Δ +t +′ +) +2 += +γ +2 +( +c +2 +Δ +t +′ +2 ++ +2 +v +Δ +x +′ +Δ +t +′ ++ +v +2 +Δ +x +′ +2 +c +2 +) +− +γ +2 + +( +Δ +x +′ +2 ++ +2 +v +Δ +x +′ +Δ +t +′ ++ +v +2 +Δ +t +′ +2 +) += +γ +2 +c +2 +Δ +t +′ +2 +− +γ +2 +v +2 +Δ +t +′ +2 +− +γ +2 +Δ +x +′ +2 ++ +γ +2 +v +2 +Δ +x +′ +2 +c +2 += +γ +2 +c +2 +Δ +t +′ +2 +( +1 +− +v +2 +c +2 +) +− +γ +2 +Δ +x +′ +2 +( +1 +− +v +2 +c +2 +) += +c +2 +Δ +t +′ +2 +− +Δ +x +′ +2 +{\displaystyle {\begin{aligned}c^{2}\Delta t^{2}-\Delta x^{2}&=c^{2}\gamma ^{2}\left(\Delta t'+{\dfrac {v\Delta x'}{c^{2}}}\right)^{2}-\gamma ^{2}\ (\Delta x'+v\Delta t')^{2}\\&=\gamma ^{2}\left(c^{2}\Delta t'^{\,2}+2v\Delta x'\Delta t'+{\dfrac {v^{2}\Delta x'^{\,2}}{c^{2}}}\right)-\gamma ^{2}\ (\Delta x'^{\,2}+2v\Delta x'\Delta t'+v^{2}\Delta t'^{\,2})\\&=\gamma ^{2}c^{2}\Delta t'^{\,2}-\gamma ^{2}v^{2}\Delta t'^{\,2}-\gamma ^{2}\Delta x'^{\,2}+\gamma ^{2}{\dfrac {v^{2}\Delta x'^{\,2}}{c^{2}}}\\&=\gamma ^{2}c^{2}\Delta t'^{\,2}\left(1-{\dfrac {v^{2}}{c^{2}}}\right)-\gamma ^{2}\Delta x'^{\,2}\left(1-{\dfrac {v^{2}}{c^{2}}}\right)\\&=c^{2}\Delta t'^{\,2}-\Delta x'^{\,2}\end{aligned}}} + +The value of +Δ +s +2 +{\displaystyle \Delta s^{2}} is hence independent of the frame in which it is 
measured. + +Relativity of simultaneity +See also: Relativity of simultaneity and Ladder paradox + +Figure 4–1. The three events (A, B, C) are simultaneous in the reference frame of some observer O. In a reference frame moving at v = 0.3c, as measured by O, the events occur in the order C, B, A. In a reference frame moving at v = −0.5c with respect to O, the events occur in the order A, B, C. The white lines, the lines of simultaneity, move from the past to the future in the respective frames (green coordinate axes), highlighting events residing on them. They are the locus of all events occurring at the same time in the respective frame. The gray area is the light cone with respect to the origin of all considered frames. +Consider two events happening in two different locations that occur simultaneously in the reference frame of one inertial observer. They may occur non-simultaneously in the reference frame of another inertial observer (lack of absolute simultaneity). + +From Equation 3 (the forward Lorentz transformation in terms of coordinate differences) +Δ +t +′ += +γ +( +Δ +t +− +v +Δ +x +c +2 +) +{\displaystyle \Delta t'=\gamma \left(\Delta t-{\frac {v\,\Delta x}{c^{2}}}\right)} + +It is clear that the two events that are simultaneous in frame S (satisfying Δt = 0), are not necessarily simultaneous in another inertial frame S′ (satisfying Δt′ = 0). Only if these events are additionally co-local in frame S (satisfying Δx = 0), will they be simultaneous in another frame S′. 
+ +The Sagnac effect can be considered a manifestation of the relativity of simultaneity for local inertial frames comoving with a rotating Earth.[34] Instruments based on the Sagnac effect for their operation, such as ring laser gyroscopes and fiber optic gyroscopes, are capable of extreme levels of sensitivity.[p 8] + +Time dilation +See also: Time dilation +The time lapse between two events is not invariant from one observer to another, but is dependent on the relative speeds of the observers' reference frames. + +Suppose a clock is at rest in the unprimed system S. The location of the clock on two different ticks is then characterized by Δx = 0. To find the relation between the times between these ticks as measured in both systems, Equation 3 can be used to find: + +Δ +t +′ += +γ +Δ +t +{\displaystyle \Delta t'=\gamma \,\Delta t} for events satisfying +Δ +x += +0 + +. +{\displaystyle \Delta x=0\ .} +This shows that the time (Δt′) between the two ticks as seen in the frame in which the clock is moving (S′), is longer than the time (Δt) between these ticks as measured in the rest frame of the clock (S). Time dilation explains a number of physical phenomena; for example, the lifetime of high speed muons created by the collision of cosmic rays with particles in the Earth's outer atmosphere and moving towards the surface is greater than the lifetime of slowly moving muons, created and decaying in a laboratory.[35] + + +Figure 4–2. Hypothetical infinite array of synchronized clocks associated with an observer's reference frame +Whenever one hears a statement to the effect that "moving clocks run slow", one should envision an inertial reference frame thickly populated with identical, synchronized clocks. 
As a moving clock travels through this array, its reading at any particular point is compared with a stationary clock at the same point.[36]: 149–152  + +The measurements obtained from direct observation of a moving clock would be delayed by the finite speed of light, i.e. the times seen would be distorted by the Doppler effect. Measurements of relativistic effects must always be understood as having been made after finite speed-of-light effects have been factored out.[36]: 149–152  + +Langevin's light-clock + + +Figure 4–3. Thought experiment using a light-clock to explain time dilation +Paul Langevin, an early proponent of the theory of relativity, did much to popularize the theory in the face of resistance by many physicists to Einstein's revolutionary concepts. Among his numerous contributions to the foundations of special relativity were independent work on the mass–energy relationship, a thorough examination of the twin paradox, and investigations into rotating coordinate systems. His name is frequently attached to a hypothetical construct called a "light-clock" (originally developed by Lewis and Tolman in 1909[37]), which he used to perform a novel derivation of the Lorentz transformation.[38] + +A light-clock is imagined to be a box of perfectly reflecting walls wherein a light signal reflects back and forth from opposite faces. The concept of time dilation is frequently taught using a light-clock that is traveling in uniform inertial motion perpendicular to a line connecting the two mirrors.[39][40][41][42] (Langevin himself made use of a light-clock oriented parallel to its line of motion.[38]) + +Consider the scenario illustrated in Fig. 4-3A. Observer A holds a light-clock of length +L +{\displaystyle L} as well as an electronic timer with which she measures how long it takes a pulse to make a round trip up and down along the light-clock. 
Although observer A is traveling rapidly along a train, from her point of view the emission and receipt of the pulse occur at the same place, and she measures the interval using a single clock located at the precise position of these two events. For the interval between these two events, observer A finds ⁠ +t +A += +2 +L +/ +c +{\displaystyle t_{\text{A}}=2L/c}⁠. A time interval measured using a single clock that is motionless in a particular reference frame is called a proper time interval.[43] + +Fig. 4-3B illustrates these same two events from the standpoint of observer B, who is parked by the tracks as the train goes by at a speed of ⁠ +v +{\displaystyle v}⁠. Instead of making straight up-and-down motions, observer B sees the pulses moving along a zig-zag line. However, because of the postulate of the constancy of the speed of light, the speed of the pulses along these diagonal lines is the same +c +{\displaystyle c} that observer A saw for her up-and-down pulses. B measures the speed of the vertical component of these pulses as +± +c +2 +− +v +2 +, +{\textstyle \pm {\sqrt {c^{2}-v^{2}}},} so that the total round-trip time of the pulses is +t +B += +2 +L +/ +c +2 +− +v +2 += +{\textstyle t_{\text{B}}=2L{\big /}{\sqrt {c^{2}-v^{2}}}={}}⁠ +t +A +/ +1 +− +v +2 +/ +c +2 +{\displaystyle \textstyle t_{\text{A}}{\big /}{\sqrt {1-v^{2}/c^{2}}}}⁠. Note that for observer B, the emission and receipt of the light pulse occurred at different places, and he measured the interval using two stationary and synchronized clocks located at two different positions in his reference frame. The interval that B measured was therefore not a proper time interval because he did not measure it with a single resting clock.[43] + +Reciprocal time dilation +In the above description of the Langevin light-clock, the labeling of one observer as stationary and the other as in motion was completely arbitrary. 
One could just as well have observer B carrying the light-clock and moving at a speed of +v +{\displaystyle v} to the left, in which case observer A would perceive B's clock as running slower than her local clock. + +There is no paradox here, because there is no independent observer C who will agree with both A and B. Observer C necessarily makes his measurements from his own reference frame. If that reference frame coincides with A's reference frame, then C will agree with A's measurement of time. If C's reference frame coincides with B's reference frame, then C will agree with B's measurement of time. If C's reference frame coincides with neither A's frame nor B's frame, then C's measurement of time will disagree with both A's and B's measurement of time.[44] + +Twin paradox +See also: Twin paradox +The reciprocity of time dilation between two observers in separate inertial frames leads to the so-called twin paradox, articulated in its present form by Langevin in 1911.[45] Langevin imagined an adventurer wishing to explore the future of the Earth. This traveler boards a projectile capable of traveling at 99.995% of the speed of light. After making a round-trip journey to and from a nearby star lasting only two years of his own life, he returns to an Earth that is two hundred years older. + +This result appears puzzling because both the traveler and an Earthbound observer would see the other as moving, and so, because of the reciprocity of time dilation, one might initially expect that each should have found the other to have aged less. In reality, there is no paradox at all, because in order for the two observers to perform side-by-side comparisons of their elapsed proper times, the symmetry of the situation must be broken: At least one of the two observers must change their state of motion to match that of the other.[46] + + +Figure 4-4. 
Doppler analysis of twin paradox +Knowing the general resolution of the paradox, however, does not immediately yield the ability to calculate correct quantitative results. Many solutions to this puzzle have been provided in the literature and have been reviewed in the Twin paradox article. We will examine in the following one such solution to the paradox. + +Our basic aim will be to demonstrate that, after the trip, both twins are in perfect agreement about who aged by how much, regardless of their different experiences. Fig 4-4 illustrates a scenario where the traveling twin flies at 0.6 c to and from a star 3 ly distant. During the trip, each twin sends yearly time signals (measured in their own proper times) to the other. After the trip, the cumulative counts are compared. On the outward phase of the trip, each twin receives the other's signals at the lowered rate of ⁠ +f +′ += +f +( +1 +− +β +) +/ +( +1 ++ +β +) +{\displaystyle \textstyle f'=f{\sqrt {(1-\beta )/(1+\beta )}}}⁠. Initially, the situation is perfectly symmetric: note that each twin receives the other's one-year signal at two years measured on their own clock. The symmetry is broken when the traveling twin turns around at the four-year mark as measured by her clock. During the remaining four years of her trip, she receives signals at the enhanced rate of ⁠ +f +″ += +f +( +1 ++ +β +) +/ +( +1 +− +β +) +{\displaystyle \textstyle f''=f{\sqrt {(1+\beta )/(1-\beta )}}}⁠. The situation is quite different with the stationary twin. Because of light-speed delay, he does not see his sister turn around until eight years have passed on his own clock. Thus, he receives enhanced-rate signals from his sister for only a relatively brief period. Although the twins disagree in their respective measures of total time, we see in the following table, as well as by simple observation of the Minkowski diagram, that each twin is in total agreement with the other as to the total number of signals sent from one to the other. 
There is hence no paradox.[36]: 152–159  + +Item Measured by the +stay-at-home Fig 4-4 Measured by +the traveler Fig 4-4 +Total time of trip +T += +2 +L +v +{\displaystyle T={\frac {2L}{v}}} 10 yr +T +′ += +2 +L +γ +v +{\displaystyle T'={\frac {2L}{\gamma v}}} 8 yr +Total number of pulses sent +f +T += +2 +f +L +v +{\displaystyle fT={\frac {2fL}{v}}} 10 +f +T +′ += +2 +f +L +γ +v +{\displaystyle fT'={\frac {2fL}{\gamma v}}} 8 +Time when traveler's turnaround is detected +t +1 += +L +v ++ +L +c +{\displaystyle t_{1}={\frac {L}{v}}+{\frac {L}{c}}} 8 yr +t +1 +′ += +L +γ +v +{\displaystyle t_{1}'={\frac {L}{\gamma v}}} 4 yr +Number of pulses received at initial +f +′ +{\displaystyle f'} rate +f +′ +t +1 +{\displaystyle f't_{1}} += +f +L +v +( +1 ++ +β +) +( +1 +− +β +1 ++ +β +) +1 +/ +2 +{\displaystyle ={\frac {fL}{v}}(1+\beta )\left({\frac {1-\beta }{1+\beta }}\right)^{1/2}} += +f +L +v +( +1 +− +β +2 +) +1 +/ +2 +{\displaystyle ={\frac {fL}{v}}(1-\beta ^{2})^{1/2}} 4 +f +′ +t +1 +′ +{\displaystyle f't_{1}'} += +f +L +v +( +1 +− +β +2 +) +1 +/ +2 +( +1 +− +β +1 ++ +β +) +1 +/ +2 +{\displaystyle ={\frac {fL}{v}}(1-\beta ^{2})^{1/2}\left({\frac {1-\beta }{1+\beta }}\right)^{1/2}} += +f +L +v +( +1 +− +β +) +{\displaystyle ={\frac {fL}{v}}(1-\beta )} 2 +Time for remainder of trip +t +2 += +L +v +− +L +c +{\displaystyle t_{2}={\frac {L}{v}}-{\frac {L}{c}}} 2 yr +t +2 +′ += +L +γ +v +{\displaystyle t_{2}'={\frac {L}{\gamma v}}} 4 yr +Number of signals received at final +f +″ +{\displaystyle f''} rate +f +″ +t +2 +{\displaystyle f''t_{2}} += +f +L +v +( +1 +− +β +) +( +1 ++ +β +1 +− +β +) +1 +/ +2 +{\displaystyle ={\frac {fL}{v}}(1-\beta )\left({\frac {1+\beta }{1-\beta }}\right)^{1/2}} += +f +L +v +( +1 +− +β +2 +) +1 +/ +2 +{\displaystyle ={\frac {fL}{v}}(1-\beta ^{2})^{1/2}} 4 +f +″ +t +2 +′ +{\displaystyle f''t_{2}'} += +f +L +v +( +1 +− +β +2 +) +1 +/ +2 +( +1 ++ +β +1 +− +β +) +1 +/ +2 +{\displaystyle ={\frac {fL}{v}}(1-\beta ^{2})^{1/2}\left({\frac {1+\beta 
}{1-\beta }}\right)^{1/2}} += +f +L +v +( +1 ++ +β +) +{\displaystyle ={\frac {fL}{v}}(1+\beta )} 8 +Total number of received pulses +2 +f +L +v +( +1 +− +β +2 +) +1 +/ +2 +{\displaystyle {\frac {2fL}{v}}(1-\beta ^{2})^{1/2}} += +2 +f +L +γ +v +{\displaystyle ={\frac {2fL}{\gamma v}}} 8 +2 +f +L +v +{\displaystyle {\frac {2fL}{v}}} 10 +Twin's calculation as to how much the other twin should have aged +T +′ += +2 +L +γ +v +{\displaystyle T'={\frac {2L}{\gamma v}}} 8 yr +T += +2 +L +v +{\displaystyle T={\frac {2L}{v}}} 10 yr +Length contraction +See also: Lorentz contraction +The dimensions (e.g., length) of an object as measured by one observer may be smaller than the results of measurements of the same object made by another observer (e.g., the ladder paradox involves a long ladder traveling near the speed of light and being contained within a smaller garage). + +Similarly, suppose a measuring rod is at rest and aligned along the x-axis in the unprimed system S. In this system, the length of this rod is written as Δx. To measure the length of this rod in the system S′, in which the rod is moving, the distances x′ to the end points of the rod must be measured simultaneously in that system S′. In other words, the measurement is characterized by Δt′ = 0, which can be combined with Equation 4 to find the relation between the lengths Δx and Δx′: + +Δ +x +′ += +Δ +x +γ +{\displaystyle \Delta x'={\frac {\Delta x}{\gamma }}} for events satisfying +Δ +t +′ += +0 + +. +{\displaystyle \Delta t'=0\ .} +This shows that the length (Δx′) of the rod as measured in the frame in which it is moving (S′), is shorter than its length (Δx) in its own rest frame (S). + +Time dilation and length contraction are not merely appearances. Time dilation is explicitly related to our way of measuring time intervals between events that occur at the same place in a given coordinate system (called "co-local" events). 
These time intervals are different in another coordinate system moving with respect to the first, unless the events, in addition to being co-local, are also simultaneous. Similarly, length contraction relates to our measured distances between separated but simultaneous events in a given coordinate system of choice. If these events are not co-local, but are separated by distance (space), they will not occur at the same spatial distance from each other when seen from another moving coordinate system. + +Lorentz transformation of velocities +See also: Velocity-addition formula +Consider two frames S and S′ in standard configuration. A particle in S moves in the x direction with velocity vector ⁠ +u +{\displaystyle \mathbf {u} }⁠. What is its velocity +u +′ +{\displaystyle \mathbf {u'} } in frame S′? + +We can write + +| +u +| += +u += +d +x +/ +d +t +. +{\displaystyle \mathbf {|u|} =u=dx/dt\,.} 7 +| +u +′ +| += +u +′ += +d +x +′ +/ +d +t +′ +. +{\displaystyle \mathbf {|u'|} =u'=dx'/dt'\,.} 8 +Substituting expressions for +d +x +′ +{\displaystyle dx'} and +d +t +′ +{\displaystyle dt'} from Equation 5 into Equation 8, followed by straightforward mathematical manipulations and back-substitution from Equation 7 yields the Lorentz transformation of the speed +u +{\displaystyle u} to ⁠ +u +′ +{\displaystyle u'}⁠: + +u +′ += +d +x +′ +d +t +′ += +γ +( +d +x +− +v +d +t +) +γ +( +d +t +− +v +d +x +c +2 +) += +d +x +d +t +− +v +1 +− +v +c +2 +d +x +d +t += +u +− +v +1 +− +u +v +c +2 +. +{\displaystyle u'={\frac {dx'}{dt'}}={\frac {\gamma (dx-v\,dt)}{\gamma \left(dt-{\dfrac {v\,dx}{c^{2}}}\right)}}={\frac {{\dfrac {dx}{dt}}-v}{1-{\dfrac {v}{c^{2}}}\,{\dfrac {dx}{dt}}}}={\frac {u-v}{1-{\dfrac {uv}{c^{2}}}}}.} 9 +The inverse relation is obtained by interchanging the primed and unprimed symbols and replacing +v +{\displaystyle v} with ⁠ +− +v +{\displaystyle -v}⁠. + +u += +u +′ ++ +v +1 ++ +u +′ +v +/ +c +2 +. 
+{\displaystyle u={\frac {u'+v}{1+u'v/c^{2}}}.} 10 +For +u +{\displaystyle \mathbf {u} } not aligned along the x-axis, we write:[13]: 47–49  + +u += +( +u +1 +, + +u +2 +, + +u +3 +) += +( +d +x +/ +d +t +, + +d +y +/ +d +t +, + +d +z +/ +d +t +) + +. +{\displaystyle \mathbf {u} =(u_{1},\ u_{2},\ u_{3})=(dx/dt,\ dy/dt,\ dz/dt)\ .} 11 +u +′ += +( +u +1 +′ +, + +u +2 +′ +, + +u +3 +′ +) += +( +d +x +′ +/ +d +t +′ +, + +d +y +′ +/ +d +t +′ +, + +d +z +′ +/ +d +t +′ +) + +. +{\displaystyle \mathbf {u'} =(u_{1}',\ u_{2}',\ u_{3}')=(dx'/dt',\ dy'/dt',\ dz'/dt')\ .} 12 +The forward and inverse transformations for this case are: + +u +1 +′ += +u +1 +− +v +1 +− +u +1 +v +/ +c +2 + +, +u +2 +′ += +u +2 +γ +( +1 +− +u +1 +v +/ +c +2 +) + +, +u +3 +′ += +u +3 +γ +( +1 +− +u +1 +v +/ +c +2 +) + +. +{\displaystyle u_{1}'={\frac {u_{1}-v}{1-u_{1}v/c^{2}}}\ ,\qquad u_{2}'={\frac {u_{2}}{\gamma \left(1-u_{1}v/c^{2}\right)}}\ ,\qquad u_{3}'={\frac {u_{3}}{\gamma \left(1-u_{1}v/c^{2}\right)}}\ .} 13 +u +1 += +u +1 +′ ++ +v +1 ++ +u +1 +′ +v +/ +c +2 + +, +u +2 += +u +2 +′ +γ +( +1 ++ +u +1 +′ +v +/ +c +2 +) + +, +u +3 += +u +3 +′ +γ +( +1 ++ +u +1 +′ +v +/ +c +2 +) + +. +{\displaystyle u_{1}={\frac {u_{1}'+v}{1+u_{1}'v/c^{2}}}\ ,\qquad u_{2}={\frac {u_{2}'}{\gamma \left(1+u_{1}'v/c^{2}\right)}}\ ,\qquad u_{3}={\frac {u_{3}'}{\gamma \left(1+u_{1}'v/c^{2}\right)}}\ .} 14 +Equation 10 and Equation 14 can be interpreted as giving the resultant +u +{\displaystyle \mathbf {u} } of the two velocities +v +{\displaystyle \mathbf {v} } and ⁠ +u +′ +{\displaystyle \mathbf {u'} }⁠, and they replace the formula ⁠ +u += +u +′ ++ +v +{\displaystyle \mathbf {u=u'+v} }⁠. which is valid in Galilean relativity. 
Interpreted in such a fashion, they are commonly referred to as the relativistic velocity addition (or composition) formulas, valid for the three axes of S and S′ being aligned with each other (although not necessarily in standard configuration).[13]: 47–49  + +We note the following points: + +If an object (e.g., a photon) were moving at the speed of light in one frame (i.e., u = ±c or u′ = ±c), then it would also be moving at the speed of light in any other frame, moving at |v| < c. +The resultant speed of two velocities with magnitude less than c is always a velocity with magnitude less than c. +If both |u| and |v| (and then also |u′| and |v′|) are small with respect to the speed of light (that is, e.g., |⁠ +u +/ +c +⁠| ≪ 1), then the intuitive Galilean transformations are recovered from the transformation equations for special relativity +Attaching a frame to a photon (riding a light beam like Einstein considers) requires special treatment of the transformations. +There is nothing special about the x direction in the standard configuration. The above formalism applies to any direction; and three orthogonal directions allow dealing with all directions in space by decomposing the velocity vectors to their components in these directions. See Velocity-addition formula for details. + +Thomas rotation +See also: Thomas rotation + + +Figure 4-5. Thomas–Wigner rotation +The composition of two non-collinear Lorentz boosts (i.e., two non-collinear Lorentz transformations, neither of which involve rotation) results in a Lorentz transformation that is not a pure boost but is the composition of a boost and a rotation. + +Thomas rotation results from the relativity of simultaneity. In Fig. 4-5a, a rod of length +L +{\displaystyle L} in its rest frame (i.e., having a proper length of ⁠ +L +{\displaystyle L}⁠) rises vertically along the y-axis in the ground frame. + +In Fig. 
4-5b, the same rod is observed from the frame of a rocket moving at speed +v +{\displaystyle v} to the right. If we imagine two clocks situated at the left and right ends of the rod that are synchronized in the frame of the rod, relativity of simultaneity causes the observer in the rocket frame to observe (not see) the clock at the right end of the rod as being advanced in time by ⁠ +L +v +/ +c +2 +{\displaystyle Lv/c^{2}}⁠, and the rod is correspondingly observed as tilted.[10]: 98–99  + +Unlike second-order relativistic effects such as length contraction or time dilation, this effect becomes quite significant even at fairly low velocities. For example, this can be seen in the spin of moving particles, where Thomas precession is a relativistic correction that applies to the spin of an elementary particle or the rotation of a macroscopic gyroscope, relating the angular velocity of the spin of a particle following a curvilinear orbit to the angular velocity of the orbital motion.[10]: 169–174  + +Thomas rotation provides the resolution to the well-known "meter stick and hole paradox".[p 9][10]: 98–99  + +Causality and prohibition of motion faster than light +See also: Causality (physics) and Tachyonic antitelephone + +Figure 4–6. Light cone +In Fig. 4-6, the time interval between the events A (the "cause") and B (the "effect") is 'timelike'; that is, there is a frame of reference in which events A and B occur at the same location in space, separated only by occurring at different times. If A precedes B in that frame, then A precedes B in all frames accessible by a Lorentz transformation. It is possible for matter (or information) to travel (below light speed) from the location of A, starting at the time of A, to the location of B, arriving at the time of B, so there can be a causal relationship (with A the cause and B the effect). 
+ +The interval AC in the diagram is 'spacelike'; that is, there is a frame of reference in which events A and C occur simultaneously, separated only in space. There are also frames in which A precedes C (as shown) and frames in which C precedes A. But no frames are accessible by a Lorentz transformation, in which events A and C occur at the same location. If it were possible for a cause-and-effect relationship to exist between events A and C, paradoxes of causality would result. + +For example, if signals could be sent faster than light, then signals could be sent into the sender's past (observer B in the diagrams).[47][p 10] A variety of causal paradoxes could then be constructed. + +Causality violation: Beginning of scenario resulting from use of a fictitious instantaneous communicator +Causality violation: B receives the message before having sent it. +Figure 4-7. Causality violation by the use of fictitious +"instantaneous communicators" +Consider the spacetime diagrams in Fig. 4-7. A and B stand alongside a railroad track, when a high-speed train passes by, with C riding in the last car of the train and D riding in the leading car. The world lines of A and B are vertical (ct), distinguishing the stationary position of these observers on the ground, while the world lines of C and D are tilted forwards (ct′), reflecting the rapid motion of the observers C and D stationary in their train, as observed from the ground. + +Fig. 4-7a. The event of "B passing a message to D", as the leading car passes by, is at the origin of D's frame. D sends the message along the train to C in the rear car, using a fictitious "instantaneous communicator". The worldline of this message is the fat red arrow along the +− +x +′ +{\displaystyle -x'} axis, which is a line of simultaneity in the primed frames of C and D. In the (unprimed) ground frame the signal arrives earlier than it was sent. +Fig. 4-7b. 
The event of "C passing the message to A", who is standing by the railroad tracks, is at the origin of their frames. Now A sends the message along the tracks to B via an "instantaneous communicator". The worldline of this message is the blue fat arrow, along the ++ +x +{\displaystyle +x} axis, which is a line of simultaneity for the frames of A and B. As seen from the spacetime diagram, in the primed frames of C and D, B will receive the message before it was sent out, a violation of causality.[48] +It is not necessary for signals to be instantaneous to violate causality. Even if the signal from D to C were slightly shallower than the +x +′ +{\displaystyle x'} axis (and the signal from A to B slightly steeper than the +x +{\displaystyle x} axis), it would still be possible for B to receive his message before he had sent it. By increasing the speed of the train to near light speeds, the +c +t +′ +{\displaystyle ct'} and +x +′ +{\displaystyle x'} axes can be squeezed very close to the dashed line representing the speed of light. With this modified setup, it can be demonstrated that even signals only slightly faster than the speed of light will result in causality violation.[49] + +Therefore, if causality is to be preserved, one of the consequences of special relativity is that no information signal or material object can travel faster than light in vacuum. + +Only matter and energy are limited by the speed of light. Various trivial situations can be described where some imaginary points move faster than light.[50] For example, the location where the beam of a search light hits the bottom of a cloud can move faster than light when the search light is turned rapidly. The light beam is not solid and it does not instantly follow the motion of the search light and thus does not violate causality or any other relativistic phenomenon.[51][52] + +Optical effects +Dragging effects +Main article: Fizeau experiment + +Figure 5–1. 
Highly simplified diagram of Fizeau's 1851 experiment. +In 1850, Hippolyte Fizeau and Léon Foucault independently established that light travels more slowly in water than in air, thus validating a prediction of Fresnel's wave theory of light and invalidating the corresponding prediction of Newton's corpuscular theory.[53] The speed of light was measured in still water. What would be the speed of light in flowing water? + +In 1851, Fizeau conducted an experiment to answer this question, a simplified representation of which is illustrated in Fig. 5-1. A beam of light is divided by a beam splitter, and the split beams are passed in opposite directions through a tube of flowing water. They are recombined to form interference fringes, indicating a difference in optical path length, that an observer can view. The experiment demonstrated that dragging of the light by the flowing water caused a displacement of the fringes, showing that the motion of the water had affected the speed of the light. + +According to the theories prevailing at the time, light traveling through a moving medium would be a simple sum of its speed through the medium plus the speed of the medium. Contrary to expectation, Fizeau found that although light appeared to be dragged by the water, the magnitude of the dragging was much lower than expected. If +u +′ += +c +/ +n +{\displaystyle u'=c/n} is the speed of light in still water, and +v +{\displaystyle v} is the speed of the water, and +u +± +{\displaystyle u_{\pm }} is the water-borne speed of light in the lab frame with the flow of water adding to or subtracting from the speed of light, then +u +± += +c +n +± +v +( +1 +− +1 +n +2 +) + +. +{\displaystyle u_{\pm }={\frac {c}{n}}\pm v\left(1-{\frac {1}{n^{2}}}\right)\ .} + +Fizeau's results, although consistent with Fresnel's earlier hypothesis of partial aether dragging, were extremely disconcerting to physicists of the time. 
Among other things, the presence of an index of refraction term meant that, since +n +{\displaystyle n} depends on wavelength, the aether must be capable of sustaining different motions at the same time.[note 1] A variety of theoretical explanations were proposed to explain Fresnel's dragging coefficient, that were completely at odds with each other. Even before the Michelson–Morley experiment, Fizeau's experimental results were among a number of observations that created a critical situation in explaining the optics of moving bodies.[54] + +From the point of view of special relativity, Fizeau's result is nothing but an approximation to Equation 10, the relativistic formula for composition of velocities.[33] + +u +± += +u +′ +± +v +1 +± +u +′ +v +/ +c +2 += +{\displaystyle u_{\pm }={\frac {u'\pm v}{1\pm u'v/c^{2}}}=} +c +/ +n +± +v +1 +± +v +/ +c +n +≈ +{\displaystyle {\frac {c/n\pm v}{1\pm v/cn}}\approx } +c +( +1 +n +± +v +c +) +( +1 +∓ +v +c +n +) +≈ +{\displaystyle c\left({\frac {1}{n}}\pm {\frac {v}{c}}\right)\left(1\mp {\frac {v}{cn}}\right)\approx } +c +n +± +v +( +1 +− +1 +n +2 +) +{\displaystyle {\frac {c}{n}}\pm v\left(1-{\frac {1}{n^{2}}}\right)} +Relativistic aberration of light +Main articles: Aberration of light and Light-time correction + +Figure 5–2. Illustration of stellar aberration +Because of the finite speed of light, if the relative motions of a source and receiver include a transverse component, then the direction from which light arrives at the receiver will be displaced from the geometric position in space of the source relative to the receiver. The classical calculation of the displacement takes two forms and makes different predictions depending on whether the receiver, the source, or both are in motion with respect to the medium. (1) If the receiver is in motion, the displacement would be the consequence of the aberration of light. 
The incident angle of the beam relative to the receiver would be calculable from the vector sum of the receiver's motions and the velocity of the incident light.[55] (2) If the source is in motion, the displacement would be the consequence of light-time correction. The displacement of the apparent position of the source from its geometric position would be the result of the source's motion during the time that its light takes to reach the receiver.[56] + +The classical explanation failed experimental test. Since the aberration angle depends on the relationship between the velocity of the receiver and the speed of the incident light, passage of the incident light through a refractive medium should change the aberration angle. In 1810, Arago used this expected phenomenon in a failed attempt to measure the speed of light,[57] and in 1870, George Airy tested the hypothesis using a water-filled telescope, finding that, against expectation, the measured aberration was identical to the aberration measured with an air-filled telescope.[58] A "cumbrous" attempt to explain these results used the hypothesis of partial aether-drag,[59] but was incompatible with the results of the Michelson–Morley experiment, which apparently demanded complete aether-drag.[60] + +Assuming inertial frames, the relativistic expression for the aberration of light is applicable to both the receiver moving and source moving cases. A variety of trigonometrically equivalent formulas have been published. Expressed in terms of the variables in Fig. 
5-2, these include[33]: 57–60  + +cos +⁡ +θ +′ += +cos +⁡ +θ ++ +v +/ +c +1 ++ +( +v +/ +c +) +cos +⁡ +θ +{\displaystyle \cos \theta '={\frac {\cos \theta +v/c}{1+(v/c)\cos \theta }}} OR +sin +⁡ +θ +′ += +sin +⁡ +θ +γ +[ +1 ++ +( +v +/ +c +) +cos +⁡ +θ +] +{\displaystyle \sin \theta '={\frac {\sin \theta }{\gamma [1+(v/c)\cos \theta ]}}} OR +tan +⁡ +θ +′ +2 += +( +c +− +v +c ++ +v +) +1 +/ +2 +tan +⁡ +θ +2 +{\displaystyle \tan {\frac {\theta '}{2}}=\left({\frac {c-v}{c+v}}\right)^{1/2}\tan {\frac {\theta }{2}}} +Relativistic Doppler effect +Main article: Relativistic Doppler effect +Relativistic longitudinal Doppler effect +The classical Doppler effect depends on whether the source, receiver, or both are in motion with respect to the medium. The relativistic Doppler effect is independent of any medium. Nevertheless, relativistic Doppler shift for the longitudinal case, with source and receiver moving directly towards or away from each other, can be derived as if it were the classical phenomenon, but modified by the addition of a time dilation term, and that is the treatment described here.[61][62] + +Assume the receiver and the source are moving away from each other with a relative speed +v +{\displaystyle v} as measured by an observer on the receiver or the source (The sign convention adopted here is that +v +{\displaystyle v} is negative if the receiver and the source are moving towards each other). Assume that the source is stationary in the medium. Then +f +r += +( +1 +− +v +c +s +) +f +s +{\displaystyle f_{r}=\left(1-{\frac {v}{c_{s}}}\right)f_{s}}where +c +s +{\displaystyle c_{s}} is the speed of sound. + +For light, and with the receiver moving at relativistic speeds, clocks on the receiver are time dilated relative to clocks at the source. The receiver will measure the received frequency to be +f +r += +γ +( +1 +− +β +) +f +s += +1 +− +β +1 ++ +β +f +s +. 
+{\displaystyle f_{r}=\gamma \left(1-\beta \right)f_{s}={\sqrt {\frac {1-\beta }{1+\beta }}}\,f_{s}.}where + +β += +v +/ +c +{\displaystyle \beta =v/c} and +γ += +1 +1 +− +β +2 +{\displaystyle \gamma ={\frac {1}{\sqrt {1-\beta ^{2}}}}} is the Lorentz factor. +An identical expression for relativistic Doppler shift is obtained when performing the analysis in the reference frame of the receiver with a moving source.[63][26]: 540  + +Transverse Doppler effect + +Figure 5–3. Transverse Doppler effect for two scenarios: (a) receiver moving in a circle around the source; (b) source moving in a circle around the receiver. +The transverse Doppler effect is one of the main novel predictions of the special theory of relativity. + +Classically, one might expect that if source and receiver are moving transversely with respect to each other with no longitudinal component to their relative motions, that there should be no Doppler shift in the light arriving at the receiver. + +Special relativity predicts otherwise. Fig. 5-3 illustrates two common variants of this scenario. Both variants can be analyzed using simple time dilation arguments.[26]: 541  In Fig. 5-3a, the receiver observes light from the source as being blueshifted by a factor of ⁠ +γ +{\displaystyle \gamma }⁠. In Fig. 5-3b, the light is redshifted by the same factor. + +Measurement versus visual appearance +Main article: Terrell rotation + +Figure 5–4. Comparison of the measured length contraction of a cube versus its visual appearance. +Time dilation and length contraction are not optical illusions, but genuine effects. Measurements of these effects are not an artifact of Doppler shift, nor are they the result of neglecting to take into account the time it takes light to travel from an event to an observer. + +Scientists make a fundamental distinction between measurement or observation on the one hand, versus visual appearance, or what one sees. 
The measured shape of an object is a hypothetical snapshot of all of the object's points as they exist at a single moment in time. But the visual appearance of an object is affected by the varying lengths of time that light takes to travel from different points on the object to one's eye. + + +Figure 5–5. Comparison of the measured length contraction of a globe versus its visual appearance, as viewed from a distance of three diameters of the globe from the eye to the red cross. +For many years, the distinction between the two had not been generally appreciated, and it had generally been thought that a length contracted object passing by an observer would be observed as length contracted. In 1959, James Terrell and Roger Penrose independently pointed out that differential time lag effects in signals reaching the observer from the different parts of a moving object result in a fast moving object's visual appearance being quite different from its measured shape. For example, a receding object would appear contracted, an approaching object would appear elongated, and a passing object would have a skew appearance that has been likened to a rotation.[p 13][p 14][64][65] A sphere in motion retains the circular outline for all speeds, for any distance, and for all view angles, although the surface of the sphere and the images on it will appear distorted.[66][67] + + +Figure 5–6. Galaxy M87 sends out a black-hole-powered jet of electrons and other sub-atomic particles traveling at nearly the speed of light. +Both Fig. 5-4 and Fig. 5-5 illustrate objects moving transversely to the line of sight. In Fig. 5-4, a cube is viewed from a distance of four times the length of its sides. At high speeds, the sides of the cube that are perpendicular to the direction of motion appear hyperbolic in shape. The cube is not rotated. Rather, light from the rear of the cube takes longer to reach one's eyes compared with light from the front, during which time the cube has moved to the right. 
At high speeds, the sphere in Fig. 5-5 takes on the appearance of a flattened disk tilted up to 45° from the line of sight. If the objects' motions are not strictly transverse but instead include a longitudinal component, exaggerated distortions in perspective may be seen.[68] This illusion has come to be known as Terrell rotation or the Terrell–Penrose effect. + +Another example where visual appearance is at odds with measurement comes from the observation of apparent superluminal motion in various radio galaxies, BL Lac objects, quasars, and other astronomical objects that eject relativistic-speed jets of matter at narrow angles with respect to the viewer. An apparent optical illusion results giving the appearance of faster than light travel.[69][70][71] In Fig. 5-6, galaxy M87 streams out a high-speed jet of subatomic particles almost directly towards us, but Penrose–Terrell rotation causes the jet to appear to be moving laterally in the same manner that the appearance of the cube in Fig. 5-4 has been stretched out.[72] + +Dynamics +Section § Consequences derived from the Lorentz transformation dealt strictly with kinematics, the study of the motion of points, bodies, and systems of bodies without considering the forces that caused the motion. This section discusses masses, forces, energy and so forth, and as such requires consideration of physical effects beyond those encompassed by the Lorentz transformation itself. + +Equivalence of mass and energy +Main article: Mass–energy equivalence +Mass–energy equivalence is a consequence of special relativity. The energy and momentum, which are separate in Newtonian mechanics, form a four-vector in relativity, and this relates the time component (the energy) to the space components (the momentum) in a non-trivial way. For an object at rest, the energy–momentum four-vector is (E/c, 0, 0, 0): it has a time component, which is the energy, and three space components, which are zero. 
By changing frames with a Lorentz transformation in the x direction with a small value of the velocity v, the energy momentum four-vector becomes (E/c, Ev/c2, 0, 0). The momentum is equal to the energy multiplied by the velocity divided by c2. As such, the Newtonian mass of an object, which is the ratio of the momentum to the velocity for slow velocities, is equal to E/c2. + +The energy and momentum are properties of matter and radiation, and it is impossible to deduce that they form a four-vector just from the two basic postulates of special relativity by themselves, because these do not talk about matter or radiation, they only talk about space and time. The derivation therefore requires some additional physical reasoning. In his 1905 paper, Einstein used the additional principles that Newtonian mechanics should hold for slow velocities, so that there is one energy scalar and one three-vector momentum at slow velocities, and that the conservation law for energy and momentum is exactly true in relativity. Furthermore, he assumed that the energy of light is transformed by the same Doppler-shift factor as its frequency, which he had previously shown to be true based on Maxwell's equations.[p 1] The first of Einstein's papers on this subject was "Does the Inertia of a Body Depend upon its Energy Content?" in 1905.[p 15] Although Einstein's argument in this paper is nearly universally accepted by physicists as correct, even self-evident, many authors over the years have suggested that it is wrong.[73] Other authors suggest that the argument was merely inconclusive because it relied on some implicit assumptions.[74] + +Einstein acknowledged the controversy over his derivation in his 1907 survey paper on special relativity. There he notes that it is problematic to rely on Maxwell's equations for the heuristic mass–energy argument. 
The argument in his 1905 paper can be carried out with the emission of any massless particles, but the Maxwell equations are implicitly used to make it obvious that the emission of light in particular can be achieved only by doing work. To emit electromagnetic waves, all you have to do is shake a charged particle, and this is clearly doing work, so that the emission is of energy.[p 16] + +Einstein's 1905 demonstration of E = mc2 +In his fourth of his 1905 Annus mirabilis papers,[p 15] Einstein presented a heuristic argument for the equivalence of mass and energy. Although, as discussed above, subsequent scholarship has established that his arguments fell short of a broadly definitive proof, the conclusions that he reached in this paper have stood the test of time. + +Einstein took as starting assumptions his recently discovered formula for relativistic Doppler shift, the laws of conservation of energy and conservation of momentum, and the relationship between the frequency of light and its energy as implied by Maxwell's equations. + + + +Figure 6-1. Einstein's 1905 derivation of E = mc2 +Fig. 6-1 (top). Consider a system of plane waves of light having frequency +f +{\displaystyle f} traveling in direction +ϕ +{\displaystyle \phi } relative to the x-axis of reference frame S. The frequency (and hence energy) of the waves as measured in frame S′ that is moving along the x-axis at velocity +v +{\displaystyle v} is given by the relativistic Doppler shift formula that Einstein had developed in his 1905 paper on special relativity:[p 1] + +f +′ +f += +1 +− +( +v +/ +c +) +cos +⁡ +ϕ +1 +− +v +2 +/ +c +2 +{\displaystyle {\frac {f'}{f}}={\frac {1-(v/c)\cos {\phi }}{\sqrt {1-v^{2}/c^{2}}}}} +Fig. 6-1 (bottom). Consider an arbitrary body that is stationary in reference frame S. Let this body emit a pair of equal-energy light-pulses in opposite directions at angle +ϕ +{\displaystyle \phi } with respect to the x-axis. Each pulse has energy ⁠ +L +/ +2 +{\displaystyle L/2}⁠. 
Because of conservation of momentum, the body remains stationary in S after emission of the two pulses. Let +E +0 +{\displaystyle E_{0}} be the energy of the body before emission of the two pulses and +E +1 +{\displaystyle E_{1}} after their emission. + +Next, consider the same system observed from frame S′ that is moving along the x-axis at speed +v +{\displaystyle v} relative to frame S. In this frame, light from the forwards and reverse pulses will be relativistically Doppler-shifted. Let +H +0 +{\displaystyle H_{0}} be the energy of the body measured in reference frame S′ before emission of the two pulses and +H +1 +{\displaystyle H_{1}} after their emission. We obtain the following relationships:[p 15] + +E +0 += +E +1 ++ +1 +2 +L ++ +1 +2 +L += +E +1 ++ +L +H +0 += +H +1 ++ +1 +2 +L +1 +− +( +v +/ +c +) +cos +⁡ +ϕ +1 +− +v +2 +/ +c +2 ++ +1 +2 +L +1 ++ +( +v +/ +c +) +cos +⁡ +ϕ +1 +− +v +2 +/ +c +2 += +H +1 ++ +L +1 +− +v +2 +/ +c +2 +{\displaystyle {\begin{aligned}E_{0}&=E_{1}+{\tfrac {1}{2}}L+{\tfrac {1}{2}}L=E_{1}+L\\[5mu]H_{0}&=H_{1}+{\tfrac {1}{2}}L{\frac {1-(v/c)\cos {\phi }}{\sqrt {1-v^{2}/c^{2}}}}+{\tfrac {1}{2}}L{\frac {1+(v/c)\cos {\phi }}{\sqrt {1-v^{2}/c^{2}}}}=H_{1}+{\frac {L}{\sqrt {1-v^{2}/c^{2}}}}\end{aligned}}} +From the above equations, we obtain the following: + +( +H +0 +− +E +0 +) +− +( +H +1 +− +E +1 +) += +L +( +1 +1 +− +v +2 +/ +c +2 +− +1 +) +{\displaystyle \quad \quad (H_{0}-E_{0})-(H_{1}-E_{1})=L\left({\frac {1}{\sqrt {1-v^{2}/c^{2}}}}-1\right)} 6-1 +The two differences of form +H +− +E +{\displaystyle H-E} seen in the above equation have a straightforward physical interpretation. 
Since
+H
+{\displaystyle H} and
+E
+{\displaystyle E} are the energies of the arbitrary body in the moving and stationary frames,
+H
+0
+−
+E
+0
+{\displaystyle H_{0}-E_{0}} and
+H
+1
+−
+E
+1
+{\displaystyle H_{1}-E_{1}} represent the kinetic energies of the bodies before and after the emission of light (except for an additive constant that fixes the zero point of energy and is conventionally set to zero). Hence,
+
+K
+0
+−
+K
+1
+=
+L
+(
+1
+1
+−
+v
+2
+/
+c
+2
+−
+1
+)
+{\displaystyle \quad \quad K_{0}-K_{1}=L\left({\frac {1}{\sqrt {1-v^{2}/c^{2}}}}-1\right)} 6-2
+Taking a Taylor series expansion and neglecting higher order terms, he obtained
+
+K
+0
+−
+K
+1
+=
+1
+2
+L
+c
+2
+v
+2
+{\displaystyle \quad \quad K_{0}-K_{1}={\frac {1}{2}}{\frac {L}{c^{2}}}v^{2}} 6-3
+Comparing the above expression with the classical expression for kinetic energy, K.E. = ⁠
+1
+/
+2
+⁠mv2, Einstein then noted: "If a body gives off the energy L in the form of radiation, its mass diminishes by L/c2."
+
+Rindler has observed that Einstein's heuristic argument suggested merely that energy contributes to mass. In 1905, Einstein's cautious expression of the mass–energy relationship allowed for the possibility that "dormant" mass might exist that would remain behind after all the energy of a body was removed. By 1907, however, Einstein was ready to assert that all inertial mass represented a reserve of energy. "To equate all mass with energy required an act of aesthetic faith, very characteristic of Einstein."[13]: 81–84  Einstein's bold hypothesis has been amply confirmed in the years subsequent to his original proposal.
+
+For a variety of reasons, Einstein's original derivation is currently seldom taught.
Besides the vigorous debate that continues until this day as to the formal correctness of his original derivation, the recognition of special relativity as being what Einstein called a "principle theory" has led to a shift away from reliance on electromagnetic phenomena to purely dynamic methods of proof.[75] + +How far can you travel from the Earth? +See also: Space travel under constant acceleration +Since nothing can travel faster than light, one might conclude that a human can never travel farther from Earth than ~ 100 light years. You would easily think that a traveler would never be able to reach more than the few solar systems that exist within the limit of 100 light years from Earth. However, because of time dilation, a hypothetical spaceship can travel thousands of light years during a passenger's lifetime. If a spaceship could be built that accelerates at a constant 1g, it will, after one year, be travelling at almost the speed of light as seen from Earth. This is described by: +v +( +t +) += +a +t +1 ++ +a +2 +t +2 +/ +c +2 +, +{\displaystyle v(t)={\frac {at}{\sqrt {1+a^{2}t^{2}/c^{2}}}},}where v(t) is the velocity at a time t, a is the acceleration of the spaceship and t is the coordinate time as measured by people on Earth.[p 17] Therefore, after one year of accelerating at 9.81 m/s2, the spaceship will be travelling at v = 0.712 c and 0.946 c after three years, relative to Earth. After three years of this acceleration, with the spaceship achieving a velocity of 94.6% of the speed of light relative to Earth, time dilation will result in each second experienced on the spaceship corresponding to 3.1 seconds back on Earth. During their journey, people on Earth will experience more time than they do – since their clocks (all physical phenomena) would really be ticking 3.1 times faster than those of the spaceship. A 5-year round trip for the traveller will take 6.5 Earth years and cover a distance of over 6 light-years. 
A 20-year round trip for them (5 years accelerating, 5 decelerating, twice each) will land them back on Earth having travelled for 335 Earth years and a distance of 331 light years.[76] A full 40-year trip at 1g will appear on Earth to last 58,000 years and cover a distance of 55,000 light years. A 40-year trip at 1.1 g will take 148,000 years and cover about 140,000 light years. A one-way 28-year (14 years accelerating, 14 decelerating as measured with the astronaut's clock) trip at 1g acceleration could reach 2,000,000 light-years to the Andromeda Galaxy.[76] This same time dilation is why a muon travelling close to c is observed to travel much farther than c times its half-life (when at rest).[77] 
+
+Elastic collisions
+Examination of the collision products generated by particle accelerators around the world provides scientists with evidence of the structure of the subatomic world and the natural laws governing it. Analysis of the collision products, the sum of whose masses may vastly exceed the masses of the incident particles, requires special relativity.[78]
+
+In Newtonian mechanics, analysis of collisions involves use of the conservation laws for mass, momentum and energy. In relativistic mechanics, mass is not independently conserved, because it has been subsumed into the total relativistic energy. We illustrate the differences that arise between the Newtonian and relativistic treatments of particle collisions by examining the simple case of two perfectly elastic colliding particles of equal mass. (Inelastic collisions are discussed in Spacetime#Conservation laws. Radioactive decay may be considered a sort of time-reversed inelastic collision.[78])
+
+Elastic scattering of charged elementary particles deviates from ideality due to the production of Bremsstrahlung radiation.[79][80]
+
+Newtonian analysis
+
+Figure 6–2. Newtonian analysis of the elastic collision of a moving particle with an equal mass stationary particle
+Fig.
6-2 provides a demonstration of the result, familiar to billiard players, that if a stationary ball is struck elastically by another one of the same mass (assuming no sidespin, or "English"), then after collision, the diverging paths of the two balls will subtend a right angle. (a) In the stationary frame, an incident sphere traveling at 2v strikes a stationary sphere. (b) In the center of momentum frame, the two spheres approach each other symmetrically at ±v. After elastic collision, the two spheres rebound from each other with equal and opposite velocities ±u. Energy conservation requires that |u| = |v|. (c) Reverting to the stationary frame, the rebound velocities are v ± u. The dot product (v + u) ⋅ (v − u) = v2 − u2 = 0, indicating that the vectors are orthogonal.[13]: 26–27  + +Relativistic analysis + +Figure 6–3. Relativistic elastic collision between a moving particle incident upon an equal mass stationary particle +Consider the elastic collision scenario in Fig. 6-3 between a moving particle colliding with an equal mass stationary particle. 
Unlike the Newtonian case, the angle between the two particles after collision is less than 90°, is dependent on the angle of scattering, and becomes smaller and smaller as the velocity of the incident particle approaches the speed of light: + +The relativistic momentum and total relativistic energy of a particle are given by + +p +→ += +γ +m +v +→ +and +E += +γ +m +c +2 +{\displaystyle \quad \quad {\vec {p}}=\gamma m{\vec {v}}\quad {\text{and}}\quad E=\gamma mc^{2}} 6-4 +Conservation of momentum dictates that the sum of the momenta of the incoming particle and the stationary particle (which initially has momentum = 0) equals the sum of the momenta of the emergent particles: + +γ +1 +m +v +1 +→ ++ +0 += +γ +2 +m +v +2 +→ ++ +γ +3 +m +v +3 +→ +{\displaystyle \quad \quad \gamma _{1}m{\vec {v_{1}}}+0=\gamma _{2}m{\vec {v_{2}}}+\gamma _{3}m{\vec {v_{3}}}} 6-5 +Likewise, the sum of the total relativistic energies of the incoming particle and the stationary particle (which initially has total energy mc2) equals the sum of the total energies of the emergent particles: + +γ +1 +m +c +2 ++ +m +c +2 += +γ +2 +m +c +2 ++ +γ +3 +m +c +2 +{\displaystyle \quad \quad \gamma _{1}mc^{2}+mc^{2}=\gamma _{2}mc^{2}+\gamma _{3}mc^{2}} 6-6 +Breaking down (6-5) into its components, replacing +v +{\displaystyle v} with the dimensionless ⁠ +β +{\displaystyle \beta }⁠, and factoring out common terms from (6-5) and (6-6) yields the following:[p 18] + +β +1 +γ +1 += +β +2 +γ +2 +cos +⁡ +θ ++ +β +3 +γ +3 +cos +⁡ +ϕ +{\displaystyle \quad \quad \beta _{1}\gamma _{1}=\beta _{2}\gamma _{2}\cos {\theta }+\beta _{3}\gamma _{3}\cos {\phi }} 6-7 +β +2 +γ +2 +sin +⁡ +θ += +β +3 +γ +3 +sin +⁡ +ϕ +{\displaystyle \quad \quad \beta _{2}\gamma _{2}\sin {\theta }=\beta _{3}\gamma _{3}\sin {\phi }} 6-8 +γ +1 ++ +1 += +γ +2 ++ +γ +3 +{\displaystyle \quad \quad \gamma _{1}+1=\gamma _{2}+\gamma _{3}} 6-9 +From these we obtain the following relationships:[p 18] + +β +2 += +β +1 +sin +⁡ +ϕ +{ +β +1 +2 +sin +2 +⁡ +ϕ 
++ +sin +2 +⁡ +( +ϕ ++ +θ +) +/ +γ +1 +2 +} +1 +/ +2 +{\displaystyle \quad \quad \beta _{2}={\frac {\beta _{1}\sin {\phi }}{\{\beta _{1}^{2}\sin ^{2}{\phi }+\sin ^{2}(\phi +\theta )/\gamma _{1}^{2}\}^{1/2}}}} 6-10 +β +3 += +β +1 +sin +⁡ +θ +{ +β +1 +2 +sin +2 +⁡ +θ ++ +sin +2 +⁡ +( +ϕ ++ +θ +) +/ +γ +1 +2 +} +1 +/ +2 +{\displaystyle \quad \quad \beta _{3}={\frac {\beta _{1}\sin {\theta }}{\{\beta _{1}^{2}\sin ^{2}{\theta }+\sin ^{2}(\phi +\theta )/\gamma _{1}^{2}\}^{1/2}}}} 6-11 +cos +⁡ +( +ϕ ++ +θ +) += +( +γ +1 +− +1 +) +sin +⁡ +θ +cos +⁡ +θ +{ +( +γ +1 ++ +1 +) +2 +sin +2 +⁡ +θ ++ +4 +cos +2 +⁡ +θ +} +1 +/ +2 +{\displaystyle \quad \quad \cos {(\phi +\theta )}={\frac {(\gamma _{1}-1)\sin {\theta }\cos {\theta }}{\{(\gamma _{1}+1)^{2}\sin ^{2}\theta +4\cos ^{2}\theta \}^{1/2}}}} 6-12 +For the symmetrical case in which +ϕ += +θ +{\displaystyle \phi =\theta } and ⁠ +β +2 += +β +3 +{\displaystyle \beta _{2}=\beta _{3}}⁠, (6-12) takes on the simpler form:[p 18] + +cos +⁡ +θ += +β +1 +{ +2 +/ +γ +1 ++ +3 +β +1 +2 +− +2 +} +1 +/ +2 +{\displaystyle \quad \quad \cos {\theta }={\frac {\beta _{1}}{\{2/\gamma _{1}+3\beta _{1}^{2}-2\}^{1/2}}}} 6-13 +Rapidity +Main article: Rapidity + +Figure 7-1a. A ray through the unit circle x2 + y2 = 1 in the point (cos a, sin a), where a is twice the area between the ray, the circle, and the x-axis. + +Figure 7-1b. A ray through the unit hyperbola x2 − y2 = 1 in the point (cosh a, sinh a), where a is twice the area between the ray, the hyperbola, and the x-axis. + +Figure 7–2. Plot of the three basic Hyperbolic functions: hyperbolic sine (sinh), hyperbolic cosine (cosh) and hyperbolic tangent (tanh). Sinh is red, cosh is blue and tanh is green. +Lorentz transformations relate coordinates of events in one reference frame to those of another frame. Relativistic composition of velocities is used to add two velocities together. 
The formulas to perform the latter computations are nonlinear, making them more complex than the corresponding Galilean formulas. + +This nonlinearity is an artifact of our choice of parameters.[10]: 47–59  We have previously noted that in an x–ct spacetime diagram, the points at some constant spacetime interval from the origin form an invariant hyperbola. We have also noted that the coordinate systems of two spacetime reference frames in standard configuration are hyperbolically rotated with respect to each other. + +The natural functions for expressing these relationships are the hyperbolic analogs of the trigonometric functions. Fig. 7-1a shows a unit circle with sin(a) and cos(a), the only difference between this diagram and the familiar unit circle of elementary trigonometry being that a is interpreted, not as the angle between the ray and the x-axis, but as twice the area of the sector swept out by the ray from the x-axis. Numerically, the angle and 2 × area measures for the unit circle are identical. Fig. 7-1b shows a unit hyperbola with sinh(a) and cosh(a), where a is likewise interpreted as twice the tinted area.[81] Fig. 7-2 presents plots of the sinh, cosh, and tanh functions. + +For the unit circle, the slope of the ray is given by + +slope += +tan +⁡ +a += +sin +⁡ +a +cos +⁡ +a +. +{\displaystyle {\text{slope}}=\tan a={\frac {\sin a}{\cos a}}.} +In the Cartesian plane, rotation of point (x, y) into point (x', y') by angle θ is given by + +( +x +′ +y +′ +) += +( +cos +⁡ +θ +− +sin +⁡ +θ +sin +⁡ +θ +cos +⁡ +θ +) +( +x +y +) +. +{\displaystyle {\begin{pmatrix}x'\\y'\\\end{pmatrix}}={\begin{pmatrix}\cos \theta &-\sin \theta \\\sin \theta &\cos \theta \\\end{pmatrix}}{\begin{pmatrix}x\\y\\\end{pmatrix}}.} +In a spacetime diagram, the velocity parameter +β +≡ +v +c +{\displaystyle \beta \equiv {\frac {v}{c}}} is the analog of slope. 
The rapidity, φ, is defined by[26]: 543  + +β +≡ +tanh +⁡ +ϕ +, +{\displaystyle \beta \equiv \tanh \phi ,} +where + +tanh +⁡ +ϕ += +sinh +⁡ +ϕ +cosh +⁡ +ϕ += +e +ϕ +− +e +− +ϕ +e +ϕ ++ +e +− +ϕ +. +{\displaystyle \tanh \phi ={\frac {\sinh \phi }{\cosh \phi }}={\frac {e^{\phi }-e^{-\phi }}{e^{\phi }+e^{-\phi }}}.} +The rapidity defined above is very useful in special relativity because many expressions take on a considerably simpler form when expressed in terms of it. For example, rapidity is simply additive in the collinear velocity-addition formula;[26]: 544  + +β += +β +1 ++ +β +2 +1 ++ +β +1 +β +2 += +{\displaystyle \beta ={\frac {\beta _{1}+\beta _{2}}{1+\beta _{1}\beta _{2}}}=} +tanh +⁡ +ϕ +1 ++ +tanh +⁡ +ϕ +2 +1 ++ +tanh +⁡ +ϕ +1 +tanh +⁡ +ϕ +2 += +{\displaystyle {\frac {\tanh \phi _{1}+\tanh \phi _{2}}{1+\tanh \phi _{1}\tanh \phi _{2}}}=} +tanh +⁡ +( +ϕ +1 ++ +ϕ +2 +) +, +{\displaystyle \tanh(\phi _{1}+\phi _{2}),} +or in other words, ⁠ +ϕ += +ϕ +1 ++ +ϕ +2 +{\displaystyle \phi =\phi _{1}+\phi _{2}}⁠. + +The Lorentz transformations take a simple form when expressed in terms of rapidity. The γ factor can be written as + +γ += +1 +1 +− +β +2 += +1 +1 +− +tanh +2 +⁡ +ϕ +{\displaystyle \gamma ={\frac {1}{\sqrt {1-\beta ^{2}}}}={\frac {1}{\sqrt {1-\tanh ^{2}\phi }}}} += +cosh +⁡ +ϕ +, +{\displaystyle =\cosh \phi ,} +γ +β += +β +1 +− +β +2 += +tanh +⁡ +ϕ +1 +− +tanh +2 +⁡ +ϕ +{\displaystyle \gamma \beta ={\frac {\beta }{\sqrt {1-\beta ^{2}}}}={\frac {\tanh \phi }{\sqrt {1-\tanh ^{2}\phi }}}} += +sinh +⁡ +ϕ +. +{\displaystyle =\sinh \phi .} +Transformations describing relative motion with uniform velocity and without rotation of the space coordinate axes are called boosts. 
+ +Substituting γ and γβ into the transformations as previously presented and rewriting in matrix form, the Lorentz boost in the x-direction may be written as + +( +c +t +′ +x +′ +) += +( +cosh +⁡ +ϕ +− +sinh +⁡ +ϕ +− +sinh +⁡ +ϕ +cosh +⁡ +ϕ +) +( +c +t +x +) +, +{\displaystyle {\begin{pmatrix}ct'\\x'\end{pmatrix}}={\begin{pmatrix}\cosh \phi &-\sinh \phi \\-\sinh \phi &\cosh \phi \end{pmatrix}}{\begin{pmatrix}ct\\x\end{pmatrix}},} +and the inverse Lorentz boost in the x-direction may be written as + +( +c +t +x +) += +( +cosh +⁡ +ϕ +sinh +⁡ +ϕ +sinh +⁡ +ϕ +cosh +⁡ +ϕ +) +( +c +t +′ +x +′ +) +. +{\displaystyle {\begin{pmatrix}ct\\x\end{pmatrix}}={\begin{pmatrix}\cosh \phi &\sinh \phi \\\sinh \phi &\cosh \phi \end{pmatrix}}{\begin{pmatrix}ct'\\x'\end{pmatrix}}.} +In other words, Lorentz boosts represent hyperbolic rotations in Minkowski spacetime.[citation needed] + +The advantages of using hyperbolic functions are such that some textbooks such as the classic ones by Taylor and Wheeler introduce their use at a very early stage.[10] + +Minkowski spacetime +Main article: Minkowski space + +Figure 10–1. Orthogonality and rotation of coordinate systems compared between left: Euclidean space through circular angle φ, right: in Minkowski spacetime through hyperbolic angle φ (red lines labelled c denote the worldlines of a light signal, a vector is orthogonal to itself if it lies on this line).[82] +The physical theory of special relativity was recast by Hermann Minkowski in a 4-dimensional geometry now called Minkowski space. Minkowski spacetime appears to be very similar to the standard 3-dimensional Euclidean space, but there is a crucial difference with respect to time. 
In 3D space, the differential of distance (line element) ds is defined by +d +s +2 += +d +x +⋅ +d +x += +d +x +1 +2 ++ +d +x +2 +2 ++ +d +x +3 +2 +, +{\displaystyle ds^{2}=d\mathbf {x} \cdot d\mathbf {x} =dx_{1}^{2}+dx_{2}^{2}+dx_{3}^{2},}where dx = (dx1, dx2, dx3) are the differentials of the three spatial dimensions. In Minkowski geometry, there is an extra dimension with coordinate X0 derived from time, such that the distance differential fulfills +d +s +2 += +− +d +X +0 +2 ++ +d +X +1 +2 ++ +d +X +2 +2 ++ +d +X +3 +2 +, +{\displaystyle ds^{2}=-dX_{0}^{2}+dX_{1}^{2}+dX_{2}^{2}+dX_{3}^{2},}where dX = (dX0, dX1, dX2, dX3) are the differentials of the four spacetime dimensions. This suggests a deep theoretical insight: special relativity is simply a rotational symmetry of our spacetime, analogous to the rotational symmetry of Euclidean space (see Fig. 10-1).[83] Just as Euclidean space uses a Euclidean metric, so spacetime uses a Minkowski metric. Basically, special relativity can be stated as the invariance of any spacetime interval (that is the 4D distance between any two events) when viewed from any inertial reference frame. All equations and effects of special relativity can be derived from this rotational symmetry (the Poincaré group) of Minkowski spacetime. + +The form of ds above depends on the metric and on the choices for the X0 coordinate. To make the time coordinate look like the space coordinates, it can be treated as imaginary: X0 = ict (this is called a Wick rotation). According to Misner, Thorne and Wheeler (1971, §2.3), ultimately the deeper understanding of both special and general relativity will come from the study of the Minkowski metric (described below) and to take X0 = ct, rather than a "disguised" Euclidean metric using ict as the time coordinate. 
+ +Some authors use X0 = t, with factors of c elsewhere to compensate; for instance, spatial coordinates are divided by c or factors of c±2 are included in the metric tensor.[84] These numerous conventions can be superseded by using natural units where c = 1. Then space and time have equivalent units, and no factors of c appear anywhere. + +A four dimensional space has four-dimensional vectors, or "four-vectors". The simplest example of a four-vector is the position of an event in spacetime, which constitutes a timelike component ct and spacelike component x = (x, y, z), in a contravariant position four-vector with components: +X +ν += +( +X +0 +, +X +1 +, +X +2 +, +X +3 +) += +( +c +t +, +x +, +y +, +z +) += +( +c +t +, +x +) +. +{\displaystyle X^{\nu }=(X^{0},X^{1},X^{2},X^{3})=(ct,x,y,z)=(ct,\mathbf {x} ).}where we define X0 = ct so that the time coordinate has the same dimension of distance as the other spatial dimensions; so that space and time are treated equally.[85][86][87] + +4‑vectors +Main article: Four-vector +4‑vectors, and more generally tensors, simplify the mathematics and conceptual understanding of special relativity. Working exclusively with such objects leads to formulas that are manifestly relativistically invariant, which is a considerable advantage in non-trivial contexts. For instance, demonstrating relativistic invariance of Maxwell's equations in their usual form is not trivial, while it is merely a routine calculation, really no more than an observation, using the field strength tensor formulation.[88] + +Definition of 4-vectors +A 4-tuple, ⁠ +A += +( +A +0 +, +A +1 +, +A +2 +, +A +3 +) +{\displaystyle A=\left(A_{0},A_{1},A_{2},A_{3}\right)}⁠ is a "4-vector" if its component Ai transform between frames according to the Lorentz transformation. 

+
+If using ⁠
+(
+c
+t
+,
+x
+,
+y
+,
+z
+)
+{\displaystyle (ct,x,y,z)}⁠ coordinates, A is a 4–vector if it transforms (in the x-direction) according to
+
+A
+0
+′
+=
+γ
+(
+A
+0
+−
+(
+v
+/
+c
+)
+A
+1
+)
+A
+1
+′
+=
+γ
+(
+A
+1
+−
+(
+v
+/
+c
+)
+A
+0
+)
+A
+2
+′
+=
+A
+2
+A
+3
+′
+=
+A
+3
+,
+{\displaystyle {\begin{aligned}A_{0}'&=\gamma \left(A_{0}-(v/c)A_{1}\right)\\A_{1}'&=\gamma \left(A_{1}-(v/c)A_{0}\right)\\A_{2}'&=A_{2}\\A_{3}'&=A_{3}\end{aligned}},}
+which comes from simply replacing ct with A0 and x with A1 in the earlier presentation of the Lorentz transformation.
+
+As usual, when we write x, t, etc. we generally mean Δx, Δt etc.
+
+The last three components of a 4–vector must be a standard vector in three-dimensional space. Therefore, a 4–vector must transform like ⁠
+(
+c
+Δ
+t
+,
+Δ
+x
+,
+Δ
+y
+,
+Δ
+z
+)
+{\displaystyle (c\Delta t,\Delta x,\Delta y,\Delta z)}⁠ under Lorentz transformations as well as rotations.[89]: 36–59 
+
+Properties of 4-vectors
+Closure under linear combination: If A and B are 4-vectors, then ⁠
+C
+=
+a
+A
++
+b
+B
+{\displaystyle C=aA+bB}⁠ is also a 4-vector.
+Inner-product invariance: If A and B are 4-vectors, then their inner product (scalar product) is invariant, i.e. their inner product is independent of the frame in which it is calculated. Note how the calculation of inner product differs from the calculation of the inner product of a 3-vector. In the following,
+A
+→
+{\displaystyle {\vec {A}}} and
+B
+→
+{\displaystyle {\vec {B}}} are 3-vectors:
+A
+⋅
+B
+≡
+{\displaystyle A\cdot B\equiv }
+A
+0
+B
+0
+−
+A
+1
+B
+1
+−
+A
+2
+B
+2
+−
+A
+3
+B
+3
+≡
+{\displaystyle A_{0}B_{0}-A_{1}B_{1}-A_{2}B_{2}-A_{3}B_{3}\equiv }
+A
+0
+B
+0
+−
+A
+→
+⋅
+B
+→
+{\displaystyle A_{0}B_{0}-{\vec {A}}\cdot {\vec {B}}}
+In addition to being invariant under Lorentz transformation, the above inner product is also invariant under rotation in 3-space.
+Two vectors are said to be orthogonal if ⁠
+A
+⋅
+B
+=
+0
+{\displaystyle A\cdot B=0}⁠.
Unlike the case with 3-vectors, orthogonal 4-vectors are not necessarily at right angles to each other. The rule is that two 4-vectors are orthogonal if they are offset by equal and opposite angles from the 45° line, which is the world line of a light ray. This implies that a lightlike 4-vector is orthogonal to itself. +Invariance of the magnitude of a vector: The magnitude of a vector is the inner product of a 4-vector with itself, and is a frame-independent property. As with intervals, the magnitude may be positive, negative or zero, so that the vectors are referred to as timelike, spacelike or null (lightlike). Note that a null vector is not the same as a zero vector. A null vector is one for which ⁠ +A +⋅ +A += +0 +{\displaystyle A\cdot A=0}⁠, while a zero vector is one whose components are all zero. Special cases illustrating the invariance of the norm include the invariant interval +c +2 +t +2 +− +x +2 +{\displaystyle c^{2}t^{2}-x^{2}} and the invariant length of the relativistic momentum vector ⁠ +E +2 +− +p +2 +c +2 +{\displaystyle E^{2}-p^{2}c^{2}}⁠.[26]: 639 [89]: 36–59  +Examples of 4-vectors +Displacement 4-vector: Otherwise known as the spacetime separation, this is (Δt, Δx, Δy, Δz), or for infinitesimal separations, (dt, dx, dy, dz). +d +S +≡ +( +d +t +, +d +x +, +d +y +, +d +z +) +{\displaystyle dS\equiv (dt,dx,dy,dz)} +Velocity 4-vector: This results when the displacement 4-vector is divided by +d +τ +{\displaystyle d\tau }, where +d +τ +{\displaystyle d\tau } is the proper time between the two events that yield dt, dx, dy, and dz. +V +≡ +d +S +d +τ += +( +d +t +, +d +x +, +d +y +, +d +z +) +d +t +/ +γ += +{\displaystyle V\equiv {\frac {dS}{d\tau }}={\frac {(dt,dx,dy,dz)}{dt/\gamma }}=} +γ +( +1 +, +d +x +d +t +, +d +y +d +t +, +d +z +d +t +) += +{\displaystyle \gamma \left(1,{\frac {dx}{dt}},{\frac {dy}{dt}},{\frac {dz}{dt}}\right)=} +( +γ +, +γ +v +→ +) +{\displaystyle (\gamma ,\gamma {\vec {v}})} + +Figure 7-3a. 
The momentarily comoving reference frames of an accelerating particle as observed from a stationary frame. + +Figure 7-3b. The momentarily comoving reference frames along the trajectory of an accelerating observer (center). +The 4-velocity is tangent to the world line of a particle, and has a length equal to one unit of time in the frame of the particle. +An accelerated particle does not have an inertial frame in which it is always at rest. However, an inertial frame can always be found that is momentarily comoving with the particle. This frame, the momentarily comoving reference frame (MCRF), enables application of special relativity to the analysis of accelerated particles. +Since photons move on null lines, +d +τ += +0 +{\displaystyle d\tau =0} for a photon, and a 4-velocity cannot be defined. There is no frame in which a photon is at rest, and no MCRF can be established along a photon's path. +Energy–momentum 4-vector: +P +≡ +( +E +/ +c +, +p +→ +) += +( +E +/ +c +, +p +x +, +p +y +, +p +z +) +{\displaystyle P\equiv (E/c,{\vec {p}})=(E/c,p_{x},p_{y},p_{z})} +As indicated before, there are varying treatments for the energy–momentum 4-vector so that one may also see it expressed as +( +E +, +p +→ +) +{\displaystyle (E,{\vec {p}})} or ⁠ +( +E +, +p +→ +c +) +{\displaystyle (E,{\vec {p}}c)}⁠. The first component is the total energy (including mass) of the particle (or system of particles) in a given frame, while the remaining components are its spatial momentum. The energy–momentum 4-vector is a conserved quantity. +Acceleration 4-vector: This results from taking the derivative of the velocity 4-vector with respect to ⁠ +τ +{\displaystyle \tau }⁠. 
+A +≡ +d +V +d +τ += +{\displaystyle A\equiv {\frac {dV}{d\tau }}=} +d +d +τ +( +γ +, +γ +v +→ +) += +{\displaystyle {\frac {d}{d\tau }}(\gamma ,\gamma {\vec {v}})=} +γ +( +d +γ +d +t +, +d +( +γ +v +→ +) +d +t +) +{\displaystyle \gamma \left({\frac {d\gamma }{dt}},{\frac {d(\gamma {\vec {v}})}{dt}}\right)} +Force 4-vector: This is the derivative of the momentum 4-vector with respect to +τ +. +{\displaystyle \tau .} +F +≡ +d +P +d +τ += +{\displaystyle F\equiv {\frac {dP}{d\tau }}=} +γ +( +d +E +d +t +, +d +p +→ +d +t +) += +{\displaystyle \gamma \left({\frac {dE}{dt}},{\frac {d{\vec {p}}}{dt}}\right)=} +γ +( +d +E +d +t +, +f +→ +) +{\displaystyle \gamma \left({\frac {dE}{dt}},{\vec {f}}\right)} +As expected, the final components of the above 4-vectors are all standard 3-vectors corresponding to spatial 3-momentum, 3-force etc.[89]: 36–59  + +4-vectors and physical law +The first postulate of special relativity declares the equivalency of all inertial frames. A physical law holding in one frame must apply in all frames, since otherwise it would be possible to differentiate between frames. Newtonian momenta fail to behave properly under Lorentzian transformation, and Einstein preferred to change the definition of momentum to one involving 4-vectors rather than give up on conservation of momentum. + +Physical laws must be based on constructs that are frame independent. This means that physical laws may take the form of equations connecting scalars, which are always frame independent. However, equations involving 4-vectors require the use of tensors with appropriate rank, which themselves can be thought of as being built up from 4-vectors.[26]: 644  General relativity from the outset relies heavily on 4‑vectors, and more generally tensors, representing physically relevant entities. 
+ +Acceleration +Further information: Acceleration (special relativity) +Special relativity does accommodate accelerations as well as accelerating frames of reference.[90] It is a common misconception that special relativity is applicable only to inertial frames, and that it is unable to handle accelerating objects or accelerating reference frames.[91] It is only when gravitation is significant that general relativity is required.[92] + +Properly handling accelerating frames does require some care, however. The difference between special and general relativity is that (1) In special relativity, all velocities are relative, but acceleration is absolute. (2) In general relativity, all motion is relative, whether inertial, accelerating, or rotating. To accommodate this difference, general relativity uses curved spacetime.[92] + +In this section, we analyze several scenarios involving accelerated reference frames. + + +Dewan–Beran–Bell spaceship paradox +Main article: Bell's spaceship paradox +The Dewan–Beran–Bell spaceship paradox (Bell's spaceship paradox) is a good example of a problem where intuitive reasoning unassisted by the geometric insight of the spacetime approach can lead to issues.[citation needed] + + +Figure 7–4. Dewan–Beran–Bell spaceship paradox +In Fig. 7-4, two identical spaceships float in space and are at rest relative to each other. They are connected by a string that is capable of only a limited amount of stretching before breaking. At a given instant in our frame, the observer frame, both spaceships accelerate in the same direction along the line between them with the same constant proper acceleration. In relativity theory, proper acceleration is the physical acceleration (i.e., measurable acceleration as by an accelerometer) experienced by an object. It is thus acceleration relative to a free-fall, or inertial, observer who is momentarily at rest relative to the object being measured. Will the string break? 
+
+When the paradox was new and relatively unknown, even professional physicists had difficulty working out the solution. Two lines of reasoning lead to opposite conclusions. Both arguments, which are presented below, are flawed even though one of them yields the correct answer.
+
+To observers in the rest frame, the spaceships start a distance L apart and remain the same distance apart during acceleration. During acceleration, L is the length-contracted distance corresponding to the distance L' = γL in the frame of the accelerating spaceships. After a sufficiently long time, γ will increase to a sufficiently large factor that the string must break.
+Let A and B be the rear and front spaceships. In the frame of the spaceships, each spaceship sees the other spaceship doing the same thing that it is doing. A says that B has the same acceleration that he has, and B sees that A matches her every move. So the spaceships stay the same distance apart, and the string does not break.
+The problem with the first argument is that there is no "frame of the spaceships". There cannot be, because the two spaceships measure a growing distance between the two. Because there is no common frame of the spaceships, the length of the string is ill-defined. Nevertheless, the conclusion is correct, and the argument is mostly right. The second argument, however, completely ignores the relativity of simultaneity.
+
+
+Figure 7–5. The curved lines represent the world lines of two observers A and B who accelerate in the same direction with the same constant magnitude acceleration. At A' and B', the observers stop accelerating. The dashed lines are lines of simultaneity for either observer before acceleration begins and after acceleration stops.
+A spacetime diagram (Fig. 7-5) makes the correct solution to this paradox almost immediately evident. 
Two observers in Minkowski spacetime accelerate with constant magnitude +k +{\displaystyle k} acceleration for proper time +σ +{\displaystyle \sigma } (acceleration and elapsed time measured by the observers themselves, not some inertial observer). They are comoving and inertial before and after this phase. In Minkowski geometry, the length along the line of simultaneity +A +′ +B +″ +{\displaystyle A'B''} turns out to be greater than the length along the line of simultaneity ⁠ +A +B +{\displaystyle AB}⁠. + +The length increase can be calculated with the help of the Lorentz transformation. If, as illustrated in Fig. 7-5, the acceleration is finished, the ships will remain at a constant offset in some frame ⁠ +S +′ +{\displaystyle S'}⁠. If +x +A +{\displaystyle x_{A}} and +x +B += +x +A ++ +L +{\displaystyle x_{B}=x_{A}+L} are the ships' positions in ⁠ +S +{\displaystyle S}⁠, the positions in frame +S +′ +{\displaystyle S'} are:[93] + +x +A +′ += +γ +( +x +A +− +v +t +) +x +B +′ += +γ +( +x +A ++ +L +− +v +t +) +L +′ += +x +B +′ +− +x +A +′ += +γ +L +{\displaystyle {\begin{aligned}x'_{A}&=\gamma \left(x_{A}-vt\right)\\x'_{B}&=\gamma \left(x_{A}+L-vt\right)\\L'&=x'_{B}-x'_{A}=\gamma L\end{aligned}}} +The "paradox", as it were, comes from the way that Bell constructed his example. In the usual discussion of Lorentz contraction, the rest length is fixed and the moving length shortens as measured in frame ⁠ +S +{\displaystyle S}⁠. As shown in Fig. 7-5, Bell's example asserts the moving lengths +A +B +{\displaystyle AB} and +A +′ +B +′ +{\displaystyle A'B'} measured in frame +S +{\displaystyle S} to be fixed, thereby forcing the rest frame length +A +′ +B +″ +{\displaystyle A'B''} in frame +S +′ +{\displaystyle S'} to increase. 
+ + +Accelerated observer with horizon +Main articles: Event horizon § Apparent horizon of an accelerated particle, and Rindler coordinates +Certain special relativity problem setups can lead to insight about phenomena normally associated with general relativity, such as event horizons. In the text accompanying Section "Invariant hyperbola" of the article Spacetime, the magenta hyperbolae represented paths that are tracked by a constantly accelerating traveler in spacetime. During periods of positive acceleration, the traveler's velocity just approaches the speed of light, while, measured in our frame, the traveler's acceleration constantly decreases. + + +Figure 7–6. Accelerated relativistic observer with horizon. Another well-drawn illustration of the same topic may be viewed here. +Fig. 7-6 details various features of the traveler's motions with more specificity. At any given moment, her space axis is formed by a line passing through the origin and her current position on the hyperbola, while her time axis is the tangent to the hyperbola at her position. The velocity parameter +β +{\displaystyle \beta } approaches a limit of one as +c +t +{\displaystyle ct} increases. Likewise, +γ +{\displaystyle \gamma } approaches infinity. + +The shape of the invariant hyperbola corresponds to a path of constant proper acceleration. This is demonstrable as follows: + +We remember that ⁠ +β += +c +t +/ +x +{\displaystyle \beta =ct/x}⁠. +Since ⁠ +c +2 +t +2 +− +x +2 += +s +2 +{\displaystyle c^{2}t^{2}-x^{2}=s^{2}}⁠, we conclude that ⁠ +β +( +c +t +) += +c +t +/ +c +2 +t +2 +− +s +2 +{\displaystyle \beta (ct)=ct/{\sqrt {c^{2}t^{2}-s^{2}}}}⁠. 
+γ += +1 +/ +1 +− +β +2 += +{\displaystyle \gamma =1/{\sqrt {1-\beta ^{2}}}=} +c +2 +t +2 +− +s +2 +/ +s +{\displaystyle {\sqrt {c^{2}t^{2}-s^{2}}}/s} +From the relativistic force law, +F += +d +p +/ +d +t += +{\displaystyle F=dp/dt=}⁠ +d +p +c +/ +d +( +c +t +) += +d +( +β +γ +m +c +2 +) +/ +d +( +c +t +) +{\displaystyle dpc/d(ct)=d(\beta \gamma mc^{2})/d(ct)}⁠. +Substituting +β +( +c +t +) +{\displaystyle \beta (ct)} from step 2 and the expression for +γ +{\displaystyle \gamma } from step 3 yields ⁠ +F += +m +c +2 +/ +s +{\displaystyle F=mc^{2}/s}⁠, which is a constant expression.[94]: 110–113  +Fig. 7-6 illustrates a specific calculated scenario. Terence (A) and Stella (B) initially stand together 100 light hours from the origin. Stella lifts off at time 0, her spacecraft accelerating at 0.01 c per hour. Every twenty hours, Terence radios updates to Stella about the situation at home (solid green lines). Stella receives these regular transmissions, but the increasing distance (offset in part by time dilation) causes her to receive Terence's communications later and later as measured on her clock, and she never receives any communications from Terence after 100 hours on his clock (dashed green lines).[94]: 110–113  + +After 100 hours according to Terence's clock, Stella enters a dark region. She has traveled outside Terence's timelike future. On the other hand, Terence can continue to receive Stella's messages to him indefinitely. He just has to wait long enough. Spacetime has been divided into distinct regions separated by an apparent event horizon. So long as Stella continues to accelerate, she can never know what takes place behind this horizon.[94]: 110–113  + +Relativity and unifying electromagnetism +Main articles: Classical electromagnetism and special relativity and Covariant formulation of classical electromagnetism +Theoretical investigation in classical electromagnetism led to the discovery of wave propagation. 
Equations generalizing the electromagnetic effects found that finite propagation speed of the E and B fields required certain behaviors on charged particles. The general study of moving charges forms the Liénard–Wiechert potential, which is a step towards special relativity. + +The Lorentz transformation of the electric field of a moving charge into a non-moving observer's reference frame results in the appearance of a mathematical term commonly called the magnetic field. Conversely, the magnetic field generated by a moving charge disappears and becomes a purely electrostatic field in a comoving frame of reference. Maxwell's equations are thus simply an empirical fit to special relativistic effects in a classical model of the Universe. As electric and magnetic fields are reference frame dependent and thus intertwined, one speaks of electromagnetic fields. Special relativity provides the transformation rules for how an electromagnetic field in one inertial frame appears in another inertial frame. + +Maxwell's equations in the 3D form are already consistent with the physical content of special relativity, although they are easier to manipulate in a manifestly covariant form, that is, in the language of tensor calculus.[88] + +Theories of relativity and quantum mechanics +Special relativity can be combined with quantum mechanics to form relativistic quantum mechanics and quantum electrodynamics. How general relativity and quantum mechanics can be unified is one of the unsolved problems in physics; quantum gravity and a "theory of everything", which require a unification including general relativity too, are active and ongoing areas in theoretical research. 
+ +The early Bohr–Sommerfeld atomic model explained the fine structure of alkali metal atoms using both special relativity and the preliminary knowledge on quantum mechanics of the time.[95] + +In 1928, Paul Dirac constructed an influential relativistic wave equation, now known as the Dirac equation in his honour,[p 19] that is fully compatible both with special relativity and with the final version of quantum theory existing after 1926. This equation not only described the intrinsic angular momentum of the electrons called spin, it also led to the prediction of the antiparticle of the electron (the positron),[p 19][p 20] and fine structure could only be fully explained with special relativity. It was the first foundation of relativistic quantum mechanics. + +On the other hand, the existence of antiparticles leads to the conclusion that relativistic quantum mechanics is not enough for a more accurate and complete theory of particle interactions.[citation needed] Instead, a theory of particles interpreted as quantized fields, called quantum field theory, becomes necessary; in which particles can be created and destroyed throughout space and time. + +Status +Main articles: Tests of special relativity and Criticism of the theory of relativity +Special relativity in its Minkowski spacetime is accurate only when the absolute value of the gravitational potential is much less than c2 in the region of interest.[96] In a strong gravitational field, one must use general relativity. General relativity becomes special relativity at the limit of a weak field. At very small scales, such as at the Planck length and below, quantum effects must be taken into consideration resulting in quantum gravity. But at macroscopic scales and in the absence of strong gravitational fields, special relativity is experimentally tested to extremely high degree of accuracy (10−20)[97] and thus accepted by the physics community. 
Experimental results that appear to contradict it are not reproducible and are thus widely believed to be due to experimental errors.[98] + +Special relativity is mathematically self-consistent, and it is an organic part of all modern physical theories, most notably quantum field theory, string theory, and general relativity (in the limiting case of negligible gravitational fields). + +Newtonian mechanics mathematically follows from special relativity at small velocities (compared to the speed of light) – thus Newtonian mechanics can be considered as a special relativity of slow moving bodies. See Classical mechanics for a more detailed discussion. + +Several experiments predating Einstein's 1905 paper are now interpreted as evidence for relativity. Of these it is known Einstein was aware of the Fizeau experiment before 1905,[99] and historians have concluded that Einstein was at least aware of the Michelson–Morley experiment as early as 1899 despite claims he made in his later years that it played no role in his development of the theory.[23] + +The Fizeau experiment (1851, repeated by Michelson and Morley in 1886) measured the speed of light in moving media, with results that are consistent with relativistic addition of colinear velocities. +The famous Michelson–Morley experiment (1881, 1887) gave further support to the postulate that detecting an absolute reference velocity was not achievable. It should be stated here that, contrary to many alternative claims, it said little about the invariance of the speed of light with respect to the source and observer's velocity, as both source and observer were travelling together at the same velocity at all times. +The Trouton–Noble experiment (1903) showed that the torque on a capacitor is independent of position and inertial reference frame. 
+The Experiments of Rayleigh and Brace (1902, 1904) showed that length contraction does not lead to birefringence for a co-moving observer, in accordance with the relativity principle. +Particle accelerators accelerate and measure the properties of particles moving at near the speed of light, where their behavior is consistent with relativity theory and inconsistent with the earlier Newtonian mechanics. These machines would simply not work if they were not engineered according to relativistic principles. In addition, a considerable number of modern experiments have been conducted to test special relativity. Some examples: + +Tests of relativistic energy and momentum – testing the limiting speed of particles +Ives–Stilwell experiment – testing relativistic Doppler effect and time dilation +Experimental testing of time dilation – relativistic effects on a fast-moving particle's half-life +Kennedy–Thorndike experiment – time dilation in accordance with Lorentz transformations +Hughes–Drever experiment – testing isotropy of space and mass +Modern searches for Lorentz violation – various modern tests +Experiments to test emission theory demonstrated that the speed of light is independent of the speed of the emitter. +Experiments to test the aether drag hypothesis – no "aether flow obstruction". 
+See also +People +Arnold Sommerfeld +Hermann Minkowski +Max Born +Max Planck +Max von Laue +Mileva Marić +Relativity +Bondi k-calculus +Doubly special relativity +Einstein synchronisation +History of special relativity +Relativity priority dispute +Rietdijk–Putnam argument +Special relativity (alternative formulations) +Physics +Born coordinates +Born rigidity +Einstein's thought experiments +Lorentz ether theory +Moving magnet and conductor problem +physical cosmology +Relativistic disk +Relativistic Euler equations +Relativistic heat conduction +Shape waves +Mathematics +Lorentz group +Relativity in the APS formalism +Philosophy +actualism +conventionalism +Paradoxes +Bell's spaceship paradox +Ehrenfest paradox +Lighthouse paradox +Velocity composition paradox +Notes + The refractive index dependence of the presumed partial aether-drag was eventually confirmed by Pieter Zeeman in 1914–1915, long after special relativity had been accepted by the mainstream. Using a scaled-up version of Michelson's apparatus connected directly to Amsterdam's main water conduit, Zeeman was able to perform extended measurements using monochromatic light ranging from violet (4358 Å) through red (6870 Å).[p 11][p 12] + diff --git a/evals/testdata/standard_model.txt b/evals/testdata/standard_model.txt new file mode 100644 index 0000000..0d014d7 --- /dev/null +++ b/evals/testdata/standard_model.txt @@ -0,0 +1,851 @@ +Text from https://en.wikipedia.org/wiki/Standard_Model is licensed under Creative Commons Attribution-ShareAlike 4.0 License; (https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License) +Standard Model + +Article +Talk +Read +Edit +View history + +Tools +Appearance hide +Text + +Small + +Standard + +Large +Width + +Standard + +Wide +Color (beta) + +Automatic + +Light + +Dark +From Wikipedia, the free encyclopedia +This article is about a non-mathematical general overview of the Standard Model of particle physics. 
For a mathematical description, see Mathematical formulation of the Standard Model. For other uses, see Standard model (disambiguation). +Standard Model of particle physics + +Elementary particles of the Standard Model +Background +Constituents +Limitations +Scientists +vte +The Standard Model of particle physics is the theory describing three of the four known fundamental forces (electromagnetic, weak and strong interactions – excluding gravity) in the universe and classifying all known elementary particles. It was developed in stages throughout the latter half of the 20th century, through the work of many scientists worldwide,[1] with the current formulation being finalized in the mid-1970s upon experimental confirmation of the existence of quarks. Since then, proof of the top quark (1995), the tau neutrino (2000), and the Higgs boson (2012) have added further credence to the Standard Model. In addition, the Standard Model has predicted with great accuracy the various properties of weak neutral currents and the W and Z bosons. + +Although the Standard Model is believed to be theoretically self-consistent[note 1] and has demonstrated some success in providing experimental predictions, it leaves some physical phenomena unexplained and so falls short of being a complete theory of fundamental interactions.[3] For example, it does not fully explain why there is more matter than anti-matter, incorporate the full theory of gravitation[4] as described by general relativity, or account for the universe's accelerating expansion as possibly described by dark energy. The model does not contain any viable dark matter particle that possesses all of the required properties deduced from observational cosmology. It also does not incorporate neutrino oscillations and their non-zero masses. + +The development of the Standard Model was driven by theoretical and experimental particle physicists alike. 
The Standard Model is a paradigm of a quantum field theory for theorists, exhibiting a wide range of phenomena, including spontaneous symmetry breaking, anomalies, and non-perturbative behavior. It is used as a basis for building more exotic models that incorporate hypothetical particles, extra dimensions, and elaborate symmetries (such as supersymmetry) to explain experimental results at variance with the Standard Model, such as the existence of dark matter and neutrino oscillations. + +Historical background +See also: History of quantum field theory, History of subatomic physics, Julian Schwinger, and John Clive Ward +In 1928, Paul Dirac introduced the Dirac equation, which implied the existence of antimatter.[5] In 1954, Yang Chen-Ning and Robert Mills extended the concept of gauge theory for abelian groups, e.g. quantum electrodynamics, to nonabelian groups to provide an explanation for strong interactions.[6] In 1957, Chien-Shiung Wu demonstrated parity was not conserved in the weak interaction.[7] In 1961, Sheldon Glashow combined the electromagnetic and weak interactions.[8] In 1964, Murray Gell-Mann and George Zweig introduced quarks and that same year Oscar W. Greenberg implicitly introduced color charge of quarks.[9] In 1967 Steven Weinberg[10] and Abdus Salam[11] incorporated the Higgs mechanism[12][13][14] into Glashow's electroweak interaction, giving it its modern form. + +In 1970, Sheldon Glashow, John Iliopoulos, and Luciano Maiani introduced the GIM mechanism, predicting the charm quark.[15] In 1973 Gross and Wilczek and Politzer independently discovered that non-Abelian gauge theories, like the color theory of the strong force, have asymptotic freedom.[15] In 1976, Martin Perl discovered the tau lepton at the SLAC.[16][17] In 1977, a team led by Leon Lederman at Fermilab discovered the bottom quark.[18] + +The Higgs mechanism is believed to give rise to the masses of all the elementary particles in the Standard Model. 
This includes the masses of the W and Z bosons, and the masses of the fermions, i.e. the quarks and leptons. + +After the neutral weak currents caused by Z boson exchange were discovered at CERN in 1973,[19][20][21][22] the electroweak theory became widely accepted and Glashow, Salam, and Weinberg shared the 1979 Nobel Prize in Physics for discovering it. The W± and Z0 bosons were discovered experimentally in 1983; and the ratio of their masses was found to be as the Standard Model predicted.[23] + +The theory of the strong interaction (i.e. quantum chromodynamics, QCD), to which many contributed, acquired its modern form in 1973–74 when asymptotic freedom was proposed[24][25] (a development that made QCD the main focus of theoretical research)[26] and experiments confirmed that the hadrons were composed of fractionally charged quarks.[27][28] + +The term "Standard Model" was introduced by Abraham Pais and Sam Treiman in 1975,[29] with reference to the electroweak theory with four quarks.[30] Steven Weinberg has since claimed priority, explaining that he chose the term Standard Model out of a sense of modesty[a][31][32][better source needed] and used it in 1973 during a talk in Aix-en-Provence in France.[33] + +Particle content +The Standard Model includes members of several classes of elementary particles, which in turn can be distinguished by other characteristics, such as color charge. 
+ +All particles can be summarized as follows: + +vte +Elementary particles +Elementary fermions +Half-integer spin +Obey the Fermi–Dirac statistics +Elementary bosons +Integer spin +Obey the Bose–Einstein statistics +Quarks and antiquarks +Spin = ⁠ +1 +/ +2 +⁠ +Fractional electric charge +Have color charge +Participate in both strong interactions +and in electroweak interactions +Leptons and antileptons +Spin = ⁠ +1 +/ +2 +⁠ +Integer electric charge +No color charge +Participate in Electroweak interactions +Gauge bosons +Spin = 1 +Force carriers +Scalar bosons +Spin = 0 +Three generations +Up (u),Down (d) +Charm (c),Strange (s) +Top (t),Bottom (b) +Three generations +Electron (e− +), [†] Electron neutrino (ν +e) +Muon (μ− +),Muon neutrino (ν +μ) +Tau (τ− +),Tau neutrino (ν +τ) +Three kinds +Photon +(γ; electromagnetic interaction) +W and Z bosons +(W+ +, W− +, Z0 +; weak interaction) +Eight types of gluons +(g; strong interaction) +One kind + +Higgs boson (H0 +) +Notes: +[†] An anti-electron (e+ +) is conventionally called a "positron". + +Fermions +The Standard Model includes 12 elementary particles of spin ⁠ +1 +/ +2 +⁠, known as fermions.[34] Fermions respect the Pauli exclusion principle, meaning that two identical fermions cannot simultaneously occupy the same quantum state in the same atom.[35] Each fermion has a corresponding antiparticle, which are particles that have corresponding properties with the exception of opposite charges.[36] Fermions are classified based on how they interact, which is determined by the charges they carry, into two groups: quarks and leptons. Within each group, pairs of particles that exhibit similar physical behaviors are then grouped into generations (see the table). Each member of a generation has a greater mass than the corresponding particle of generations prior. 
Thus, there are three generations of quarks and leptons.[37] As first-generation particles do not decay, they comprise all of ordinary (baryonic) matter.[38] Specifically, all atoms consist of electrons orbiting around the atomic nucleus, ultimately constituted of up and down quarks. On the other hand, second- and third-generation charged particles decay with very short half-lives and can only be observed in high-energy environments. Neutrinos of all generations also do not decay, and pervade the universe, but rarely interact with baryonic matter. + +There are six quarks: up, down, charm, strange, top, and bottom.[34][37] Quarks carry color charge, and hence interact via the strong interaction. The color confinement phenomenon results in quarks being strongly bound together such that they form color-neutral composite particles called hadrons; quarks cannot individually exist and must always bind with other quarks. Hadrons can contain either a quark-antiquark pair (mesons) or three quarks (baryons).[39] The lightest baryons are the nucleons: the proton and neutron. Quarks also carry electric charge and weak isospin, and thus interact with other fermions through electromagnetism and weak interaction. The six leptons consist of the electron, electron neutrino, muon, muon neutrino, tau, and tau neutrino. The leptons do not carry color charge, and do not respond to strong interaction. The charged leptons carry an electric charge of −1 e, while the three neutrinos carry zero electric charge. Thus, the neutrinos' motions are influenced by only the weak interaction and gravity, making them difficult to observe. + +Gauge bosons + +Interactions in the Standard Model. All Feynman diagrams in the model are built from combinations of these vertices. q  is any quark,  g  is a gluon, X represents any electrically charged particle,  γ  is a photon,   f   is any fermion, m is any particle with mass (with the possible exception of some neutrinos);  mB  is any massive boson. 
In diagrams with multiple particle labels separated by '/', one particle label is chosen. However, in those diagrams with particle labels separated by '|', the labels must be chosen in the same left-to-right order. For example, in the four boson electroweak case the valid diagrams are WWWW, WWZZ, WWγγ, WWZγ. The conjugate of each listed vertex (reversing the direction of arrows) is also allowed.[40] +The Standard Model includes 4 kinds of gauge bosons of spin 1,[34] with bosons being quantum particles containing an integer spin. The gauge bosons are defined as force carriers, as they are responsible for mediating the fundamental interactions. The Standard Model explains the four fundamental forces as arising from the interactions, with fermions exchanging virtual force carrier particles, thus mediating the forces. At a macroscopic scale, this manifests as a force.[41] As a result, they do not follow the Pauli exclusion principle that constrains fermions; bosons do not have a theoretical limit on their spatial density. The types of gauge bosons are described below. + +Electromagnetism +Photons (γ) mediate the electromagnetic force, responsible for interactions between electrically charged particles. At present, the photon is the only known massless particle, and its interactions with other matter are described by the theory of quantum electrodynamics (QED). +Strong interaction +Gluons (g) mediate the strong interactions, which binds quarks to each other by influencing the color charge, with the interactions being described in the theory of quantum chromodynamics (QCD). For theoretical convenience, gluons are presumed to have no mass. There are eight distinct gluons, by color charge, with each being denoted through a color-and-anticolor combination (e.g. red–antigreen).[note 2] As gluons have an effective color charge, they also interact amongst themselves. 
+Weak interaction +The W+ +, W− +, and Z0 + gauge bosons mediate weak interactions between fermions; the W± + are responsible for radioactive decay, and Z0 + deflect neutrinos traveling through solid matter. They have large intrinsic mass, with the Z0 + having a little more mass than the W± +, and approximately the same mass as an entire atom of zirconium. Strangely, weak interactions involving the W± + only ever act on left-handed particles and right-handed antiparticles respectively, whereas interactions with the Z0 + involve both left- and right-handed particles and antiparticles.[note 3][note 4] +Gravitation +In the Standard Model gravitation is currently only approximately explained, and then only for relatively low-strength gravitational fields, as the hypothetical mediating particle graviton has been proposed and described, but never observed.[43] This is due to the incompatibility of quantum mechanics and Einstein's theory of general relativity, regarded as being the best explanation for gravity. In general relativity, gravity is explained as being the geometric curving of spacetime.[44] +The Feynman diagram calculations, which are a graphical representation of the perturbation theory approximation, invoke "force mediating particles", and when applied to analyze high-energy scattering experiments are in reasonable agreement with the data. However, perturbation theory (and with it the concept of a "force-mediating particle") fails in other situations. These include low-energy quantum chromodynamics, bound states, and solitons. The interactions between all the particles described by the Standard Model are summarized by the diagrams on the right of this section. 
+ +Higgs boson +Main article: Higgs boson +The Higgs particle is a massive scalar elementary particle theorized by Peter Higgs (and others) in 1964, when he showed that Goldstone's 1962 theorem (generic continuous symmetry, which is spontaneously broken) provides a third polarization of a massive vector field. Hence, Goldstone's original scalar doublet, the massive spin-zero particle, was proposed as the Higgs boson, and is a key building block in the Standard Model.[45] It has no intrinsic spin, and for that reason is classified as a boson with spin-0.[34] + +The Higgs boson plays a unique role in the Standard Model, by explaining why the other elementary particles, except the photon and gluon, are massive. In particular, the Higgs boson explains why the photon has no mass, while the W and Z bosons are very heavy. Elementary-particle masses and the differences between electromagnetism (mediated by the photon) and the weak force (mediated by the W and Z bosons) are critical to many aspects of the structure of microscopic (and hence macroscopic) matter. In electroweak theory, the Higgs boson generates the masses of the leptons (electron, muon, and tau) and quarks. As the Higgs boson is massive, it must interact with itself. + +Because the Higgs boson is a very massive particle and also decays almost immediately when created, only a very high-energy particle accelerator can observe and record it. Experiments to confirm and determine the nature of the Higgs boson using the Large Hadron Collider (LHC) at CERN began in early 2010 and were performed at Fermilab's Tevatron until its closure in late 2011. 
Mathematical consistency of the Standard Model requires that any mechanism capable of generating the masses of elementary particles must become visible[clarification needed] at energies above 1.4 TeV;[46] therefore, the LHC (designed to collide two 7 TeV proton beams) was built to answer the question of whether the Higgs boson actually exists.[47] + +On 4 July 2012, two of the experiments at the LHC (ATLAS and CMS) both reported independently that they had found a new particle with a mass of about 125 GeV/c2 (about 133 proton masses, on the order of 10−25 kg), which is "consistent with the Higgs boson".[48][49] On 13 March 2013, it was confirmed to be the searched-for Higgs boson.[50][51] + +Theoretical aspects +Main article: Mathematical formulation of the Standard Model +Construction of the Standard Model Lagrangian +Parameters of the Standard Model +Technically, quantum field theory provides the mathematical framework for the Standard Model, in which a Lagrangian controls the dynamics and kinematics of the theory. Each kind of particle is described in terms of a dynamical field that pervades space-time.[52] The construction of the Standard Model proceeds following the modern method of constructing most field theories: by first postulating a set of symmetries of the system, and then by writing down the most general renormalizable Lagrangian from its particle (field) content that observes these symmetries. + +The global Poincaré symmetry is postulated for all relativistic quantum field theories. It consists of the familiar translational symmetry, rotational symmetry and the inertial reference frame invariance central to the theory of special relativity. The local SU(3) × SU(2) × U(1) gauge symmetry is an internal symmetry that essentially defines the Standard Model. Roughly, the three factors of the gauge symmetry give rise to the three fundamental interactions. 
The fields fall into different representations of the various symmetry groups of the Standard Model (see table). Upon writing the most general Lagrangian, one finds that the dynamics depends on 19 parameters, whose numerical values are established by experiment. The parameters are summarized in the table (made visible by clicking "show") above. + +Quantum chromodynamics sector +Main article: Quantum chromodynamics +The quantum chromodynamics (QCD) sector defines the interactions between quarks and gluons, which is a Yang–Mills gauge theory with SU(3) symmetry, generated by +T +a += +λ +a +/ +2 +{\displaystyle T^{a}=\lambda ^{a}/2}. Since leptons do not interact with gluons, they are not affected by this sector. The Dirac Lagrangian of the quarks coupled to the gluon fields is given by +L +QCD += +ψ +¯ +i +γ +μ +D +μ +ψ +− +1 +4 +G +μ +ν +a +G +a +μ +ν +, +{\displaystyle {\mathcal {L}}_{\text{QCD}}={\overline {\psi }}i\gamma ^{\mu }D_{\mu }\psi -{\frac {1}{4}}G_{\mu \nu }^{a}G_{a}^{\mu \nu },}where +ψ +{\displaystyle \psi } is a three component column vector of Dirac spinors, each element of which refers to a quark field with a specific color charge (i.e. red, blue, and green) and summation over flavor (i.e. up, down, strange, etc.) is implied. + +The gauge covariant derivative of QCD is defined by +D +μ +≡ +∂ +μ +− +i +g +s +1 +2 +λ +a +G +μ +a +{\displaystyle D_{\mu }\equiv \partial _{\mu }-ig_{\text{s}}{\frac {1}{2}}\lambda ^{a}G_{\mu }^{a}}, where + +γμ are the Dirac matrices, +Ga +μ is the 8-component ( +a += +1 +, +2 +, +… +, +8 +{\displaystyle a=1,2,\dots ,8}) SU(3) gauge field, +λa + are the 3 × 3 Gell-Mann matrices, generators of the SU(3) color group, +Ga +μν represents the gluon field strength tensor, and +gs is the strong coupling constant. 
+The QCD Lagrangian is invariant under local SU(3) gauge transformations; i.e., transformations of the form +ψ +→ +ψ +′ += +U +ψ +{\displaystyle \psi \rightarrow \psi '=U\psi }, where +U += +e +− +i +g +s +λ +a +ϕ +a +( +x +) +{\displaystyle U=e^{-ig_{\text{s}}\lambda ^{a}\phi ^{a}(x)}} is 3 × 3 unitary matrix with determinant 1, making it a member of the group SU(3), and +ϕ +a +( +x +) +{\displaystyle \phi ^{a}(x)} is an arbitrary function of spacetime. + +Electroweak sector +Main article: Electroweak interaction +The electroweak sector is a Yang–Mills gauge theory with the symmetry group U(1) × SU(2)L, +L +EW += +Q +¯ +L +j +i +γ +μ +D +μ +Q +L +j ++ +u +¯ +R +j +i +γ +μ +D +μ +u +R +j ++ +d +¯ +R +j +i +γ +μ +D +μ +d +R +j ++ +ℓ +¯ +L +j +i +γ +μ +D +μ +ℓ +L +j ++ +e +¯ +R +j +i +γ +μ +D +μ +e +R +j +− +1 +4 +W +a +μ +ν +W +μ +ν +a +− +1 +4 +B +μ +ν +B +μ +ν +, +{\displaystyle {\mathcal {L}}_{\text{EW}}={\overline {Q}}_{{\text{L}}j}i\gamma ^{\mu }D_{\mu }Q_{{\text{L}}j}+{\overline {u}}_{{\text{R}}j}i\gamma ^{\mu }D_{\mu }u_{{\text{R}}j}+{\overline {d}}_{{\text{R}}j}i\gamma ^{\mu }D_{\mu }d_{{\text{R}}j}+{\overline {\ell }}_{{\text{L}}j}i\gamma ^{\mu }D_{\mu }\ell _{{\text{L}}j}+{\overline {e}}_{{\text{R}}j}i\gamma ^{\mu }D_{\mu }e_{{\text{R}}j}-{\tfrac {1}{4}}W_{a}^{\mu \nu }W_{\mu \nu }^{a}-{\tfrac {1}{4}}B^{\mu \nu }B_{\mu \nu },}where the subscript +j +{\displaystyle j} sums over the three generations of fermions; +Q +L +, +u +R +{\displaystyle Q_{\text{L}},u_{\text{R}}}, and +d +R +{\displaystyle d_{\text{R}}} are the left-handed doublet, right-handed singlet up type, and right handed singlet down type quark fields; and +ℓ +L +{\displaystyle \ell _{\text{L}}} and +e +R +{\displaystyle e_{\text{R}}} are the left-handed doublet and right-handed singlet lepton fields. 
+ +The electroweak gauge covariant derivative is defined as +D +μ +≡ +∂ +μ +− +i +g +′ +1 +2 +Y +W +B +μ +− +i +g +1 +2 +τ +→ +L +W +→ +μ +{\displaystyle D_{\mu }\equiv \partial _{\mu }-ig'{\tfrac {1}{2}}Y_{\text{W}}B_{\mu }-ig{\tfrac {1}{2}}{\vec {\tau }}_{\text{L}}{\vec {W}}_{\mu }}, where + +Bμ is the U(1) gauge field, +YW is the weak hypercharge – the generator of the U(1) group, +W→μ is the 3-component SU(2) gauge field, +→ +τ +L are the Pauli matrices – infinitesimal generators of the SU(2) group – with subscript L to indicate that they only act on left-chiral fermions, +g' and g are the U(1) and SU(2) coupling constants respectively, +W +a +μ +ν +{\displaystyle W^{a\mu \nu }} ( +a += +1 +, +2 +, +3 +{\displaystyle a=1,2,3}) and +B +μ +ν +{\displaystyle B^{\mu \nu }} are the field strength tensors for the weak isospin and weak hypercharge fields. +Notice that the addition of fermion mass terms into the electroweak Lagrangian is forbidden, since terms of the form +m +ψ +¯ +ψ +{\displaystyle m{\overline {\psi }}\psi } do not respect U(1) × SU(2)L gauge invariance. Neither is it possible to add explicit mass terms for the U(1) and SU(2) gauge fields. The Higgs mechanism is responsible for the generation of the gauge boson masses, and the fermion masses result from Yukawa-type interactions with the Higgs field. + +Higgs sector +Main article: Higgs mechanism +In the Standard Model, the Higgs field is an SU(2)L doublet of complex scalar fields with four degrees of freedom: +φ += +( +φ ++ +φ +0 +) += +1 +2 +( +φ +1 ++ +i +φ +2 +φ +3 ++ +i +φ +4 +) +, +{\displaystyle \varphi ={\begin{pmatrix}\varphi ^{+}\\\varphi ^{0}\end{pmatrix}}={\frac {1}{\sqrt {2}}}{\begin{pmatrix}\varphi _{1}+i\varphi _{2}\\\varphi _{3}+i\varphi _{4}\end{pmatrix}},}where the superscripts + and 0 indicate the electric charge +Q +{\displaystyle Q} of the components. The weak hypercharge +Y +W +{\displaystyle Y_{\text{W}}} of both components is 1. 
Before symmetry breaking, the Higgs Lagrangian is +L +H += +( +D +μ +φ +) +† +( +D +μ +φ +) +− +V +( +φ +) +, +{\displaystyle {\mathcal {L}}_{\text{H}}=\left(D_{\mu }\varphi \right)^{\dagger }\left(D^{\mu }\varphi \right)-V(\varphi ),}where +D +μ +{\displaystyle D_{\mu }} is the electroweak gauge covariant derivative defined above and +V +( +φ +) +{\displaystyle V(\varphi )} is the potential of the Higgs field. The square of the covariant derivative leads to three and four point interactions between the electroweak gauge fields +W +μ +a +{\displaystyle W_{\mu }^{a}} and +B +μ +{\displaystyle B_{\mu }} and the scalar field +φ +{\displaystyle \varphi }. The scalar potential is given by +V +( +φ +) += +− +μ +2 +φ +† +φ ++ +λ +( +φ +† +φ +) +2 +, +{\displaystyle V(\varphi )=-\mu ^{2}\varphi ^{\dagger }\varphi +\lambda \left(\varphi ^{\dagger }\varphi \right)^{2},}where +μ +2 +> +0 +{\displaystyle \mu ^{2}>0}, so that +φ +{\displaystyle \varphi } acquires a non-zero Vacuum expectation value, which generates masses for the Electroweak gauge fields (the Higgs mechanism), and +λ +> +0 +{\displaystyle \lambda >0}, so that the potential is bounded from below. The quartic term describes self-interactions of the scalar field +φ +{\displaystyle \varphi }. + +The minimum of the potential is degenerate with an infinite number of equivalent ground state solutions, which occurs when +φ +† +φ += +μ +2 +2 +λ +{\displaystyle \varphi ^{\dagger }\varphi ={\tfrac {\mu ^{2}}{2\lambda }}}. It is possible to perform a gauge transformation on +φ +{\displaystyle \varphi } such that the ground state is transformed to a basis where +φ +1 += +φ +2 += +φ +4 += +0 +{\displaystyle \varphi _{1}=\varphi _{2}=\varphi _{4}=0} and +φ +3 += +μ +λ +≡ +v +{\displaystyle \varphi _{3}={\tfrac {\mu }{\sqrt {\lambda }}}\equiv v}. This breaks the symmetry of the ground state. 
The expectation value of +φ +{\displaystyle \varphi } now becomes +⟨ +φ +⟩ += +1 +2 +( +0 +v +) +, +{\displaystyle \langle \varphi \rangle ={\frac {1}{\sqrt {2}}}{\begin{pmatrix}0\\v\end{pmatrix}},}where +v +{\displaystyle v} has units of mass and sets the scale of electroweak physics. This is the only dimensional parameter of the Standard Model and has a measured value of ~246 GeV/c2. + +After symmetry breaking, the masses of the W and Z are given by +m +W += +1 +2 +g +v +{\displaystyle m_{\text{W}}={\frac {1}{2}}gv} and +m +Z += +1 +2 +g +2 ++ +g +′ +2 +v +{\displaystyle m_{\text{Z}}={\frac {1}{2}}{\sqrt {g^{2}+g'^{2}}}v}, which can be viewed as predictions of the theory. The photon remains massless. The mass of the Higgs boson is +m +H += +2 +μ +2 += +2 +λ +v +{\displaystyle m_{\text{H}}={\sqrt {2\mu ^{2}}}={\sqrt {2\lambda }}v}. Since +μ +{\displaystyle \mu } and +λ +{\displaystyle \lambda } are free parameters, the Higgs's mass could not be predicted beforehand and had to be determined experimentally. + +Yukawa sector +The Yukawa interaction terms are: +L +Yukawa += +( +Y +u +) +m +n +( +Q +¯ +L +) +m +φ +~ +( +u +R +) +n ++ +( +Y +d +) +m +n +( +Q +¯ +L +) +m +φ +( +d +R +) +n ++ +( +Y +e +) +m +n +( +ℓ +¯ +L +) +m +φ +( +e +R +) +n ++ +h +. +c +. +{\displaystyle {\mathcal {L}}_{\text{Yukawa}}=(Y_{\text{u}})_{mn}({\bar {Q}}_{\text{L}})_{m}{\tilde {\varphi }}(u_{\text{R}})_{n}+(Y_{\text{d}})_{mn}({\bar {Q}}_{\text{L}})_{m}\varphi (d_{\text{R}})_{n}+(Y_{\text{e}})_{mn}({\bar {\ell }}_{\text{L}})_{m}{\varphi }(e_{\text{R}})_{n}+\mathrm {h.c.} }where +Y +u +{\displaystyle Y_{\text{u}}}, +Y +d +{\displaystyle Y_{\text{d}}}, and +Y +e +{\displaystyle Y_{\text{e}}} are 3 × 3 matrices of Yukawa couplings, with the mn term giving the coupling of the generations m and n, and h.c. means Hermitian conjugate of preceding terms. The fields +Q +L +{\displaystyle Q_{\text{L}}} and +ℓ +L +{\displaystyle \ell _{\text{L}}} are left-handed quark and lepton doublets. 
Likewise, +u +R +, +d +R +{\displaystyle u_{\text{R}},d_{\text{R}}} and +e +R +{\displaystyle e_{\text{R}}} are right-handed up-type quark, down-type quark, and lepton singlets. Finally +φ +{\displaystyle \varphi } is the Higgs doublet and +φ +~ += +i +τ +2 +φ +∗ +{\displaystyle {\tilde {\varphi }}=i\tau _{2}\varphi ^{*}} is its charge conjugate state. + +The Yukawa terms are invariant under the SU(2)L × U(1)Y gauge symmetry of the Standard Model and generate masses for all fermions after spontaneous symmetry breaking. + +Fundamental interactions +Main article: Fundamental interaction +The Standard Model describes three of the four fundamental interactions in nature; only gravity remains unexplained. In the Standard Model, such an interaction is described as an exchange of bosons between the objects affected, such as a photon for the electromagnetic force and a gluon for the strong interaction. Those particles are called force carriers or messenger particles.[53] + +The four fundamental interactions of nature[54] +Property/Interaction Gravitation Electroweak Strong +Weak Electromagnetic Fundamental Residual +Mediating particles Not yet observed +(Graviton hypothesised) W+, W− and Z0 γ (photon) Gluons π, ρ and ω mesons +Affected particles All particles W+, W−: Left-handed fermions; Z0: All fermions Electrically charged Quarks, gluons Hadrons +Acts on Stress–energy tensor Flavor Electric charge Color charge +Bound states formed Planets, stars, galaxies, galaxy groups —N/a Atoms, molecules Hadrons Atomic nuclei +Strength at the scale of quarks +(relative to electromagnetism) 10−41 (predicted) 10−4 1 60 Not applicable +to quarks +Strength at the scale of +protons/neutrons +(relative to electromagnetism) 10−36 (predicted) 10−7 1 Not applicable +to hadrons 20 +icon +This section does not cite any sources. Please help improve this section by adding citations to reliable sources. Unsourced material may be challenged and removed. 
(June 2021) (Learn how and when to remove this message) +Gravity +See also: Quantum gravity and Gravity + +Fundamental Interactions of the Standard Model including the hypothetical graviton +Despite being perhaps the most familiar fundamental interaction, gravity is not described by the Standard Model, due to contradictions that arise when combining general relativity, the modern theory of gravity, and quantum mechanics.[55][56] However, gravity is so weak at microscopic scales, that it is essentially unmeasurable. The graviton is postulated to be the mediating particle, but has not yet been proved to exist.[57] + +Electromagnetism +See also: Electromagnetism and Quantum electrodynamics +Electromagnetism is the only long-range force in the Standard Model. It is mediated by photons and couples to electric charge.[58] Electromagnetism is responsible for a wide range of phenomena including atomic electron shell structure, chemical bonds, electric circuits and electronics. Electromagnetic interactions in the Standard Model are described by quantum electrodynamics. + +Weak interaction +See also: Weak interaction and Electroweak interaction +The weak interaction is responsible for various forms of particle decay, such as beta decay. It is weak and short-range, due to the fact that the weak mediating particles, W and Z bosons, have mass. W bosons have electric charge and mediate interactions that change the particle type (referred to as flavor) and charge. Interactions mediated by W bosons are charged current interactions. Z bosons are neutral and mediate neutral current interactions, which do not change particle flavor. Thus Z bosons are similar to the photon, aside from them being massive and interacting with the neutrino. The weak interaction is also the only interaction to violate parity and CP. Parity violation is maximal for charged current interactions, since the W boson interacts exclusively with left-handed fermions and right-handed antifermions. 
+ +In the Standard Model, the weak force is understood in terms of the electroweak theory, which states that the weak and electromagnetic interactions become united into a single electroweak interaction at high energies. + +Strong interaction +See also: Strong interaction, Nuclear force, and Quantum chromodynamics +The strong interaction is responsible for hadronic and nuclear binding. It is mediated by gluons, which couple to color charge. Since gluons themselves have color charge, the strong force exhibits confinement and asymptotic freedom. Confinement means that only color-neutral particles can exist in isolation, therefore quarks can only exist in hadrons and never in isolation, at low energies. Asymptotic freedom means that the strong force becomes weaker, as the energy scale increases. The strong force overpowers the electrostatic repulsion of protons and quarks in nuclei and hadrons respectively, at their respective scales. + +While quarks are bound in hadrons by the fundamental strong interaction, which is mediated by gluons, nucleons are bound by an emergent phenomenon termed the residual strong force or nuclear force. This interaction is mediated by mesons, such as the pion. The color charges inside the nucleon cancel out, meaning most of the gluon and quark fields cancel out outside of the nucleon. However, some residue is "leaked", which appears as the exchange of virtual mesons, which result in an effective attractive force between nucleons. The (fundamental) strong interaction is described by quantum chromodynamics, which is a component of the Standard Model. + +Tests and predictions +The Standard Model predicted the existence of the W and Z bosons, gluon, top quark and charm quark, and predicted many of their properties before these particles were observed. 
The predictions were experimentally confirmed with good precision.[59] + +The Standard Model also predicted the existence of the Higgs boson, which was found in 2012 at the Large Hadron Collider, the final fundamental particle predicted by the Standard Model to be experimentally confirmed.[60] + +Challenges +See also: Physics beyond the Standard Model +Unsolved problem in physics +What gives rise to the Standard Model of particle physics? +Why do particle masses and coupling constants have the values that we measure? +Why are there three generations of particles? +Why is there more matter than antimatter in the universe? +Where does dark matter fit into the model? Does it even consist of one or more new particles? +More unsolved problems in physics +Self-consistency of the Standard Model (currently formulated as a non-abelian gauge theory quantized through path-integrals) has not been mathematically proved. While regularized versions useful for approximate computations (for example lattice gauge theory) exist, it is not known whether they converge (in the sense of S-matrix elements) in the limit that the regulator is removed. A key question related to the consistency is the Yang–Mills existence and mass gap problem. + +Experiments indicate that neutrinos have mass, which the classic Standard Model did not allow.[61] To accommodate this finding, the classic Standard Model can be modified to include neutrino mass, although it is not obvious exactly how this should be done. + +If one insists on using only Standard Model particles, this can be achieved by adding a non-renormalizable interaction of leptons with the Higgs boson.[62] On a fundamental level, such an interaction emerges in the seesaw mechanism where heavy right-handed neutrinos are added to the theory. 
This is natural in the left-right symmetric extension of the Standard Model[63][64] and in certain grand unified theories.[65] As long as new physics appears below or around 1014 GeV, the neutrino masses can be of the right order of magnitude. + +Theoretical and experimental research has attempted to extend the Standard Model into a unified field theory or a theory of everything, a complete theory explaining all physical phenomena including constants. Inadequacies of the Standard Model that motivate such research include: + +The model does not explain gravitation, although physical confirmation of a theoretical particle known as a graviton would account for it to a degree. Though it addresses strong and electroweak interactions, the Standard Model does not consistently explain the canonical theory of gravitation, general relativity, in terms of quantum field theory. The reason for this is, among other things, that quantum field theories of gravity generally break down before reaching the Planck scale. As a consequence, we have no reliable theory for the very early universe. +Some physicists consider it to be ad hoc and inelegant, requiring 19 numerical constants whose values are unrelated and arbitrary.[66] Although the Standard Model, as it now stands, can explain why neutrinos have masses, the specifics of neutrino mass are still unclear. It is believed that explaining neutrino mass will require an additional 7 or 8 constants, which are also arbitrary parameters.[67] +The Higgs mechanism gives rise to the hierarchy problem if some new physics (coupled to the Higgs) is present at high energy scales. In these cases, in order for the weak scale to be much smaller than the Planck scale, severe fine tuning of the parameters is required; there are, however, other scenarios that include quantum gravity in which such fine tuning can be avoided.[68] +The model is inconsistent with the emerging Lambda-CDM model of cosmology. 
Contentions include the absence of an explanation in the Standard Model of particle physics for the observed amount of cold dark matter (CDM) and its contributions to dark energy, which are many orders of magnitude too large. It is also difficult to accommodate the observed predominance of matter over antimatter (matter/antimatter asymmetry). The isotropy and homogeneity of the visible universe over large distances seems to require a mechanism like cosmic inflation, which would also constitute an extension of the Standard Model. +Currently, no proposed theory of everything has been widely accepted or verified. + +See also +Yang–Mills theory +Fundamental interaction: +Quantum electrodynamics +Strong interaction: Color charge, Quantum chromodynamics, Quark model +Weak interaction: Electroweak interaction, Fermi's interaction, Weak hypercharge, Weak isospin +Gauge theory: Introduction to gauge theory +Generation +Higgs mechanism: Higgs boson, Alternatives to the Standard Higgs Model +Lagrangian (field theory) +Open questions: CP violation, Neutrino masses, QCD matter, Quantum triviality +Quantum field theory +Standard Model: Mathematical formulation of, Physics beyond the Standard Model +Electron electric dipole moment +Notes + There are mathematical issues regarding quantum field theories still under debate (see e.g. Landau pole), but the predictions extracted from the Standard Model by current methods applicable to current experiments are all self-consistent.[2] + Although nine color–anticolor combinations mathematically exist, gluons form color octet particles. As one color-symmetric combination is linear and forms a color singlet particles, there are eight possible gluons.[42] + The W± + carries an electric charge of  +1  and right-handed spin, or  −1 and left-handed spin. The W± + separately couple to other particles through photons and the electromagnetic interaction. 
+ The electrically neutral Z0
+ boson interacts with both left-handed and right-handed regular particles and antiparticles, although with different strengths for each combination of left- and right- and of regular particles and antiparticles (see weak charge). Photons and these three gauge bosons are grouped together as a hypothetically unified, single electroweak interaction.
+ A model is a representation of reality, whereas a theory is an explanation of reality; this Wikipedia article and some of the literature refer to the Standard Model as a theory.
+
diff --git a/evals/wheat_from_chaff_test.cc b/evals/wheat_from_chaff_test.cc
new file mode 100644
index 0000000..c7d8739
--- /dev/null
+++ b/evals/wheat_from_chaff_test.cc
@@ -0,0 +1,179 @@
+// Copyright 2026 Google LLC
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include 
+
+#include 
+#include 
+#include 
+
+#include "evals/benchmark_helper.h"
+#include "gemma/configs.h"
+#include "gemma/gemma.h"
+#include "io/io.h"
+#include "hwy/base.h"
+#include "hwy/tests/hwy_gtest.h"
+
+// This test can be run manually with the downloaded gemma weights.
+// To run the test, pass the following flags:
+// --tokenizer  --weights 
+// or just use the single-file weights file with --weights .
+// It should pass for the following models: +// Gemma2: gemma2-2b-it + +namespace gcpp { +namespace { + +static const char* kQuestions = + "From the above information, please answer the following questions: " + "What did Marcia find in the sand? " + "What is Albert's preferred holiday activity? " + "How long did it take to dig out the object from the sand? " + "What is Marcia's preferred holiday activity? " + "What made the castle turrets look like daleks? " + "Which people first proposed the quark model of hadrons, and when?"; + +// All phrases in kAnswers must appear in the response in the order given for +// the test to pass. +static const char* kAnswers[] = { + "a ship's anchor", "a dark forest", "an hour", + "enormous sand", "castles", "limpet shells", + "Murray Gell-Mann", "George Zweig", "1964"}; + +std::string LoadPromptFile(const std::string& filename) { + // If the filename is empty, return an empty string. + if (filename.empty()) { + return ""; + } + std::string path = testing::SrcDir() + + "evals/testdata/" + + filename; + return ReadFileToString(Path(path)); +} + +std::string BuildPrompt(const std::vector& files, + const std::string& suffix) { + std::string prompt; + for (const std::string& file : files) { + prompt += LoadPromptFile(file); + } + prompt += suffix; + return prompt; +} + +class GemmaTest : public ::testing::Test { + public: + // Requires argc/argv, hence do not use `SetUpTestSuite`. + static void InitEnv(int argc, char** argv) { + HWY_ASSERT(s_env == nullptr); // Should only be called once. 
+ ConsumedArgs consumed(argc, argv);
+ GemmaArgs args(argc, argv, consumed);
+ consumed.AbortIfUnconsumed();
+
+ s_env = new GemmaEnv(args);
+ const gcpp::ModelConfig& config = s_env->GetGemma()->Config();
+ fprintf(stderr, "Using %s\n", config.Specifier().c_str());
+ }
+
+ static void DeleteEnv() { delete s_env; }
+
+ protected:
+ std::string GemmaReply(const std::string& input,
+ AttentionImpl attention_mode) {
+ HWY_ASSERT(s_env); // must have called InitEnv()
+ s_env->SetMaxGeneratedTokens(256);
+ s_env->MutableConfig().attention_impl = attention_mode;
+ s_env->MutableConfig().temperature = 0.0f; // deterministic
+ s_env->MutableConfig().verbosity = 1;
+ // Always use turn structure (WrapAndTokenize).
+ auto response = s_env->QueryModel(input);
+ return response.response.substr(response.response_start_pos);
+ }
+
+ // Checks that the response contains the expected answer substrings in the
+ // expected order. Testing against a few keywords is more robust than checking
+ // the whole string.
+ void TestExpectations(const std::string& response) {
+ fprintf(stderr, "Response: '%s'\n", response.c_str());
+ size_t pos = 0;
+ for (const char* answer : kAnswers) {
+ auto found = response.find(answer, pos);
+ EXPECT_NE(found, std::string::npos)
+ << "Response does not contain " << answer;
+ if (found != std::string::npos) {
+ pos = found + strlen(answer);
+ }
+ }
+ s_env->PrintProfileResults();
+ }
+
+ // Shared state. Requires argc/argv, so construct in main via InitEnv.
+ // Note that the style guide forbids non-local static variables with dtors.
+ static GemmaEnv* s_env;
+};
+
+GemmaEnv* GemmaTest::s_env = nullptr;
+
+// Tests whether Gemma can find the right answer in varying levels of
+// background information, ranging from the bare facts to outright distraction.
+TEST_F(GemmaTest, WheatFromChaff) { + const AttentionImpl modes[] = {AttentionImpl::kOld, AttentionImpl::kFlash}; + + fprintf(stderr, "Warmup, mode %s\n", GetAttentionImplName(modes[0]).c_str()); + auto prompt = BuildPrompt({"quark_1.txt", "holiday_story.txt"}, kQuestions); + auto response = GemmaReply(prompt, modes[0]); + TestExpectations(response); + for (const AttentionImpl mode : modes) { + const std::string mode_name = GetAttentionImplName(mode); + fprintf(stderr, "\nTesting quark_1 prompt, mode %s\n", mode_name.c_str()); + prompt = BuildPrompt({"holiday_story.txt", "quark_1.txt"}, kQuestions); + response = GemmaReply(prompt, mode); + TestExpectations(response); + fprintf(stderr, "\nTesting quark_2 prompt, mode %s\n", mode_name.c_str()); + prompt = BuildPrompt({"holiday_story.txt", "quark_2.txt"}, kQuestions); + response = GemmaReply(prompt, mode); + TestExpectations(response); + fprintf(stderr, "\nTesting standard_model prompt, mode %s\n", + mode_name.c_str()); + prompt = BuildPrompt( + {"holiday_story.txt", "quark_2.txt", "standard_model.txt"}, kQuestions); + response = GemmaReply(prompt, mode); + TestExpectations(response); + if (s_env->MutableKVCache().SeqLen() > 38000) { + fprintf(stderr, "\nTesting special_relativity, mode %s\n", + mode_name.c_str()); + prompt = BuildPrompt( + {"holiday_story.txt", "quark_2.txt", "special_relativity.txt"}, + kQuestions); + } else { + fprintf(stderr, "\nSkipping special_relativity, mode %s\n", + mode_name.c_str()); + prompt = BuildPrompt({"quark_1.txt", "holiday_story.txt"}, kQuestions); + } + response = GemmaReply(prompt, mode); + TestExpectations(response); + } +} + +} // namespace +} // namespace gcpp + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + gcpp::GemmaTest::InitEnv(argc, argv); + int ret = RUN_ALL_TESTS(); + gcpp::GemmaTest::DeleteEnv(); + return ret; +} diff --git a/gemma/activations.h b/gemma/activations.h index c1b943e..0e18875 100644 --- a/gemma/activations.h +++ 
b/gemma/activations.h @@ -24,6 +24,7 @@ #include #include "gemma/configs.h" // ModelConfig +#include "gemma/flash_structs.h" #include "gemma/gemma_args.h" // AttentionImpl #include "gemma/kv_cache.h" #include "gemma/tensor_stats.h" @@ -52,10 +53,13 @@ struct AttentionActivations { AttentionActivations( const ModelConfig& config, const LayerConfig& layer_config, size_t batch_size, size_t seq_len, const RuntimeConfig& runtime_config, - const Allocator& allocator, + size_t max_workers, const Allocator& allocator, std::vector>& row_ptrs) - : // `vocab_size == 0` means it is for Vit part, VitAttention is still - // MHA and does not use an external KV cache. + : rep_factor(max_workers * + AttentionActivations::kThreadReplicationFactor / + // `vocab_size == 0` means it is for Vit part, VitAttention + // is still MHA and does not use an external KV cache. + layer_config.heads), q(MatFactory("q", batch_size, config.vocab_size == 0 ? layer_config.heads * 3 * layer_config.qkv_dim @@ -76,11 +80,19 @@ struct AttentionActivations { vit_C(MatFactory("C2", batch_size, seq_len, allocator)), pre_att_rms_out(MatFactory("pre_att_rms_out", batch_size, config.model_dim, allocator)), - att(MatFactory("att", batch_size, layer_config.heads * seq_len, - allocator)), + // att is only valid for AttentionImpl::kOld. + att(MatFactory( + "att", batch_size, + layer_config.heads * + (runtime_config.attention_impl == AttentionImpl::kOld ? seq_len + : 1), + allocator)), att_out(MatFactory("att_out", batch_size, layer_config.heads * layer_config.qkv_dim, allocator)), + att_out_reps(MatFactory("att_out", batch_size * rep_factor, + layer_config.heads * layer_config.qkv_dim, + allocator)), softmax_max(MatFactory("softmax_max", batch_size, layer_config.heads, allocator)), softmax_d( @@ -102,6 +114,11 @@ struct AttentionActivations { } return; } + // This is a guess at the maximum number of params we might need to avoid + // reallocations. 
The actual number of params is determined by the number of + // query tiles, which is not known here. + flash_params.reserve(batch_size * layer_config.heads); + split_flash_params.reserve(batch_size * layer_config.heads); // For MatMul outputs, precompute their row pointers. // If we forget any MatMul outputs here, debug builds print a warning but @@ -125,6 +142,7 @@ struct AttentionActivations { pre_att_rms_out.OverrideRows(batch_size); att.OverrideRows(batch_size); att_out.OverrideRows(batch_size); + att_out_reps.OverrideRows(batch_size * rep_factor); softmax_max.OverrideRows(batch_size); softmax_d.OverrideRows(batch_size); att_sums.OverrideRows(batch_size); @@ -132,6 +150,15 @@ struct AttentionActivations { // `inv_timescale*` are not batched. } + // Maximum factor by which we might scale-up work to maximize parallelism. + size_t rep_factor = 1; + // Parameters for flash attention. The size of the vector is somewhere between + // the number of query rows and 1/8th of that. + std::vector flash_params; + // Parameters for flash attention, split by k-position. May be significantly + // larger than flash_params in decode mode, when the number of query rows is + // small. + std::vector split_flash_params; MatStorageT q; // query MatStorageT q_bf; MatStorageT q_T; // Transposed to maximize attention speed. @@ -143,6 +170,7 @@ struct AttentionActivations { MatStorageT pre_att_rms_out; MatStorageT att; // attention vector MatStorageT att_out; // attention output + MatStorageT att_out_reps; // attention output for each thread. MatStorageT softmax_max; // see OnlineSoftmaxState MatStorageT softmax_d; // see OnlineSoftmaxState // Accumulation of attention outputs over heads @@ -151,19 +179,27 @@ struct AttentionActivations { // Rope MatStorageT inv_timescale; MatStorageT inv_timescale_global; + // Replication factor to help evenly share work over threads. + static constexpr size_t kThreadReplicationFactor = 4; }; // A non-owning view of AttentionActivations. 
struct AttentionActivationsPtrs { - AttentionActivationsPtrs(const ModelConfig& config, size_t seq_len) + AttentionActivationsPtrs( + const ModelConfig& config, size_t seq_len, + std::vector& flash_params, + std::vector& split_flash_params) : config(config), + flash_params(flash_params), + split_flash_params(split_flash_params), div_seq_len(static_cast(seq_len)), div_heads(static_cast(config.layer_configs[0].heads)), query_scale(ChooseQueryScale(config)) {} AttentionActivationsPtrs(const ModelConfig& config, size_t seq_len, - const AttentionActivations& activations) - : AttentionActivationsPtrs(config, seq_len) { + AttentionActivations& activations) + : AttentionActivationsPtrs(config, seq_len, activations.flash_params, + activations.split_flash_params) { q = activations.q; q_bf = activations.q_bf; q_T = activations.q_T; @@ -173,6 +209,7 @@ struct AttentionActivationsPtrs { pre_att_rms_out = activations.pre_att_rms_out; att = activations.att; att_out = activations.att_out; + att_out_reps = activations.att_out_reps; softmax_max = activations.softmax_max; softmax_d = activations.softmax_d; att_sums = activations.att_sums; @@ -203,6 +240,9 @@ struct AttentionActivationsPtrs { } const ModelConfig& config; + // Parameters for flash attention. + std::vector& flash_params; + std::vector& split_flash_params; // For the matrices below, the batch_size dimension is really qbatch.Size() * // token_batch_size, but in all known uses, one of those is 1. Specifically, @@ -228,6 +268,7 @@ struct AttentionActivationsPtrs { // Attention output computed from att * V, size batch_size x (q_heads * // qkv_dim). MatPtrT att_out; + MatPtrT att_out_reps; // The maximum logit value encountered when computing att_out from att, // size batch_size x q_heads . See OnlineSoftmaxState for details. // WARNING: Only filled in for AttentionImpl::kOld. 
@@ -282,7 +323,8 @@ struct Activations { s_w_linear_w(config.num_layers, max_workers), attention_impl(runtime_config.attention_impl), attention_storage(config, layer_config, batch_size, seq_len, - runtime_config, ctx.allocator, row_ptrs), + runtime_config, ctx.pools.MaxWorkers(), ctx.allocator, + row_ptrs), attention(config, seq_len, attention_storage) { HWY_ASSERT(batch_size != 0); diff --git a/gemma/attention.cc b/gemma/attention.cc index b5dc5e1..570c4f4 100644 --- a/gemma/attention.cc +++ b/gemma/attention.cc @@ -49,6 +49,39 @@ HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { +// Returns the number of floats per vector (aka NF). +size_t FloatsPerVector() { + using DF = hn::ScalableTag; + const DF df; + return hn::Lanes(df); +} + +// The k-cache and v-cache are setup without knowing NF. So if it hasn't been +// done already, reshape it to take NF into account. +void MaybeReshapeCache(const MatPtrT& kv, MatPtrT& cache) { + if (kv.Cols() > cache.Cols()) { + cache.ReshapePackedRowsToCols(2 * FloatsPerVector()); + } +} + +// Transposes a single row of the kv cache into the k-cache and v-cache. +void TransposeKVCacheRow(const KV_t* HWY_RESTRICT kv, KV_t* HWY_RESTRICT k, + KV_t* HWY_RESTRICT v, size_t qkv_dim) { + // This is inefficient, as the writes are scattered over cache lines, but it + // is a tiny fraction of the overall computation, and it is linear in the + // token length. + const size_t kFloatsPerTile = 2 * FloatsPerVector(); + for (size_t i = 0; i < qkv_dim; i += 2) { + k[i * kFloatsPerTile] = kv[i]; + k[i * kFloatsPerTile + 1] = kv[i + 1]; + } + for (size_t i = 0; i < qkv_dim; i += kFloatsPerTile) { + for (size_t j = 0; j < kFloatsPerTile; j++) { + v[i * kFloatsPerTile + j] = kv[i + j + qkv_dim]; + } + } +} + // Computes Q.K scores, which are "logits" (or scores) stored to att. // `k` is a strided view of the kv cache with dimensions [seq_len, qkv_dim]. 
static HWY_INLINE void QDotK(const size_t start_pos, const size_t last_pos, @@ -280,6 +313,11 @@ static HWY_INLINE void ComputeQKV(size_t num_tokens, const size_t layer_idx, kv_rows.AttachRowPtrs(env.row_ptrs[0].get()); CallMatMul(activations.pre_att_rms_out, layer.qkv_einsum_w2, /*add=*/nullptr, env, kv_rows); + for (size_t qi = 0; qi < qbatch.Size(); ++qi) { + MaybeReshapeCache(qbatch.KV(qi).kv_cache, qbatch.KV(qi).k_cache); + MaybeReshapeCache(qbatch.KV(qi).kv_cache, qbatch.KV(qi).v_cache); + } + const size_t kFloatsPerVector = FloatsPerVector(); // Apply positional encodings for K. // Note that 2D parallelism is not worth the fork/join overhead because the @@ -299,6 +337,26 @@ static HWY_INLINE void ComputeQKV(size_t num_tokens, const size_t layer_idx, KV_t* HWY_RESTRICT kv = kv_cache.Row(cache_pos) + layer_idx * cache_layer_size + head * qkv_dim * 2; + // Note that k_cache and v_cache are different shapes. + // The innermost dimension of k is 2 values from qkv_dim because they + // are going to be used in a BF16 dot product involving pairs of + // values over NF k positions. + // The innermost dimension of v is 2NF values from qkv_dim because they + // will be loaded into a BF16 vector to be scaled and added to the + // cached attention output in 2 NF-sized registers. + // TODO(rays): factor out these calculations into functions. 
+ auto& k_cache = qbatch.KV(qi).k_cache; + KV_t* HWY_RESTRICT k = + k_cache.Row(cache_pos / (2 * kFloatsPerVector)) + + (layer_idx * cache_layer_size + head * qkv_dim * 2) * + kFloatsPerVector + + (cache_pos % (2 * kFloatsPerVector)) * 2; + auto& v_cache = qbatch.KV(qi).v_cache; + KV_t* HWY_RESTRICT v = + v_cache.Row(cache_pos / (2 * kFloatsPerVector)) + + (layer_idx * cache_layer_size + head * qkv_dim * 2) * + kFloatsPerVector + + (cache_pos % (2 * kFloatsPerVector)) * 2 * kFloatsPerVector; HWY_ALIGN float kv_f32[2 * kMaxQKVDim]; const hn::ScalableTag df; @@ -319,13 +377,17 @@ static HWY_INLINE void ComputeQKV(size_t num_tokens, const size_t layer_idx, /*mul=*/1.0f); CompressPerThread tls; Compress(kv_f32, 2 * qkv_dim, tls, MakeSpan(kv, 2 * qkv_dim), 0); + // This is inefficient, as multiple threads are writing the same K + // cache line, but the input is generated by a matmul, so it is + // difficult to change, and it probably isn't significant. + TransposeKVCacheRow(kv, k, v, qkv_dim); }); } void GemmaAttention(size_t num_tokens, const size_t layer_idx, const LayerWeightsPtrs& layer, AttentionActivationsPtrs& activations, QBatch& qbatch, - MatMulEnv& env, int flags) { + MatMulEnv& env, AttentionImpl attention_impl, int flags) { GCPP_ZONE(env.ctx, hwy::Profiler::GlobalIdx(), Zones::kGenAttention); const LayerConfig& layer_config = layer.layer_config; @@ -335,15 +397,16 @@ void GemmaAttention(size_t num_tokens, const size_t layer_idx, (void)layer_config; // only used in HWY_DASSERT ComputeQKV(num_tokens, layer_idx, layer, activations, qbatch, flags, env); - if (flags & kAttentionUseOld) { + if (attention_impl == AttentionImpl::kOld) { DotSoftmaxWeightedSum(num_tokens, layer_idx, layer.query_norm_scale, activations, qbatch, env.ctx); } else { // * 2 does not help on Turin. 
FlashAttention(num_tokens, - /*target_parallelism=*/env.ctx.pools.MaxWorkers() * 1, + /*target_parallelism=*/env.ctx.pools.MaxWorkers() * + AttentionActivations::kThreadReplicationFactor, layer_idx, layer.query_norm_scale, activations, qbatch, - env.ctx); + env.ctx, attention_impl); } SumHeads(layer, activations, env); } diff --git a/gemma/attention.h b/gemma/attention.h index 71411b2..14870de 100644 --- a/gemma/attention.h +++ b/gemma/attention.h @@ -31,6 +31,13 @@ namespace gcpp { // Passed to HWY_VISIT_TARGETS; declares for one target. #define GEMMA_DECL_ATTENTION(TARGET, NAMESPACE) \ namespace NAMESPACE { \ + size_t FloatsPerVector(); \ + \ + void MaybeReshapeCache(const MatPtrT& kv, MatPtrT& cache); \ + \ + void TransposeKVCacheRow(const KV_t* HWY_RESTRICT kv, KV_t* HWY_RESTRICT k, \ + KV_t* HWY_RESTRICT v, size_t qkv_dim); \ + \ void PositionalEncodingQK(float* qk, size_t layer_idx, \ const AttentionActivationsPtrs& activations, \ ThreadingContext& ctx, size_t worker, size_t pos, \ @@ -53,7 +60,8 @@ namespace gcpp { void GemmaAttention(size_t num_tokens, const size_t layer_idx, \ const LayerWeightsPtrs& layer, \ AttentionActivationsPtrs& activations, QBatch& qbatch, \ - MatMulEnv& env, int flags); \ + MatMulEnv& env, AttentionImpl attention_impl, \ + int flags); \ /* NOLINTNEXTLINE(google-readability-namespace-comments) */ \ } // namespace NAMESPACE diff --git a/gemma/attention_test.cc b/gemma/attention_test.cc index 53f1d01..46214fd 100644 --- a/gemma/attention_test.cc +++ b/gemma/attention_test.cc @@ -1,8 +1,10 @@ #include +#include #include // strcmp #include #include #include +#include #include #include "gtest/gtest.h" @@ -105,7 +107,8 @@ struct TestAttentionState { tokens(num_tokens), attention_storage_(model_state.config, model_state.layer_config, batch_size, num_tokens, runtime_config, - state.ctx.allocator, row_ptrs_), + state.ctx.pools.MaxWorkers(), state.ctx.allocator, + row_ptrs_), attention(model_state.config, num_tokens, attention_storage_) { for 
(size_t i = 0; i < qbatch_size; ++i) { kv_caches.emplace_back(model_state.config, inference_args, @@ -143,6 +146,7 @@ struct TestAttentionState { }; double GetTolerance() { + if (IsBF16()) return 1e-2; const char* target_name = hwy::TargetName(HWY_TARGET); if (strncmp(target_name, "AVX2", 4) == 0) { return 2e-2; @@ -155,6 +159,57 @@ double GetTolerance() { } } +// Comparison function for computations that used BF16, whether the result is +// stored in BF16 or F32. +// Compare with absolute tolerance for values with small magnitudes. +// Compare with relative tolerance for values with larger magnitudes. +template +bool CompareArraySimilarBF16(const T* expected, const T* actual, size_t count, + const char* target_name, const char* filename, + int line) { + constexpr double kTolerance = 3e-2; + for (size_t i = 0; i < count; ++i) { + const double exp = hwy::ConvertScalarTo(expected[i]); + const double act = hwy::ConvertScalarTo(actual[i]); + const double l1 = std::abs(act - exp); + // Cannot divide, so check absolute error. 
+ if (std::abs(exp) <= 1.0) { + if (l1 > kTolerance) { + std::string array_values = hwy::detail::FormatMismatchedArrays( + expected, actual, count, kTolerance); + HWY_WARN("%s %s:%d %s mismatch %zu of %zu: %E %E l1 %E tol %E%s\n", + target_name, filename, line, "BF16", i, count, exp, act, l1, + kTolerance, array_values.c_str()); + return false; + } + } else { // relative + const double rel = l1 / exp; + if (rel > kTolerance) { + std::string array_values = hwy::detail::FormatMismatchedArrays( + expected, actual, count, kTolerance); + HWY_WARN("%s %s:%d %s mismatch %zu of %zu: %E %E rel %E tol %E%s\n", + target_name, filename, line, "BF16", i, count, exp, act, rel, + kTolerance, array_values.c_str()); + return false; + } + } + } + return true; +} + +template +bool CompareArraySimilar(const T* expected, const T* actual, size_t count, + const char* target_name, const char* filename, + int line) { + if constexpr (IsBF16()) { + return CompareArraySimilarBF16(expected, actual, count, target_name, + filename, line); + } else { + return hwy::CompareArraySimilar(expected, actual, count, GetTolerance(), + target_name, filename, line); + } +} + template void CompareAttSumsWithGolden( const AttentionActivationsPtrs& attention, @@ -170,9 +225,9 @@ void CompareAttSumsWithGolden( for (size_t j = 0; j < kDims; ++j) { actual_row[j] = hwy::F32FromBF16(attention.att_sums.Row(i)[j]); } - EXPECT_TRUE(hwy::CompareArraySimilar( - golden[token_idx][qi], actual_row.get(), kDims, GetTolerance(), - hwy::TargetName(HWY_TARGET), __FILE__, __LINE__)) + EXPECT_TRUE(CompareArraySimilar(golden[token_idx][qi], actual_row.get(), + kDims, hwy::TargetName(HWY_TARGET), + __FILE__, __LINE__)) << "att_sums mismatch for token_idx=" << token_idx << " qi=" << qi; } } @@ -200,19 +255,20 @@ void CompareKVCacheWithGolden( for (size_t token_idx = 0; token_idx < kNumTokens; ++token_idx) { for (size_t qi = 0; qi < kQBatchSize; ++qi) { - const float* cache_row = + const BF16* cache_row = 
kv_caches[qi].kv_cache.Row(start_offset + token_idx); for (size_t j = 0; j < kDims; ++j) { - actual_k_row[j] = cache_row[kv_offset + j]; - actual_v_row[j] = cache_row[kv_offset + qkv_dim + j]; + actual_k_row[j] = hwy::ConvertScalarTo(cache_row[kv_offset + j]); + actual_v_row[j] = + hwy::ConvertScalarTo(cache_row[kv_offset + qkv_dim + j]); } - EXPECT_TRUE(hwy::CompareArraySimilar( - k_golden[token_idx][qi], actual_k_row.get(), kDims, GetTolerance(), + EXPECT_TRUE(CompareArraySimilar( + k_golden[token_idx][qi], actual_k_row.get(), kDims, hwy::TargetName(HWY_TARGET), __FILE__, __LINE__)) << "K cache mismatch for token_idx=" << token_idx << " qi=" << qi << " kv_head=" << kv_head; - EXPECT_TRUE(hwy::CompareArraySimilar( - v_golden[token_idx][qi], actual_v_row.get(), kDims, GetTolerance(), + EXPECT_TRUE(CompareArraySimilar( + v_golden[token_idx][qi], actual_v_row.get(), kDims, hwy::TargetName(HWY_TARGET), __FILE__, __LINE__)) << "V cache mismatch for token_idx=" << token_idx << " qi=" << qi << " kv_head=" << kv_head; @@ -238,8 +294,8 @@ void CompareQVecsWithGolden( for (size_t j = 0; j < kDims; ++j) { actual_q_row[j] = q_row[head_offset + j]; } - EXPECT_TRUE(hwy::CompareArraySimilar( - q_golden[token_idx][qi], actual_q_row.get(), kDims, GetTolerance(), + EXPECT_TRUE(CompareArraySimilar( + q_golden[token_idx][qi], actual_q_row.get(), kDims, hwy::TargetName(HWY_TARGET), __FILE__, __LINE__)) << "Q vec mismatch for token_idx=" << token_idx << " qi=" << qi << " q_head=" << q_head; @@ -267,42 +323,42 @@ const float kGoldenAttSums[kNumTokens][kQBatchSize][kDimsToCompare] = { 26.875, 63, 3.34375, -67.5, 31.125, -190, 125}, {-30.375, -17.875, 51.75, -78, -84, 6.40625, 15.375, 70, -22.875, 20.125, -14.9375, -109.5, 76, 9.25, -142, 29.5, -105}}, - {{-32.75, 38.25, 78.5, 107.5, 20.25, 197, -136, 42.5, -84, 25.625, 4.96875, + {{-32.75, 38.25, 78.5, 107.5, 20.25, 197, -136, 42.5, -84, 25.625, 5.35875, 128, 27.25, -161, 19.125, -58, 97.5}, {-18.5, -18, 135, -13.4375, -6.625, -45.75, 
29.625, 93, 18.625, 75.5, 102.5, -184, 52.75, 83.5, -71, 46.5, -52}}, - {{-16.375, -61.5, -58.25, -27.375, -28, 71, -109.5, 60.25, 3.125, -29.125, - 6.90625, 150, 144, -155, -47.25, -98.5, 3.5625}, - {-19, -16.75, 129, 0.59765625, -82, 123.5, 60.75, -36.75, -77, 26.625, 51, - -66.5, -0.84765625, -46.5, -152, -2.9375, -81}}, - {{3.984375, 83, -41.75, 39.5, -203, 110, -76, 131, 0.4609375, -44.5, -63.75, + {{-16.375, -61.5, -58.25, -27.375, -28, 71, -109.5, 60.25, 3.625, -29.125, + 6.4625, 150, 144, -155, -47.25, -98.5, 3.5625}, + {-19, -16.75, 129, 0.628925, -82, 123.5, 60.75, -36.75, -77, 26.625, 51, + -66.5, -0.62165625, -46.5, -152, -2.9375, -81}}, + {{3.684375, 83, -41.75, 39.5, -203, 110, -76, 131, 1.0069375, -44.5, -63.75, -46, -22, -19.375, -16.125, -148, 20.875}, - {-47, -19.5, 58, 81.5, 21.75, -30, -118, 44.25, -149, 22.5, 188, -66.5, 33, + {-47, -19.5, 58, 81.5, 23.35, -30, -118, 44.25, -149, 22.5, 188, -66.5, 33, 10.9375, -52.5, 23.25, 75}}, - {{64, -31, -89, -92.5, -11.1875, -54.75, -302, 3.453125, -108, 39.25, + {{64, -31, -89, -92.5, -11.1875, -54.75, -302, 4.213125, -108, 39.25, -34.75, 18, -52, 100, -186, -75.5, 50.75}, - {7.6875, -80, -40, 32.25, -30.25, 90, -41, 44.25, -140, -2.4375, 82.5, + {7.1875, -80, -40, 32.25, -30.25, 90, -41, 44.25, -140, -2.4375, 82.5, 39.25, 65, 47.25, -89.5, -34.25, 137}}, {{39.75, 17.875, 115, 38.75, -44, 139, -53.25, -23.875, -13.0625, 38.5, - 32.5, 53.75, 109, 4.09375, 57.5, -20.5, 132}, - {143, 249, 5.09375, 0.83984375, 27.875, -5.84375, 30.25, -101.5, 65.5, - 13.5, 195, -10.0625, 97.5, 2.203125, -97.5, -100, -19.25}}, + 32.5, 53.75, 109, 4.62375, 57.5, -20.5, 132}, + {143, 249, 4.9375, 1.33984375, 27.875, -5.84375, 30.25, -101.5, 65.5, 13.5, + 195, -10.0625, 97.5, 1.903125, -97.5, -100, -19.25}}, {{-30.125, -169, -150, 58, -35.75, 22.75, 36.5, -32.25, -8.9375, 55.25, -117, 26.375, 39.5, 125, 66, 48.75, 20.75}, - {137, 5.25, 61.25, 37, -42.75, 240, 62, -164, 11.3125, 173, 174, 23.5, + {137, 3.85, 61.25, 37, -42.75, 
240, 62, -164, 10.3125, 173, 174, 23.5, 88.5, 48.5, -46.25, -36.75, 101.5}}, - {{-103, -47.5, 39, -48, -67.5, 121, -136, 99, 80, -47.5, 107.5, 48.75, 97.5, + {{-103, -47.5, 39, -48, -67.5, 121, -136, 99, 80, -47.5, 107.5, 43.75, 97.5, 125, -53.5, -14.625, 262}, - {29.875, 7.34375, -36.75, -14.5, -27.5, 44.75, -67.5, -40.75, 71.5, 172, + {28.075, 6.64375, -36.75, -14.5, -27.5, 44.75, -67.5, -40.75, 71.5, 172, 81, -27.25, -3.03125, 111, -167, 59, 176}}, {{-37.25, 109.5, -26.125, -115.5, 108, 57.25, 1.3671875, 72, -122.5, 59.25, -52, -12.625, 43.25, 16.25, -41.75, 26.5, 70.5}, - {40.25, 53.25, -142, 78.5, 38, 4.3125, -27.75, -134, -85, 107.5, 2.5, 93.5, + {40.25, 53.25, -142, 78.5, 38, 4.625, -27.75, -134, -85, 107.5, 2.5, 93.5, 58.25, 173, -53.5, 25.125, 4.8125}}, {{-8.4375, -35, -35.5, 131, -33.25, 106, 109.5, -92, -135, 80, 21.5, -17.125, 15.25, 143, -27, 103, 101}, {-77, 40.75, -10.125, 33.25, -33, 104, -7.6875, 85.5, -40, 93, 61, 14.5625, - 8.125, -99.5, 13.6875, -11.6875, 33}}, + 8.55, -99.5, 14.6875, -11.6875, 33}}, }; // Layer 0, *K*V Head 0 @@ -538,7 +594,7 @@ void RunAttentionTest(AttentionImpl attention_impl) { GemmaAttention(attention_state.tokens.size(), 0, model_state.layer, attention_state.attention, *attention_state.qbatch, state.env, - AttentionImplToFlags(attention_impl, HWY_NATIVE_DOT_BF16)); + attention_impl, /*flags=*/0); CompareAttSumsWithGolden(attention_state.attention, kGoldenAttSums); CompareKVCacheWithGolden(model_state.config, diff --git a/gemma/configs.cc b/gemma/configs.cc index cb508e8..000e278 100644 --- a/gemma/configs.cc +++ b/gemma/configs.cc @@ -712,9 +712,21 @@ Model DeduceModel(const Path& blob_path, size_t layers, int layer_types) { } } +// Keep in sync with enum class AttentionImpl. 
+const char* kAttentionImplNames[] = { + "old", "flash", + "unknown" // keep last +}; + +std::string GetAttentionImplName(AttentionImpl impl) { + return kAttentionImplNames[static_cast(impl)]; +} + AttentionImpl GetAttentionImpl(const std::string& impl) { - if (impl == "old") return AttentionImpl::kOld; - if (impl == "flash") return AttentionImpl::kFlash; + if (impl == GetAttentionImplName(AttentionImpl::kOld)) + return AttentionImpl::kOld; + if (impl == GetAttentionImplName(AttentionImpl::kFlash)) + return AttentionImpl::kFlash; HWY_WARN("Unknown attention implementation: %s. Using kOld.\n", impl.c_str()); return AttentionImpl::kOld; } diff --git a/gemma/configs.h b/gemma/configs.h index f1bd0c5..803a48a 100644 --- a/gemma/configs.h +++ b/gemma/configs.h @@ -81,11 +81,12 @@ static inline bool EnumValid(LayerAttentionType type) { } enum class AttentionImpl { - kOld, - kFlash, + kOld, // Previous Attention implementation + kFlash, // Flash Attention (default) kSentinel, }; +std::string GetAttentionImplName(AttentionImpl impl); AttentionImpl GetAttentionImpl(const std::string& impl); /* diff --git a/gemma/flash_attention.cc b/gemma/flash_attention.cc index 488d425..b25be3c 100644 --- a/gemma/flash_attention.cc +++ b/gemma/flash_attention.cc @@ -20,9 +20,7 @@ #include #include #include -#include #include -#include #include #include "compression/types.h" // GEMMA_DISABLED_TARGETS @@ -60,43 +58,7 @@ HWY_BEFORE_NAMESPACE(); namespace gcpp { namespace HWY_NAMESPACE { -static constexpr size_t kNFx8HTileSize = 8; static constexpr float kNegInf = -std::numeric_limits::max() / 64.0f; -// Transposes q into q_t. -// Both are 4D tensors stuffed into a 2-D MatPtrT. -// q has shape [batch, qbatch][head, qkv_dim]. -// q_t has shape [qkv_dim][qbatch, head, batch] in order to make the maximum -// possible consecutive elements have the same KV. 
-static void TransposeQ(const MatPtrT& q, MatPtrT& q_t, - const size_t qbatch_size, ThreadingContext& ctx) { - // Group floats by the number of floats in a cache line. - const size_t kNF = ctx.cache_info.LineBytes() / sizeof(float); - const size_t num_heads = q.Cols() / q_t.Rows(); - const size_t batch_size = q.Rows() / qbatch_size; - const auto func = [&](const size_t task, size_t worker) HWY_ATTR { - GCPP_ZONE(ctx, worker, Zones::kFlashAttentionTransposeQ); - for (size_t lane = 0; lane < kNF; ++lane) { - size_t q_row = task * kNF + lane; - if (q_row >= q_t.Rows()) break; - BF16* HWY_RESTRICT qt_row = q_t.Row(q_row); - for (size_t qi = 0; qi < qbatch_size; ++qi) { - for (size_t h = 0; h < num_heads; ++h) { - for (size_t b = 0; b < batch_size; ++b) { - qt_row[(qi * num_heads + h) * batch_size + b] = - hwy::ConvertScalarTo( - q.Row(b * qbatch_size + qi)[h * q_t.Rows() + q_row]); - } - } - } - } - }; - { - const size_t num_tasks = hwy::DivCeil(q_t.Rows(), kNF); - // Better than kFlat. - ParallelFor(Parallelism::kHierarchical, num_tasks, ctx, - /*cluster_idx=*/0, Callers::kFlashTransposeQ, func); - } -} // Updates q in place for RMSNorm and positional encoding. void RMSNormAndPositionalEncoding(const size_t num_tokens, const QBatch& qbatch, @@ -139,292 +101,390 @@ void RMSNormAndPositionalEncoding(const size_t num_tokens, const QBatch& qbatch, } } -// Handles a single v row of flash attention for a single q.k dot product. -void HWY_INLINE SingleFlashAttentionStep(float x, float cap, float& old_max, - float& old_d, - const float* HWY_RESTRICT v, - const size_t v_cols, - float* HWY_RESTRICT att_out) { - if (cap > 0.0f) { - // Compute tanh(x / cap) * cap, being LogitsSoftCap on the scalar x. 
- x = cap * std::tanh(x / cap); - } - float m = std::max(x, old_max); - x = std::exp(x - m); - float scale = old_d * std::exp(old_max - m); - old_d = x + scale; - old_max = m; - float one_over_d = 1.0f / old_d; - scale *= one_over_d; - x *= one_over_d; - MulByConst(scale, att_out, v_cols); - MulByConstAndAdd(x, v, att_out, v_cols); -} - -// Calculates the complete attention outputs for a single row of q. -void SingleFlashAttention(const size_t start_pos, const size_t last_pos, - const BF16* HWY_RESTRICT q, const MatPtrT& k, - const MatPtrT& v, const size_t layer_idx, - const AttentionActivationsPtrs& activations, - float* HWY_RESTRICT att_out, ThreadingContext& ctx, - const size_t worker) { - GCPP_ZONE(ctx, worker, Zones::kFlashAttentionSingleFlashAttention); - const hn::ScalableTag dbf; - const size_t qkv_dim = k.Cols(); - - const size_t pos_mod = activations.div_seq_len.Remainder(start_pos); - // TODO: Mixed-mode can be further improved for Turin: we can demote right - // before we do the dot product instruction, rather than promote both to f32. - // But some potential accuracy loss there, needs evaluation first. - float m = Dot(dbf, MakeConstSpan(q, qkv_dim), 0, k.Row(pos_mod), qkv_dim); - if (float cap = activations.config.att_cap; cap > 0.0f) { - // Compute tanh(x / cap) * cap, being LogitsSoftCap on the scalar x. - m = cap * std::tanh(m / cap); - } - float d = 1.0f; - // This is just a copy of the first token. - MulByConstTo(d, v.Row(pos_mod), att_out, v.Cols(), ctx, worker); - for (size_t pos = start_pos + 1; pos <= last_pos; ++pos) { - const size_t pos_mod = activations.div_seq_len.Remainder(pos); - float x = Dot(dbf, MakeConstSpan(q, qkv_dim), 0, k.Row(pos_mod), qkv_dim); - SingleFlashAttentionStep(x, activations.config.att_cap, m, d, - v.Row(pos_mod), v.Cols(), att_out); - } -} - -// Computes and returns a single vector of NF Q.K dot products, which represents -// the dot products of NF rows of Q for a single K timestep. 
-template > -VF QDotKVector(DF df, const uint32_t* HWY_RESTRICT q_offsets, - const size_t k_pos, const MatPtrT& q, - const MatPtrT& k) { - const hn::ScalableTag dbf; - const size_t qkv_dim = k.Cols(); - - hn::TFromD results[hn::MaxLanes(df)]; - for (size_t i = 0; i < hn::Lanes(df); ++i) { - results[i] = Dot(dbf, MakeConstSpan(q.Row(0) + q_offsets[i], qkv_dim), 0, - k.Row(k_pos), qkv_dim); - } - return hn::LoadU(df, results); -} - -// Returns an NF Q rows by 8 K rows tile of Q.K dot products. -// This is the result of NF rows of Q against 8 K timesteps, with positions -// given by k_pos[0..7]. Q has been transposed so that the NF rows are read in -// consecutive elements, and other columns by adding q_stride. -template > -void QDotKTile(DF df, const BF16* HWY_RESTRICT q, const size_t q_stride, - const MatPtrT& k, const size_t* k_pos, VF& sum0, VF& sum1, - VF& sum2, VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7) { - constexpr size_t kHTileSize = kNFx8HTileSize; +// Zeroes out kVTileSize of the given vectors. 
+template > +HWY_INLINE void ZeroResults(DF df, VF& sum0, VF& HWY_MAYBE_UNUSED sum1, + VF& HWY_MAYBE_UNUSED sum2, + VF& HWY_MAYBE_UNUSED sum3, + VF& HWY_MAYBE_UNUSED sum4, + VF& HWY_MAYBE_UNUSED sum5, + VF& HWY_MAYBE_UNUSED sum6, + VF& HWY_MAYBE_UNUSED sum7) { sum0 = hn::Zero(df); - sum1 = hn::Zero(df); - sum2 = hn::Zero(df); - sum3 = hn::Zero(df); - sum4 = hn::Zero(df); - sum5 = hn::Zero(df); - sum6 = hn::Zero(df); - sum7 = hn::Zero(df); - const float* HWY_RESTRICT k_row[kHTileSize]; - for (size_t i = 0; i < kHTileSize; ++i) { - k_row[i] = k.Row(k_pos[i]); + if constexpr (kVTileSize >= 4) { + sum1 = hn::Zero(df); + sum2 = hn::Zero(df); + sum3 = hn::Zero(df); } - - const hn::Rebind dbfh; - using VBF = hn::Vec; - - for (size_t i = 0; i < k.Cols(); ++i) { - const VBF q_vec_bf = hn::Load(dbfh, q); - const VF q_vec = hn::PromoteTo(df, q_vec_bf); - VF k_0 = hn::Set(df, k_row[0][i]); - sum0 = hn::MulAdd(q_vec, k_0, sum0); - VF k_1 = hn::Set(df, k_row[1][i]); - sum1 = hn::MulAdd(q_vec, k_1, sum1); - VF k_2 = hn::Set(df, k_row[2][i]); - sum2 = hn::MulAdd(q_vec, k_2, sum2); - VF k_3 = hn::Set(df, k_row[3][i]); - sum3 = hn::MulAdd(q_vec, k_3, sum3); - VF k_4 = hn::Set(df, k_row[4][i]); - sum4 = hn::MulAdd(q_vec, k_4, sum4); - VF k_5 = hn::Set(df, k_row[5][i]); - sum5 = hn::MulAdd(q_vec, k_5, sum5); - VF k_6 = hn::Set(df, k_row[6][i]); - sum6 = hn::MulAdd(q_vec, k_6, sum6); - VF k_7 = hn::Set(df, k_row[7][i]); - sum7 = hn::MulAdd(q_vec, k_7, sum7); - q += q_stride; + if constexpr (kVTileSize >= 8) { + sum4 = hn::Zero(df); + sum5 = hn::Zero(df); + sum6 = hn::Zero(df); + sum7 = hn::Zero(df); } } -// Returns the element-wise maximum of 8 vectors, in a single vector. 
-template > -VF HWY_INLINE ElementwiseMaxOf8(DF df, const VF& x0, const VF& x1, const VF& x2, - const VF& x3, const VF& x4, const VF& x5, - const VF& x6, const VF& x7) { - VF m0 = hn::Max(x0, x1); - VF m1 = hn::Max(x2, x3); - VF m2 = hn::Max(x4, x5); - VF m3 = hn::Max(x6, x7); - m0 = hn::Max(m0, m1); - m2 = hn::Max(m2, m3); - return hn::Max(m0, m2); +// Returns a tile of 1, 4 or 8 Q rows by 2NF K Q.K dot products, in float32. +// K is always pre-transposed to shape: +// [seq_len / 2kNF, layers * kv_heads * qkv_dim/2 * 2kNF * 2], where the /2, *2 +// represents that pairs of qkv_dim elements are kept together to make best use +// of BF16 dot product instructions. +// Note that this version assumes that Q is float32, and not transposed, and +// HWY_NATIVE_DOT_BF16 is false. +template > +HWY_INLINE void QDotKTile148FloatNotNative( + DF df, const float* HWY_RESTRICT q, const uint32_t* HWY_RESTRICT q_offsets, + size_t half_cols, const MatPtrT& k, size_t pos, VF& sum00, VF& sum01, + VF& HWY_MAYBE_UNUSED sum10, VF& HWY_MAYBE_UNUSED sum11, + VF& HWY_MAYBE_UNUSED sum20, VF& HWY_MAYBE_UNUSED sum21, + VF& HWY_MAYBE_UNUSED sum30, VF& HWY_MAYBE_UNUSED sum31, + VF& HWY_MAYBE_UNUSED sum40, VF& HWY_MAYBE_UNUSED sum41, + VF& HWY_MAYBE_UNUSED sum50, VF& HWY_MAYBE_UNUSED sum51, + VF& HWY_MAYBE_UNUSED sum60, VF& HWY_MAYBE_UNUSED sum61, + VF& HWY_MAYBE_UNUSED sum70, VF& HWY_MAYBE_UNUSED sum71) { + ZeroResults(df, sum00, sum10, sum20, sum30, sum40, sum50, sum60, + sum70); + ZeroResults(df, sum01, sum11, sum21, sum31, sum41, sum51, sum61, + sum71); + using DBF = hn::ScalableTag; + const DBF dbf; + using VBF = hn::Vec; + const size_t kNF = hn::Lanes(df); + const float* HWY_RESTRICT q_base[kVTileSize]; + for (size_t i = 0; i < kVTileSize; ++i) { + q_base[i] = q + q_offsets[i]; + } + const BF16* HWY_RESTRICT k_base = k.Row(pos / (2 * kNF)); + for (size_t i = 0; i < half_cols; ++i, k_base += kNF * 4) { + // TODO(rays): Replace with decompress2. 
+ VBF k0_vec = hn::LoadU(dbf, k_base); + VBF k1_vec = hn::LoadU(dbf, k_base + kNF * 2); + VF k0_even = hn::PromoteEvenTo(df, k0_vec); + VF k0_odd = hn::PromoteOddTo(df, k0_vec); + VF k1_even = hn::PromoteEvenTo(df, k1_vec); + VF k1_odd = hn::PromoteOddTo(df, k1_vec); + VF q0_even = hn::Set(df, q_base[0][i * 2]); + VF q0_odd = hn::Set(df, q_base[0][i * 2 + 1]); + sum00 = hn::MulAdd(q0_even, k0_even, sum00); + sum01 = hn::MulAdd(q0_even, k1_even, sum01); + sum00 = hn::MulAdd(q0_odd, k0_odd, sum00); + sum01 = hn::MulAdd(q0_odd, k1_odd, sum01); + if constexpr (kVTileSize >= 4) { + VF q1_even = hn::Set(df, q_base[1][i * 2]); + VF q1_odd = hn::Set(df, q_base[1][i * 2 + 1]); + sum10 = hn::MulAdd(q1_even, k0_even, sum10); + sum11 = hn::MulAdd(q1_even, k1_even, sum11); + sum10 = hn::MulAdd(q1_odd, k0_odd, sum10); + sum11 = hn::MulAdd(q1_odd, k1_odd, sum11); + VF q2_even = hn::Set(df, q_base[2][i * 2]); + VF q2_odd = hn::Set(df, q_base[2][i * 2 + 1]); + sum20 = hn::MulAdd(q2_even, k0_even, sum20); + sum21 = hn::MulAdd(q2_even, k1_even, sum21); + sum20 = hn::MulAdd(q2_odd, k0_odd, sum20); + sum21 = hn::MulAdd(q2_odd, k1_odd, sum21); + VF q3_even = hn::Set(df, q_base[3][i * 2]); + VF q3_odd = hn::Set(df, q_base[3][i * 2 + 1]); + sum30 = hn::MulAdd(q3_even, k0_even, sum30); + sum31 = hn::MulAdd(q3_even, k1_even, sum31); + sum30 = hn::MulAdd(q3_odd, k0_odd, sum30); + sum31 = hn::MulAdd(q3_odd, k1_odd, sum31); + } + if constexpr (kVTileSize >= 8) { + VF q4_even = hn::Set(df, q_base[4][i * 2]); + VF q4_odd = hn::Set(df, q_base[4][i * 2 + 1]); + sum40 = hn::MulAdd(q4_even, k0_even, sum40); + sum41 = hn::MulAdd(q4_even, k1_even, sum41); + sum40 = hn::MulAdd(q4_odd, k0_odd, sum40); + sum41 = hn::MulAdd(q4_odd, k1_odd, sum41); + VF q5_even = hn::Set(df, q_base[5][i * 2]); + VF q5_odd = hn::Set(df, q_base[5][i * 2 + 1]); + sum50 = hn::MulAdd(q5_even, k0_even, sum50); + sum51 = hn::MulAdd(q5_even, k1_even, sum51); + sum50 = hn::MulAdd(q5_odd, k0_odd, sum50); + sum51 = hn::MulAdd(q5_odd, 
k1_odd, sum51); + VF q6_even = hn::Set(df, q_base[6][i * 2]); + VF q6_odd = hn::Set(df, q_base[6][i * 2 + 1]); + sum60 = hn::MulAdd(q6_even, k0_even, sum60); + sum61 = hn::MulAdd(q6_even, k1_even, sum61); + sum60 = hn::MulAdd(q6_odd, k0_odd, sum60); + sum61 = hn::MulAdd(q6_odd, k1_odd, sum61); + VF q7_even = hn::Set(df, q_base[7][i * 2]); + VF q7_odd = hn::Set(df, q_base[7][i * 2 + 1]); + sum70 = hn::MulAdd(q7_even, k0_even, sum70); + sum71 = hn::MulAdd(q7_even, k1_even, sum71); + sum70 = hn::MulAdd(q7_odd, k0_odd, sum70); + sum71 = hn::MulAdd(q7_odd, k1_odd, sum71); + } + } } -// Returns the element-wise sum of 8 vectors, in a single vector. -template > -VF HWY_INLINE ElementwiseSumOf8(DF df, const VF& x0, const VF& x1, const VF& x2, - const VF& x3, const VF& x4, const VF& x5, - const VF& x6, const VF& x7) { - VF sum0 = hn::Add(x0, x1); - VF sum1 = hn::Add(x2, x3); - VF sum2 = hn::Add(x4, x5); - VF sum3 = hn::Add(x6, x7); - sum0 = hn::Add(sum0, sum1); - sum2 = hn::Add(sum2, sum3); - return hn::Add(sum0, sum2); -} - -// Sweeps a tile of NF Q rows by 8 K timesteps accumulators from start_pos to -// min_last_pos, then sweeps the remaining timesteps in the range (min_last_pos, -// max_last_pos]. -void TileFlashAttention( - const MatPtrT& q, const uint32_t* HWY_RESTRICT q_offsets, - const StridedView& qT, const MatPtrT& k, const size_t start_pos, - const uint32_t* HWY_RESTRICT last_pos, const size_t min_last_pos, - const size_t max_last_pos, const MatPtrT& v, const size_t layer_idx, - const AttentionActivationsPtrs& activations, MatPtrT& att_out, - const uint32_t* HWY_RESTRICT out_offsets, ThreadingContext& ctx, - const size_t worker) { - GCPP_ZONE(ctx, worker, Zones::kFlashAttentionTileFlashAttention); - constexpr size_t kHTileSize = kNFx8HTileSize; +// Loads an adjacent pair of floats, converts them to BF16, and broadcasts them +// across a vector of BF16 as alternating odd and even elements. 
+// hn::ReorderDemote2To(dbf, q_1_float, q_1_float); with q1_float containing +// alternating odd and even floats appears not to do this. +HWY_INLINE hn::Vec> DemoteAndBroadcast2ToBF16( + const float* HWY_RESTRICT base) { using DF = hn::ScalableTag; const DF df; using VF = hn::Vec; - using DI = hn::ScalableTag; - const DI di; - using VI = hn::Vec; - const size_t kVTileSize = hn::Lanes(df); + VF v_even = hn::Set(df, base[0]); + VF v_odd = hn::Set(df, base[1]); + VF interleaved = hn::OddEven(v_odd, v_even); + return hn::OrderedDemote2To(hn::ScalableTag(), interleaved, + interleaved); +} + +// Returns a tile of 1, 4 or 8 Q rows by 2NF K Q.K dot products, in float32. +// K is always pre-transposed to shape: +// [seq_len / 2kNF, layers * kv_heads * qkv_dim/2 * 2kNF * 2], where the /2, *2 +// represents that pairs of qkv_dim elements are kept together to make best use +// of BF16 dot product instructions. +// Note that this version assumes that Q is float32, and not transposed, and +// HWY_NATIVE_DOT_BF16 is true. 
+template > +HWY_INLINE void QDotKTile148FloatNative( + DF df, const float* HWY_RESTRICT q, const uint32_t* HWY_RESTRICT q_offsets, + size_t half_cols, const MatPtrT& k, size_t pos, VF& sum00, VF& sum01, + VF& HWY_MAYBE_UNUSED sum10, VF& HWY_MAYBE_UNUSED sum11, + VF& HWY_MAYBE_UNUSED sum20, VF& HWY_MAYBE_UNUSED sum21, + VF& HWY_MAYBE_UNUSED sum30, VF& HWY_MAYBE_UNUSED sum31, + VF& HWY_MAYBE_UNUSED sum40, VF& HWY_MAYBE_UNUSED sum41, + VF& HWY_MAYBE_UNUSED sum50, VF& HWY_MAYBE_UNUSED sum51, + VF& HWY_MAYBE_UNUSED sum60, VF& HWY_MAYBE_UNUSED sum61, + VF& HWY_MAYBE_UNUSED sum70, VF& HWY_MAYBE_UNUSED sum71) { + ZeroResults(df, sum00, sum10, sum20, sum30, sum40, sum50, sum60, + sum70); + ZeroResults(df, sum01, sum11, sum21, sum31, sum41, sum51, sum61, + sum71); + VF unused = hn::Zero(df); + using DBF = hn::ScalableTag; + const DBF dbf; + using VBF = hn::Vec; + const size_t kNF = hn::Lanes(df); + const float* HWY_RESTRICT q_base[kVTileSize]; for (size_t i = 0; i < kVTileSize; ++i) { - hwy::ZeroBytes(att_out.Row(0) + out_offsets[i], - v.Cols() * sizeof(att_out.Row(0)[0])); + q_base[i] = q + q_offsets[i]; } - VI lasts = hn::LoadU(di, last_pos); - VF old_m = hn::Set(df, -std::numeric_limits::max() / 2.0f); - VF old_d = hn::Zero(df); - const BF16* HWY_RESTRICT qT_row = qT.Row(0); - const size_t qT_stride = qT.Stride(); - size_t position = start_pos; - while (position + kHTileSize - 1 <= min_last_pos) { - size_t k_pos[kHTileSize]; - for (size_t i = 0; i < kHTileSize; ++i) { - k_pos[i] = activations.div_seq_len.Remainder(position + i); + const BF16* HWY_RESTRICT k_base = k.Row(pos / (2 * kNF)); + for (size_t i = 0; i < half_cols; ++i, k_base += kNF * 4) { + VBF kvec0 = hn::LoadU(dbf, k_base); + VBF kvec1 = hn::LoadU(dbf, k_base + kNF * 2); + VBF q0_bf16 = DemoteAndBroadcast2ToBF16(q_base[0] + i * 2); + sum00 = hn::ReorderWidenMulAccumulate(df, q0_bf16, kvec0, sum00, unused); + sum01 = hn::ReorderWidenMulAccumulate(df, q0_bf16, kvec1, sum01, unused); + if constexpr (kVTileSize 
>= 4) { + VBF q1_bf16 = DemoteAndBroadcast2ToBF16(q_base[1] + i * 2); + sum10 = hn::ReorderWidenMulAccumulate(df, q1_bf16, kvec0, sum10, unused); + sum11 = hn::ReorderWidenMulAccumulate(df, q1_bf16, kvec1, sum11, unused); + VBF q2_bf16 = DemoteAndBroadcast2ToBF16(q_base[2] + i * 2); + sum20 = hn::ReorderWidenMulAccumulate(df, q2_bf16, kvec0, sum20, unused); + sum21 = hn::ReorderWidenMulAccumulate(df, q2_bf16, kvec1, sum21, unused); + VBF q3_bf16 = DemoteAndBroadcast2ToBF16(q_base[3] + i * 2); + sum30 = hn::ReorderWidenMulAccumulate(df, q3_bf16, kvec0, sum30, unused); + sum31 = hn::ReorderWidenMulAccumulate(df, q3_bf16, kvec1, sum31, unused); } - VF x0, x1, x2, x3, x4, x5, x6, x7; - QDotKTile(df, qT_row, qT_stride, k, k_pos, x0, x1, x2, x3, x4, x5, x6, x7); - if (activations.config.att_cap > 0.0f) { - // Compute tanh(x / cap) * cap, being LogitsSoftCap on the tile. - VF cap = hn::Set(df, activations.config.att_cap); - VF one_over_cap = hn::Div(hn::Set(df, 1.0f), cap); - x0 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x0, one_over_cap))); - x1 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x1, one_over_cap))); - x2 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x2, one_over_cap))); - x3 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x3, one_over_cap))); - x4 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x4, one_over_cap))); - x5 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x5, one_over_cap))); - x6 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x6, one_over_cap))); - x7 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x7, one_over_cap))); + if constexpr (kVTileSize >= 8) { + VBF q4_bf16 = DemoteAndBroadcast2ToBF16(q_base[4] + i * 2); + sum40 = hn::ReorderWidenMulAccumulate(df, q4_bf16, kvec0, sum40, unused); + sum41 = hn::ReorderWidenMulAccumulate(df, q4_bf16, kvec1, sum41, unused); + VBF q5_bf16 = DemoteAndBroadcast2ToBF16(q_base[5] + i * 2); + sum50 = hn::ReorderWidenMulAccumulate(df, q5_bf16, kvec0, sum50, unused); + sum51 = hn::ReorderWidenMulAccumulate(df, q5_bf16, kvec1, sum51, unused); + VBF q6_bf16 = DemoteAndBroadcast2ToBF16(q_base[6] + i * 
2); + sum60 = hn::ReorderWidenMulAccumulate(df, q6_bf16, kvec0, sum60, unused); + sum61 = hn::ReorderWidenMulAccumulate(df, q6_bf16, kvec1, sum61, unused); + VBF q7_bf16 = DemoteAndBroadcast2ToBF16(q_base[7] + i * 2); + sum70 = hn::ReorderWidenMulAccumulate(df, q7_bf16, kvec0, sum70, unused); + sum71 = hn::ReorderWidenMulAccumulate(df, q7_bf16, kvec1, sum71, unused); } - VF m = ElementwiseMaxOf8(df, x0, x1, x2, x3, x4, x5, x6, x7); - m = hn::Max(old_m, m); - x0 = hn::Exp(df, hn::Sub(x0, m)); - x1 = hn::Exp(df, hn::Sub(x1, m)); - x2 = hn::Exp(df, hn::Sub(x2, m)); - x3 = hn::Exp(df, hn::Sub(x3, m)); - x4 = hn::Exp(df, hn::Sub(x4, m)); - x5 = hn::Exp(df, hn::Sub(x5, m)); - x6 = hn::Exp(df, hn::Sub(x6, m)); - x7 = hn::Exp(df, hn::Sub(x7, m)); - VF scale = hn::Mul(old_d, hn::Exp(df, hn::Sub(old_m, m))); - old_d = ElementwiseSumOf8(df, x0, x1, x2, x3, x4, x5, x6, x7); - old_d = hn::Add(scale, old_d); - old_m = m; - VF one_over_d = hn::Div(hn::Set(df, 1.0f), old_d); - scale = hn::Mul(scale, one_over_d); - x0 = hn::Mul(x0, one_over_d); - x1 = hn::Mul(x1, one_over_d); - x2 = hn::Mul(x2, one_over_d); - x3 = hn::Mul(x3, one_over_d); - x4 = hn::Mul(x4, one_over_d); - x5 = hn::Mul(x5, one_over_d); - x6 = hn::Mul(x6, one_over_d); - x7 = hn::Mul(x7, one_over_d); - MulByConstAndAddTile(df, scale, x0, x1, x2, x3, x4, x5, x6, x7, v, k_pos, - att_out.Row(0), out_offsets, v.Cols()); - position += kHTileSize; - } - while (position <= max_last_pos) { - size_t k_pos = activations.div_seq_len.Remainder(position); - VF x0 = QDotKVector(df, q_offsets, k_pos, q, k); - if (activations.config.att_cap > 0.0f) { - // Compute tanh(x / cap) * cap, being LogitsSoftCap on the vector. - VF cap = hn::Set(df, activations.config.att_cap); - VF one_over_cap = hn::Div(hn::Set(df, 1.0f), cap); - x0 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x0, one_over_cap))); - } - // Past the last position, x0 doesn't count. 
- auto mask = hn::Gt(hn::Set(di, position), lasts); - VF causal_offset = hn::MaskedSet(df, RebindMask(df, mask), - std::numeric_limits::max() / 2.0f); - x0 = hn::Sub(x0, causal_offset); - VF m = hn::Max(old_m, x0); - x0 = hn::Exp(df, hn::Sub(x0, m)); - VF scale = hn::Mul(old_d, hn::Exp(df, hn::Sub(old_m, m))); - old_m = m; - old_d = hn::Add(scale, x0); - VF one_over_d = hn::Div(hn::Set(df, 1.0f), old_d); - x0 = hn::Mul(x0, one_over_d); - scale = hn::Mul(scale, one_over_d); - MulByConstAndAddVector(df, scale, x0, v, k_pos, att_out.Row(0), out_offsets, - v.Cols()); - ++position; } } -// Returns an 4 Q rows by NF K tile of Q.K dot products, in single precision. -// This is the result of 4 rows of Q against NF K timesteps, with positions -// given by k_offsets[0..NF]. -template > -void QDotKTilex4(DF df, const BF16* HWY_RESTRICT q, - const uint32_t* HWY_RESTRICT q_offsets, const MatPtrT& k, - const int32_t* HWY_RESTRICT k_offsets, VF& sum0, VF& sum1, - VF& sum2, VF& sum3) { - sum0 = hn::Zero(df); - sum1 = hn::Zero(df); - sum2 = hn::Zero(df); - sum3 = hn::Zero(df); - const float* HWY_RESTRICT k_base = k.Row(0); - using DI = hn::ScalableTag; - const DI di; - using VI = hn::Vec; - VI k_offsets_vec = hn::LoadU(di, k_offsets); - for (size_t i = 0; i < k.Cols(); ++i) { - VF k_vec = hn::GatherIndex(df, k_base + i, k_offsets_vec); - VF q_0 = hn::Set(df, hwy::ConvertScalarTo(q[q_offsets[0] + i])); - sum0 = hn::MulAdd(q_0, k_vec, sum0); - VF q_1 = hn::Set(df, hwy::ConvertScalarTo(q[q_offsets[1] + i])); - sum1 = hn::MulAdd(q_1, k_vec, sum1); - VF q_2 = hn::Set(df, hwy::ConvertScalarTo(q[q_offsets[2] + i])); - sum2 = hn::MulAdd(q_2, k_vec, sum2); - VF q_3 = hn::Set(df, hwy::ConvertScalarTo(q[q_offsets[3] + i])); - sum3 = hn::MulAdd(q_3, k_vec, sum3); +// Returns a tile of 1, 4 or 8 Q rows by 2NF K Q.K dot products, in float32. 
+// K is always pre-transposed to shape: +// [seq_len / 2kNF, layers * kv_heads * qkv_dim/2 * 2kNF * 2], where the /2, *2 +// represents that pairs of qkv_dim elements are kept together to make best use +// of BF16 dot product instructions. +// Note that this is optimized for the case where q and k are bf16, but there is +// no native_bf16 instruction. +template > +HWY_INLINE void QDotKTile148BF16NotNative( + DF df, const BF16* HWY_RESTRICT q, const uint32_t* HWY_RESTRICT q_offsets, + size_t half_cols, const MatPtrT& k, size_t pos, VF& sum00, VF& sum01, + VF& HWY_MAYBE_UNUSED sum10, VF& HWY_MAYBE_UNUSED sum11, + VF& HWY_MAYBE_UNUSED sum20, VF& HWY_MAYBE_UNUSED sum21, + VF& HWY_MAYBE_UNUSED sum30, VF& HWY_MAYBE_UNUSED sum31, + VF& HWY_MAYBE_UNUSED sum40, VF& HWY_MAYBE_UNUSED sum41, + VF& HWY_MAYBE_UNUSED sum50, VF& HWY_MAYBE_UNUSED sum51, + VF& HWY_MAYBE_UNUSED sum60, VF& HWY_MAYBE_UNUSED sum61, + VF& HWY_MAYBE_UNUSED sum70, VF& HWY_MAYBE_UNUSED sum71) { + ZeroResults(df, sum00, sum10, sum20, sum30, sum40, sum50, sum60, + sum70); + ZeroResults(df, sum01, sum11, sum21, sum31, sum41, sum51, sum61, + sum71); + using DBF = hn::ScalableTag; + const DBF dbf; + using VBF = hn::Vec; + const size_t kNF = hn::Lanes(df); + const float* HWY_RESTRICT q_base[kVTileSize]; + for (size_t i = 0; i < kVTileSize; ++i) { + q_base[i] = reinterpret_cast(q + q_offsets[i]); + } + const BF16* HWY_RESTRICT k_base = k.Row(pos / (2 * kNF)); + for (size_t i = 0; i < half_cols; ++i, k_base += kNF * 4) { + VBF kvec0 = hn::LoadU(dbf, k_base); + VBF kvec1 = hn::LoadU(dbf, k_base + kNF * 2); + VBF q0 = hn::BitCast(dbf, hn::Set(df, q_base[0][i])); + VF k0_even = hn::PromoteEvenTo(df, kvec0); + VF k0_odd = hn::PromoteOddTo(df, kvec0); + VF k1_even = hn::PromoteEvenTo(df, kvec1); + VF k1_odd = hn::PromoteOddTo(df, kvec1); + VF q0_even = hn::PromoteEvenTo(df, q0); + sum00 = hn::MulAdd(q0_even, k0_even, sum00); + sum01 = hn::MulAdd(q0_even, k1_even, sum01); + VF q0_odd = hn::PromoteOddTo(df, q0); + sum00 
= hn::MulAdd(q0_odd, k0_odd, sum00); + sum01 = hn::MulAdd(q0_odd, k1_odd, sum01); + if constexpr (kVTileSize >= 4) { + VBF q1 = hn::BitCast(dbf, hn::Set(df, q_base[1][i])); + VF q1_even = hn::PromoteEvenTo(df, q1); + sum10 = hn::MulAdd(q1_even, k0_even, sum10); + sum11 = hn::MulAdd(q1_even, k1_even, sum11); + VF q1_odd = hn::PromoteOddTo(df, q1); + sum10 = hn::MulAdd(q1_odd, k0_odd, sum10); + sum11 = hn::MulAdd(q1_odd, k1_odd, sum11); + VBF q2 = hn::BitCast(dbf, hn::Set(df, q_base[2][i])); + VF q2_even = hn::PromoteEvenTo(df, q2); + sum20 = hn::MulAdd(q2_even, k0_even, sum20); + sum21 = hn::MulAdd(q2_even, k1_even, sum21); + VF q2_odd = hn::PromoteOddTo(df, q2); + sum20 = hn::MulAdd(q2_odd, k0_odd, sum20); + sum21 = hn::MulAdd(q2_odd, k1_odd, sum21); + VBF q3 = hn::BitCast(dbf, hn::Set(df, q_base[3][i])); + VF q3_even = hn::PromoteEvenTo(df, q3); + sum30 = hn::MulAdd(q3_even, k0_even, sum30); + sum31 = hn::MulAdd(q3_even, k1_even, sum31); + VF q3_odd = hn::PromoteOddTo(df, q3); + sum30 = hn::MulAdd(q3_odd, k0_odd, sum30); + sum31 = hn::MulAdd(q3_odd, k1_odd, sum31); + } + if constexpr (kVTileSize >= 8) { + VBF q4 = hn::BitCast(dbf, hn::Set(df, q_base[4][i])); + VF q4_even = hn::PromoteEvenTo(df, q4); + sum40 = hn::MulAdd(q4_even, k0_even, sum40); + sum41 = hn::MulAdd(q4_even, k1_even, sum41); + VF q4_odd = hn::PromoteOddTo(df, q4); + sum40 = hn::MulAdd(q4_odd, k0_odd, sum40); + sum41 = hn::MulAdd(q4_odd, k1_odd, sum41); + VBF q5 = hn::BitCast(dbf, hn::Set(df, q_base[5][i])); + VF q5_even = hn::PromoteEvenTo(df, q5); + sum50 = hn::MulAdd(q5_even, k0_even, sum50); + sum51 = hn::MulAdd(q5_even, k1_even, sum51); + VF q5_odd = hn::PromoteOddTo(df, q5); + sum50 = hn::MulAdd(q5_odd, k0_odd, sum50); + sum51 = hn::MulAdd(q5_odd, k1_odd, sum51); + VBF q6 = hn::BitCast(dbf, hn::Set(df, q_base[6][i])); + VF q6_even = hn::PromoteEvenTo(df, q6); + sum60 = hn::MulAdd(q6_even, k0_even, sum60); + sum61 = hn::MulAdd(q6_even, k1_even, sum61); + VF q6_odd = hn::PromoteOddTo(df, q6); + 
sum60 = hn::MulAdd(q6_odd, k0_odd, sum60); + sum61 = hn::MulAdd(q6_odd, k1_odd, sum61); + VBF q7 = hn::BitCast(dbf, hn::Set(df, q_base[7][i])); + VF q7_even = hn::PromoteEvenTo(df, q7); + sum70 = hn::MulAdd(q7_even, k0_even, sum70); + sum71 = hn::MulAdd(q7_even, k1_even, sum71); + VF q7_odd = hn::PromoteOddTo(df, q7); + sum70 = hn::MulAdd(q7_odd, k0_odd, sum70); + sum71 = hn::MulAdd(q7_odd, k1_odd, sum71); + } + } +} + +// Returns a tile of 1, 4 or 8 Q rows by 2NF K Q.K dot products, in float32. +// K is always pre-transposed to shape: +// [seq_len / 2kNF, layers * kv_heads * qkv_dim/2 * 2kNF * 2], where the /2, *2 +// represents that pairs of qkv_dim elements are kept together to make best use +// of BF16 dot product instructions. +// Note that this is optimized for the case where q and k are bf16, and there is +// a native_bf16 instruction. +template > +HWY_INLINE void QDotKTile148BF16Native( + DF df, const BF16* HWY_RESTRICT q, const uint32_t* HWY_RESTRICT q_offsets, + size_t half_cols, const MatPtrT& k, size_t pos, VF& sum00, VF& sum01, + VF& HWY_MAYBE_UNUSED sum10, VF& HWY_MAYBE_UNUSED sum11, + VF& HWY_MAYBE_UNUSED sum20, VF& HWY_MAYBE_UNUSED sum21, + VF& HWY_MAYBE_UNUSED sum30, VF& HWY_MAYBE_UNUSED sum31, + VF& HWY_MAYBE_UNUSED sum40, VF& HWY_MAYBE_UNUSED sum41, + VF& HWY_MAYBE_UNUSED sum50, VF& HWY_MAYBE_UNUSED sum51, + VF& HWY_MAYBE_UNUSED sum60, VF& HWY_MAYBE_UNUSED sum61, + VF& HWY_MAYBE_UNUSED sum70, VF& HWY_MAYBE_UNUSED sum71) { + ZeroResults(df, sum00, sum10, sum20, sum30, sum40, sum50, sum60, + sum70); + ZeroResults(df, sum01, sum11, sum21, sum31, sum41, sum51, sum61, + sum71); + VF unused_sum1 = hn::Zero(df); + using DBF = hn::ScalableTag; + const DBF dbf; + using VBF = hn::Vec; + const size_t kNF = hn::Lanes(df); + const float* HWY_RESTRICT q_base[kVTileSize]; + for (size_t i = 0; i < kVTileSize; ++i) { + q_base[i] = reinterpret_cast(q + q_offsets[i]); + } + const BF16* HWY_RESTRICT k_base = k.Row(pos / (2 * kNF)); + for (size_t i = 0; i < 
half_cols; ++i, k_base += kNF * 4) { + VBF k0_vec = hn::LoadU(dbf, k_base); + VBF k1_vec = hn::LoadU(dbf, k_base + kNF * 2); + VBF q0 = hn::BitCast(dbf, hn::Set(df, q_base[0][i])); + sum00 = hn::ReorderWidenMulAccumulate(df, q0, k0_vec, sum00, unused_sum1); + sum01 = hn::ReorderWidenMulAccumulate(df, q0, k1_vec, sum01, unused_sum1); + if constexpr (kVTileSize >= 4) { + VBF q1 = hn::BitCast(dbf, hn::Set(df, q_base[1][i])); + sum10 = hn::ReorderWidenMulAccumulate(df, q1, k0_vec, sum10, unused_sum1); + sum11 = hn::ReorderWidenMulAccumulate(df, q1, k1_vec, sum11, unused_sum1); + VBF q2 = hn::BitCast(dbf, hn::Set(df, q_base[2][i])); + sum20 = hn::ReorderWidenMulAccumulate(df, q2, k0_vec, sum20, unused_sum1); + sum21 = hn::ReorderWidenMulAccumulate(df, q2, k1_vec, sum21, unused_sum1); + VBF q3 = hn::BitCast(dbf, hn::Set(df, q_base[3][i])); + sum30 = hn::ReorderWidenMulAccumulate(df, q3, k0_vec, sum30, unused_sum1); + sum31 = hn::ReorderWidenMulAccumulate(df, q3, k1_vec, sum31, unused_sum1); + } + if constexpr (kVTileSize >= 8) { + VBF q4 = hn::BitCast(dbf, hn::Set(df, q_base[4][i])); + sum40 = hn::ReorderWidenMulAccumulate(df, q4, k0_vec, sum40, unused_sum1); + sum41 = hn::ReorderWidenMulAccumulate(df, q4, k1_vec, sum41, unused_sum1); + VBF q5 = hn::BitCast(dbf, hn::Set(df, q_base[5][i])); + sum50 = hn::ReorderWidenMulAccumulate(df, q5, k0_vec, sum50, unused_sum1); + sum51 = hn::ReorderWidenMulAccumulate(df, q5, k1_vec, sum51, unused_sum1); + VBF q6 = hn::BitCast(dbf, hn::Set(df, q_base[6][i])); + sum60 = hn::ReorderWidenMulAccumulate(df, q6, k0_vec, sum60, unused_sum1); + sum61 = hn::ReorderWidenMulAccumulate(df, q6, k1_vec, sum61, unused_sum1); + VBF q7 = hn::BitCast(dbf, hn::Set(df, q_base[7][i])); + sum70 = hn::ReorderWidenMulAccumulate(df, q7, k0_vec, sum70, unused_sum1); + sum71 = hn::ReorderWidenMulAccumulate(df, q7, k1_vec, sum71, unused_sum1); + } } } // Handles NF v rows of flash attention for NF q.k dot products from one q row. 
+// Automatically handles masking for causal attention and different start_pos +// and last_pos values. template > -float HWY_INLINE SingleFlashAttentionRowVector(DF df, VF& x, float& old_max, +HWY_INLINE float SingleFlashAttentionRowVector(DF df, size_t start_pos, + size_t pos, size_t last_pos, + VF& x, float& old_max, float& old_d) { + if (pos < start_pos) { + size_t mask_size = start_pos - pos; + const VF neg_inf = hn::Neg(hn::Inf(df)); + x = hn::IfThenElse(hn::FirstN(df, mask_size), neg_inf, x); + } + if (pos + hn::Lanes(df) > last_pos) { + size_t mask_size = pos <= last_pos ? last_pos + 1 - pos : 0; + const VF neg_inf = hn::Neg(hn::Inf(df)); + x = hn::IfThenElse(hn::FirstN(df, mask_size), x, neg_inf); + } float m = hn::ReduceMax(df, x); m = std::max(m, old_max); x = hn::Exp(df, hn::Sub(x, hn::Set(df, m))); @@ -442,6 +502,60 @@ float HWY_INLINE SingleFlashAttentionRowVector(DF df, VF& x, float& old_max, return scale; } +// Handles 2NF v rows of flash attention for 2NF q.k dot products from 1 q row. +// Automatically handles masking for causal attention and different start_pos +// and last_pos values. +template > +HWY_INLINE float DoubleFlashAttentionRowVector(DF df, size_t start_pos, + size_t pos, size_t last_pos, + VF& x0, VF& x1, float& old_max, + float& old_d) { + const size_t kNF = hn::Lanes(df); + const VF neg_inf = hn::Neg(hn::Inf(df)); + if (pos < start_pos) { + if (pos + kNF <= start_pos) { + x0 = neg_inf; + size_t mask_size = start_pos - (pos + kNF); + x1 = hn::IfThenElse(hn::FirstN(df, mask_size), neg_inf, x1); + } else { + size_t mask_size = start_pos - pos; + x0 = hn::IfThenElse(hn::FirstN(df, mask_size), neg_inf, x0); + } + } + if (pos + 2 * kNF > last_pos) { + if (pos + kNF > last_pos) { + size_t mask_size = pos <= last_pos ? 
last_pos + 1 - pos : 0; + x0 = hn::IfThenElse(hn::FirstN(df, mask_size), x0, neg_inf); + x1 = neg_inf; + } else { + size_t mask_size = last_pos + 1 - (pos + kNF); + x1 = hn::IfThenElse(hn::FirstN(df, mask_size), x1, neg_inf); + } + } + VF x_max = hn::Max(x0, x1); + float m = hn::ReduceMax(df, x_max); + m = std::max(m, old_max); + VF m_vec = hn::Set(df, m); + x0 = hn::Exp(df, hn::Sub(x0, m_vec)); + x1 = hn::Exp(df, hn::Sub(x1, m_vec)); + float scale = old_d * std::exp(old_max - m); + VF x_sum = hn::Add(x0, x1); + old_d = hn::ReduceSum(df, x_sum) + scale; + old_max = m; + if (old_d > 0.0f) { + const float one_over_d = 1.0f / old_d; + scale *= one_over_d; + VF one_over_d_vec = hn::Set(df, one_over_d); + x0 = hn::Mul(x0, one_over_d_vec); + x1 = hn::Mul(x1, one_over_d_vec); + } else { + scale = 0.0f; + x0 = hn::Zero(df); + x1 = hn::Zero(df); + } + return scale; +} + // Reduces each of x and stores in following lanes of max (tested with float32) template , class DF4 = hn::CappedTag, class VF4 = hn::Vec, @@ -593,159 +707,538 @@ static HWY_INLINE void FlashAttentionTileStepAndApplySoftCap( } } -// Implements flash attention for a strip of 4 query vectors. -// It iterates through timesteps in K from `start_pos` up to `max_last_pos`. -// Timesteps up to `min_last_pos` (*) are processed in tiles of shape 4 Q rows -// by NF timesteps in K for efficiency while timesteps between `min_last_pos + -// 1` and `max_last_pos` are processed one-by-one to handle differing `last_pos` -// values within the strip. -// (*) Actually, it only iterates through -// `min_last_pos - (min_last_pos + 1 - start_pos) % NF` in tiles, as the tiled -// computation can, for obvious reasons, only process an integer number of -// tiles. +// Implements flash attention for a strip of tiles of size 1, 4 or 8 query +// vectors by 2NF positions in K. +// It iterates through tiles in K from `params.min_start_pos / 2NF * 2NF` up to +// `params.max_last_pos` (rounded up to the nearest multiple of 2NF). 
+// Masking allows each row within a tile to have a different start and end +// position. // +// @param params FlashAttentionParams containing the extent of the strip and +// size of the tiles. // @param q The query matrix [batch_size * q_heads, qkv_dim] in BF16 format. -// @param q_offsets Offsets from `q.Row(0)` to the start of the 4 query -// vectors to be processed in this tile. -// @param k Key matrix [seq_len, qkv_dim] from KV cache. -// @param start_pos The first token position in the KV cache to attend to. -// @param last_pos An array of 4 indices giving the last token position -// (inclusive) that each of the 4 queries may attend to. -// @param min_last_pos The minimum value in `last_pos`. Timesteps up to this -// position can be processed efficiently in batches. -// @param max_last_pos The maximum value in `last_pos`. Timesteps between -// `min_last_pos + 1` and this position are processed individually to -// respect each query's `last_pos` limit. +// @param k Key matrix from KV cache. K is always pre-transposed to shape: +// [seq_len / 2kNF, layers * kv_heads * qkv_dim/2 * 2kNF * 2], +// where the /2, *2 represents that pairs of qkv_dim elements are kept +// together to make best use of BF16 dot product instructions. // @param v Value matrix [seq_len, qkv_dim] from KV cache. // @param layer_idx The index of the current transformer layer. // @param activations Attention configurations and buffers. // @param att_out Output buffer for attention results. -// @param out_offsets Offsets from `att_out.Row(0)` to store the 4 output -// vectors. // @param ctx Threading context. // @param worker Worker thread index. 
-Tile4FlashState TileFlashAttention4( - const MatPtrT& q, const uint32_t* HWY_RESTRICT q_offsets, - const MatPtrT& k, const size_t start_pos, - const uint32_t* HWY_RESTRICT last_pos, const size_t min_last_pos, - const size_t max_last_pos, const MatPtrT& v, const size_t layer_idx, +template +Tile4FlashState TileFlashAttention148( + const FlashAttentionParams& params, const MatPtrT& q, + const MatPtrT& k, const MatPtrT& v, const size_t layer_idx, const AttentionActivationsPtrs& activations, MatPtrT& att_out, - const uint32_t* HWY_RESTRICT out_offsets, ThreadingContext& ctx, - const size_t worker) { - GCPP_ZONE(ctx, worker, Zones::kFlashAttentionTileFlashAttention4); + size_t qkv_dim, ThreadingContext& ctx, const size_t worker, + AttentionImpl attention_impl) { + constexpr Zones kZone = + kVTileSize == 8 + ? Zones::kFlashAttentionTileFlashAttention8 + : (kVTileSize == 4 ? Zones::kFlashAttentionTileFlashAttention4 + : Zones::kFlashAttentionTileFlashAttention1); + GCPP_ZONE(ctx, worker, kZone); using DF = hn::ScalableTag; const DF df; using VF = hn::Vec; - constexpr size_t kMaxNF = hn::MaxLanes(df); - const size_t kHTileSize = hn::Lanes(df); - HWY_DASSERT(kHTileSize <= kMaxNF); - constexpr size_t kVTileSize = 4; + const size_t kHTileSize = 2 * hn::Lanes(df); float scales[kVTileSize]; for (size_t i = 0; i < kVTileSize; ++i) { - hwy::ZeroBytes(att_out.Row(0) + out_offsets[i], - v.Cols() * sizeof(att_out.Row(0)[0])); + hwy::ZeroBytes(att_out.Row(0) + params.out_offsets[i], + qkv_dim * sizeof(att_out.Row(0)[0])); } Tile4FlashState state; - size_t position = start_pos; - while (position + kHTileSize - 1 <= min_last_pos) { - int32_t k_offsets[kMaxNF]; - size_t v_pos[kMaxNF]; + size_t position = params.min_start_pos / kHTileSize * kHTileSize; + while (position <= params.max_last_pos) { + // Each pair of vectors covers 2NF positions in K, with up to 8 pairs of + // vectors covering 1, 4 or 8 queries. 
+ VF x00, x01; + VF HWY_MAYBE_UNUSED x10, x11; + VF HWY_MAYBE_UNUSED x20, x21; + VF HWY_MAYBE_UNUSED x30, x31; + VF HWY_MAYBE_UNUSED x40, x41; + VF HWY_MAYBE_UNUSED x50, x51; + VF HWY_MAYBE_UNUSED x60, x61; + VF HWY_MAYBE_UNUSED x70, x71; + constexpr size_t kMaxNF = hn::MaxLanes(df); + size_t v_pos[2 * kMaxNF]; for (size_t i = 0; i < kHTileSize; ++i) { v_pos[i] = activations.div_seq_len.Remainder(position + i); - k_offsets[i] = k.Row(v_pos[i]) - k.Row(0); } - VF x0, x1, x2, x3; - QDotKTilex4(df, q.Row(0), q_offsets, k, k_offsets, x0, x1, x2, x3); + if constexpr (IsF32()) { + if constexpr (HWY_NATIVE_DOT_BF16) { + QDotKTile148FloatNative(df, q.Row(0), params.out_offsets, + qkv_dim / 2, k, position, x00, x01, + x10, x11, x20, x21, x30, x31, x40, + x41, x50, x51, x60, x61, x70, x71); + } else { + QDotKTile148FloatNotNative( + df, q.Row(0), params.out_offsets, qkv_dim / 2, k, position, x00, + x01, x10, x11, x20, x21, x30, x31, x40, x41, x50, x51, x60, x61, + x70, x71); + } + } else { + if constexpr (HWY_NATIVE_DOT_BF16) { + QDotKTile148BF16Native(df, q.Row(0), params.q_offsets, + qkv_dim / 2, k, position, x00, x01, + x10, x11, x20, x21, x30, x31, x40, + x41, x50, x51, x60, x61, x70, x71); + } else { + QDotKTile148BF16NotNative( + df, q.Row(0), params.q_offsets, qkv_dim / 2, k, position, x00, x01, + x10, x11, x20, x21, x30, x31, x40, x41, x50, x51, x60, x61, x70, + x71); + } + } if (activations.config.att_cap > 0.0f) { // Compute tanh(x / cap) * cap, being LogitsSoftCap on the tile. 
VF cap = hn::Set(df, activations.config.att_cap); VF one_over_cap = hn::Div(hn::Set(df, 1.0f), cap); - x0 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x0, one_over_cap))); - x1 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x1, one_over_cap))); - x2 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x2, one_over_cap))); - x3 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x3, one_over_cap))); + x00 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x00, one_over_cap))); + x01 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x01, one_over_cap))); + if constexpr (kVTileSize >= 4) { + x10 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x10, one_over_cap))); + x11 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x11, one_over_cap))); + x20 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x20, one_over_cap))); + x21 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x21, one_over_cap))); + x30 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x30, one_over_cap))); + x31 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x31, one_over_cap))); + } + if constexpr (kVTileSize >= 8) { + x40 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x40, one_over_cap))); + x41 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x41, one_over_cap))); + x50 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x50, one_over_cap))); + x51 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x51, one_over_cap))); + x60 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x60, one_over_cap))); + x61 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x61, one_over_cap))); + x70 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x70, one_over_cap))); + x71 = hn::Mul(cap, hn::Tanh(df, hn::Mul(x71, one_over_cap))); + } + } + scales[0] = DoubleFlashAttentionRowVector( + df, params.start_pos[0], position, params.last_pos[0], x00, x01, + state.row_states[0].max, state.row_states[0].d); + if constexpr (kVTileSize >= 4) { + scales[1] = DoubleFlashAttentionRowVector( + df, params.start_pos[1], position, params.last_pos[1], x10, x11, + state.row_states[1].max, state.row_states[1].d); + scales[2] = DoubleFlashAttentionRowVector( + df, params.start_pos[2], position, params.last_pos[2], x20, x21, + state.row_states[2].max, state.row_states[2].d); + scales[3] = 
DoubleFlashAttentionRowVector( + df, params.start_pos[3], position, params.last_pos[3], x30, x31, + state.row_states[3].max, state.row_states[3].d); + MulByConstAndAddVT4Mem(df, scales, x00, x01, x10, x11, x20, x21, x30, x31, + v, v_pos, params.max_last_pos + 1 - position, + att_out.Row(0), params.out_offsets, qkv_dim); + } else { + MulByConstAndAddVT1Mem(df, scales, x00, x01, v, v_pos, + params.max_last_pos + 1 - position, att_out.Row(0), + params.out_offsets, qkv_dim); + } + if constexpr (kVTileSize >= 8) { + scales[4] = DoubleFlashAttentionRowVector( + df, params.start_pos[4], position, params.last_pos[4], x40, x41, + state.row_states[4].max, state.row_states[4].d); + scales[5] = DoubleFlashAttentionRowVector( + df, params.start_pos[5], position, params.last_pos[5], x50, x51, + state.row_states[5].max, state.row_states[5].d); + scales[6] = DoubleFlashAttentionRowVector( + df, params.start_pos[6], position, params.last_pos[6], x60, x61, + state.row_states[6].max, state.row_states[6].d); + scales[7] = DoubleFlashAttentionRowVector( + df, params.start_pos[7], position, params.last_pos[7], x70, x71, + state.row_states[7].max, state.row_states[7].d); + MulByConstAndAddVT4Mem(df, scales + 4, x40, x41, x50, x51, x60, x61, x70, + x71, v, v_pos, params.max_last_pos + 1 - position, + att_out.Row(0), params.out_offsets + 4, qkv_dim); } - scales[0] = SingleFlashAttentionRowVector(df, x0, state.row_states[0].max, - state.row_states[0].d); - scales[1] = SingleFlashAttentionRowVector(df, x1, state.row_states[1].max, - state.row_states[1].d); - scales[2] = SingleFlashAttentionRowVector(df, x2, state.row_states[2].max, - state.row_states[2].d); - scales[3] = SingleFlashAttentionRowVector(df, x3, state.row_states[3].max, - state.row_states[3].d); - MulByConstAndAddTile4(df, scales, x0, x1, x2, x3, v, v_pos, att_out.Row(0), - out_offsets, v.Cols()); position += kHTileSize; } - const hn::ScalableTag dbf; - const size_t qkv_dim = k.Cols(); - - while (position <= max_last_pos) { - 
size_t k_pos = activations.div_seq_len.Remainder(position); - if (position <= last_pos[0]) { - // Past the last position, x0 doesn't count. - float x0 = Dot(dbf, MakeConstSpan(q.Row(0) + q_offsets[0], qkv_dim), 0, - k.Row(k_pos), qkv_dim); - SingleFlashAttentionStep(x0, activations.config.att_cap, - state.row_states[0].max, state.row_states[0].d, - v.Row(k_pos), v.Cols(), - att_out.Row(0) + out_offsets[0]); - } - if (position <= last_pos[1]) { - // Past the last position, x1 doesn't count. - float x1 = Dot(dbf, MakeConstSpan(q.Row(0) + q_offsets[1], qkv_dim), 0, - k.Row(k_pos), qkv_dim); - SingleFlashAttentionStep(x1, activations.config.att_cap, - state.row_states[1].max, state.row_states[1].d, - v.Row(k_pos), v.Cols(), - att_out.Row(0) + out_offsets[1]); - } - if (position <= last_pos[2]) { - // Past the last position, x2 doesn't count. - float x2 = Dot(dbf, MakeConstSpan(q.Row(0) + q_offsets[2], qkv_dim), 0, - k.Row(k_pos), qkv_dim); - SingleFlashAttentionStep(x2, activations.config.att_cap, - state.row_states[2].max, state.row_states[2].d, - v.Row(k_pos), v.Cols(), - att_out.Row(0) + out_offsets[2]); - } - if (position <= last_pos[3]) { - // Past the last position, x3 doesn't count. - float x3 = Dot(dbf, MakeConstSpan(q.Row(0) + q_offsets[3], qkv_dim), 0, - k.Row(k_pos), qkv_dim); - SingleFlashAttentionStep(x3, activations.config.att_cap, - state.row_states[3].max, state.row_states[3].d, - v.Row(k_pos), v.Cols(), - att_out.Row(0) + out_offsets[3]); - } - ++position; - } return state; } -// Rounds n to a number that can be used as the number of Q rows in a tile -// of flash attention. -static size_t RoundToSuitablePowerOf2(size_t n) { - if (n < 4) return 1; - if (n < 8) return 4; - if (n < 16) return 8; - if (n < 32) return 16; - return 32; -} - // The vertical tile size is determined by the ability to use tiling and the // target_parallelism. 
In practice the possible tile sizes in order of -// preference for efficiency are kNF, 4, 1, where kNF is likely to be 4 8 or -// 16. The final tile size is chosen to be the largest possible that allows -// for target_parallelism parallel tasks. +// preference for efficiency are 8, 4, 1. The final tile size is chosen to be +// the largest possible that allows for target_parallelism parallel tasks. size_t GetVTileSize(size_t kNF, size_t num_head_groups, size_t num_tokens, size_t total_tasks, size_t target_parallelism) { - const size_t kMaxEqualK = - RoundToSuitablePowerOf2(num_head_groups * num_tokens); - const size_t kMinTileSize = (total_tasks / 4 >= target_parallelism) ? 4 : 1; - return (kNF <= kMaxEqualK && total_tasks / kNF >= target_parallelism) - ? kNF - : std::min(kMinTileSize, kMaxEqualK); + const size_t kMaxEqualK = num_head_groups * num_tokens; + if (total_tasks / k8xNFVTileSize >= target_parallelism && + kMaxEqualK >= k8xNFVTileSize && kNF >= k8xNFVTileSize) { + return k8xNFVTileSize; + } + if (total_tasks / k4xNFVTileSize >= target_parallelism && + kMaxEqualK >= k4xNFVTileSize && kNF >= k4xNFVTileSize) { + return k4xNFVTileSize; + } + return 1; +} + +// Clears and fills the params vector with FlashAttentionParams for the given +// num_tokens, target_parallelism, and layer_idx. Computes tile sizes and +// offsets for each tile to achieve target_parallelism. +void ComputeFlashParams(size_t num_tokens, const size_t target_parallelism, + size_t layer_idx, AttentionActivationsPtrs& activations, + QBatch& qbatch, AttentionImpl attention_impl, + std::vector& params) { + const LayerConfig& layer_config = activations.config.layer_configs[layer_idx]; + const hwy::Divisor div_qbatch(qbatch.Size()); + const size_t qkv_dim = layer_config.qkv_dim; + using DF = hn::ScalableTag; + const DF df; + const size_t kNF = hn::Lanes(df); + + // A "head group" in the context of GQA refers to a collection of query + // heads that share the same key and value heads. 
+ const size_t kHeadGroups = layer_config.heads / layer_config.kv_heads; + const size_t cache_layer_size = layer_config.CacheLayerSize(); + const size_t token_batch = num_tokens * div_qbatch.GetDivisor(); + const size_t total_tasks = token_batch * layer_config.heads; + size_t kVTileSize = GetVTileSize(kNF, kHeadGroups, num_tokens, total_tasks, + target_parallelism); + // All layers should have the same number of heads. + HWY_DASSERT(activations.div_heads.GetDivisor() == layer_config.heads); + // To maximize adjacent tasks with the same kv matrices, task index is encoded + // thus: [qi][kv_head][batch_idx][head_group]. Note that the head index is + // split into kv_head and head_group, since the head_group does not affect + // the KV matrices, and kv_head does. batch_idx does not affect the KV + // matrices, but does affect the last position in the sequence. qi affects + // everything. + params.clear(); + for (uint32_t qi = 0; qi < div_qbatch.GetDivisor(); ++qi) { + for (uint32_t kv_head = 0; kv_head < layer_config.kv_heads; ++kv_head) { + const size_t head_offset = kv_head * qkv_dim * 2; + const uint32_t kv_offset = layer_idx * cache_layer_size + head_offset; + params.push_back(FlashAttentionParams{ + .qi_index = qi, + .kv_offset = kv_offset, + }); + for (uint32_t batch_idx = 0; batch_idx < num_tokens; ++batch_idx) { + const size_t pos = qbatch.Pos(qi) + batch_idx; + const size_t start_pos = StartPos(pos, activations.config, layer_idx); + size_t last = pos; + const size_t prefix_end = qbatch.PrefixEnd(qi); + if (prefix_end > 0 && prefix_end - 1 > last) { + // last_pos is inclusive. + last = prefix_end - 1; + } + for (size_t head_group = 0; head_group < kHeadGroups; ++head_group) { + size_t tasks_remaining = kHeadGroups - head_group + + kHeadGroups * (num_tokens - 1 - batch_idx); + // We want to fill a tile of size kVTileSize or k4xNFVTileSize if + // smaller, otherwise everything is singles to the next head group. 
+ size_t tasks_required = params.back().v_tile_size < k4xNFVTileSize + ? k4xNFVTileSize + : kVTileSize; + if (params.back().v_tile_size + tasks_remaining < tasks_required || + params.back().v_tile_size == kVTileSize) { + // We don't have enough tasks remaining to fill a tile, or the + // current tile is full so start new tile. + params.push_back(FlashAttentionParams{ + .qi_index = qi, + .kv_offset = kv_offset, + }); + } + const size_t head = head_group + kHeadGroups * kv_head; + const size_t tq_idx = div_qbatch.GetDivisor() * batch_idx + qi; + auto& param = params.back(); + size_t offset = param.v_tile_size; + param.q_offsets[offset] = activations.q_bf.Row(tq_idx) + + head * qkv_dim - activations.q_bf.Row(0); + param.out_offsets[offset] = activations.att_out.Row(tq_idx) + + head * qkv_dim - + activations.att_out.Row(0); + param.tq_idx[offset] = tq_idx; + param.start_pos[offset] = start_pos; + param.min_start_pos = HWY_MIN(param.min_start_pos, start_pos); + param.last_pos[offset] = last; + param.max_last_pos = HWY_MAX(param.max_last_pos, last); + ++param.v_tile_size; + } + } + } + } +} + +// Returns the maximum number of tiles needed for any query in the batch. +size_t GetMaxTiles(const std::vector& params, + const size_t kHTileSize) { + size_t max_tiles = 0; + for (const auto& param : params) { + size_t start = param.min_start_pos / kHTileSize; + size_t last = param.max_last_pos / kHTileSize; + max_tiles = HWY_MAX(last + 1 - start, max_tiles); + } + return max_tiles; +} + +// Splits params into smaller k-strips to allow for more parallelism. +// The strips are of size num_tiles_per_task * kHTileSize. +// split_params is cleared and filled with the split tasks. 
+void SplitTasksByKPos(std::vector& params, + const size_t kHTileSize, const size_t num_tiles_per_task, + const size_t out_stride, + std::vector& split_params) { + split_params.clear(); + for (auto& param : params) { + param.split_index = split_params.size(); + size_t start = param.min_start_pos / kHTileSize; + size_t last = param.max_last_pos / kHTileSize; + for (size_t tile_pos = start; tile_pos <= last; + tile_pos += num_tiles_per_task) { + auto& split_param = split_params.emplace_back(param); + split_param.i_of_n = (tile_pos - start) / num_tiles_per_task; + uint32_t tile_last = (tile_pos + num_tiles_per_task) * kHTileSize - 1; + if (tile_last < param.max_last_pos) { + split_param.max_last_pos = tile_last; + for (auto& last_pos : split_param.last_pos) { + last_pos = std::min(last_pos, tile_last); + } + } + uint32_t tile_start = tile_pos * kHTileSize; + if (tile_start > param.min_start_pos) { + split_param.min_start_pos = tile_start; + for (auto& start_pos : split_param.start_pos) { + start_pos = std::max(start_pos, tile_start); + } + } + if (split_param.i_of_n > 0) { + for (size_t i = 0; i < split_param.v_tile_size; ++i) { + split_param.tq_idx[i] = + param.tq_idx[i] * AttentionActivations::kThreadReplicationFactor + + split_param.i_of_n - 1; + split_param.out_offsets[i] = + param.out_offsets[i] + + (split_param.tq_idx[i] - param.tq_idx[i]) * out_stride; + } + } + } + } +} + +// Clears and fills activations.flash_params with FlashAttentionParams for the +// given num_tokens, target_parallelism, and layer_idx. Computes tile sizes and +// offsets for each tile to achieve target_parallelism. +// If the parallelism is insufficient for this processor type, and the sequence +// length is sufficient, the tiles are upgraded to k4xNFVTileSize and the tasks +// are split along the k positions to achieve the desired parallelism. +// If splitting was required, returns that factor by which the tiles were +// upgraded, k4xNFVTileSize, otherwise returns 0. 
+uint32_t ComputeAndSplitFlashParams(const size_t kNF, const size_t num_tokens, + const size_t target_parallelism, + size_t layer_idx, + AttentionActivationsPtrs& activations, + QBatch& qbatch, ThreadingContext& ctx, + AttentionImpl attention_impl) { + ComputeFlashParams(num_tokens, target_parallelism, layer_idx, activations, + qbatch, attention_impl, activations.flash_params); + if (activations.flash_params.size() < ctx.pools.MaxWorkers()) { + // Insufficient parallelism for this processor type. Try splitting along the + // k positions. + size_t max_tiles = GetMaxTiles(activations.flash_params, kNF); + size_t desired_tiles_per_task = hwy::DivCeil( + activations.flash_params.size() * max_tiles, ctx.pools.MaxWorkers()); + // The cost of combining split tasks is significant, so we want a minimum + // number of tiles per task, and we want to use k4xNFVTileSize if possible. + constexpr size_t kMinTilesPerTask = 4; + if (desired_tiles_per_task >= k4xNFVTileSize * kMinTilesPerTask) { + // We can afford to use k4xNFVTileSize vertically, so recompute params. + ComputeFlashParams(num_tokens, + activations.flash_params.size() / k4xNFVTileSize, + layer_idx, activations, qbatch, attention_impl, + activations.flash_params); + desired_tiles_per_task = + hwy::DivCeil(desired_tiles_per_task, k4xNFVTileSize); + SplitTasksByKPos(activations.flash_params, kNF, desired_tiles_per_task, + activations.att_out_reps.Stride(), + activations.split_flash_params); + return k4xNFVTileSize; + } + } + return 0; +} + +// Combines results from split tasks, processing kNumNF * NF qkv values where +// kNumNF can be 1 4 or 16. This enables the intermediate results to be held in +// registers, which speeds up the combination step significantly. 
+template +void CombineSplitTasks1416(hwy::Span params, + size_t tile_pos, size_t qkv_offset, + AttentionActivationsPtrs& activations) { + using DF = hn::ScalableTag; + const DF df; + using VF = hn::Vec; + const size_t kNF = hn::Lanes(df); + float overall_m = params[0].end_state.row_states[tile_pos].max; + float overall_d = params[0].end_state.row_states[tile_pos].d; + float* HWY_RESTRICT att_out = + activations.att_out.Row(0) + params[0].out_offsets[tile_pos] + qkv_offset; + VF result_0 = hn::Load(df, att_out); + VF result_1, result_2, result_3, result_4, result_5, result_6, result_7; + VF result_8, result_9, result_10, result_11, result_12, result_13, result_14; + VF result_15; + if constexpr (kNumNF > 1) { + result_1 = hn::Load(df, att_out + kNF); + result_2 = hn::Load(df, att_out + 2 * kNF); + result_3 = hn::Load(df, att_out + 3 * kNF); + } + if constexpr (kNumNF == 16) { + result_4 = hn::Load(df, att_out + 4 * kNF); + result_5 = hn::Load(df, att_out + 5 * kNF); + result_6 = hn::Load(df, att_out + 6 * kNF); + result_7 = hn::Load(df, att_out + 7 * kNF); + result_8 = hn::Load(df, att_out + 8 * kNF); + result_9 = hn::Load(df, att_out + 9 * kNF); + result_10 = hn::Load(df, att_out + 10 * kNF); + result_11 = hn::Load(df, att_out + 11 * kNF); + result_12 = hn::Load(df, att_out + 12 * kNF); + result_13 = hn::Load(df, att_out + 13 * kNF); + result_14 = hn::Load(df, att_out + 14 * kNF); + result_15 = hn::Load(df, att_out + 15 * kNF); + } + for (size_t i = 1; i < params.size() && params[i].i_of_n > 0; ++i) { + float m = params[i].end_state.row_states[tile_pos].max; + float d = params[i].end_state.row_states[tile_pos].d; + float new_m = std::max(overall_m, m); + // Scale factor for existing total given the change in max. + float old_scale = overall_d * std::exp(overall_m - new_m); + // Scale factor for new group to add. 
+ float new_scale = d * std::exp(m - new_m); + float new_d = old_scale + new_scale; + float one_over_d = 1.0f / new_d; + old_scale *= one_over_d; + new_scale *= one_over_d; + overall_m = new_m; + overall_d = new_d; + float* HWY_RESTRICT att_in = activations.att_out_reps.Row(0) + + params[i].out_offsets[tile_pos] + qkv_offset; + VF old_scale_vec = hn::Set(df, old_scale); + VF new_scale_vec = hn::Set(df, new_scale); + result_0 = hn::Mul(result_0, old_scale_vec); + result_0 = hn::MulAdd(hn::Load(df, att_in), new_scale_vec, result_0); + if constexpr (kNumNF > 1) { + result_1 = hn::Mul(result_1, old_scale_vec); + result_2 = hn::Mul(result_2, old_scale_vec); + result_3 = hn::Mul(result_3, old_scale_vec); + result_1 = + hn::MulAdd(hn::Load(df, att_in + kNF), new_scale_vec, result_1); + result_2 = + hn::MulAdd(hn::Load(df, att_in + 2 * kNF), new_scale_vec, result_2); + result_3 = + hn::MulAdd(hn::Load(df, att_in + 3 * kNF), new_scale_vec, result_3); + } + if constexpr (kNumNF == 16) { + result_4 = hn::Mul(result_4, old_scale_vec); + result_5 = hn::Mul(result_5, old_scale_vec); + result_6 = hn::Mul(result_6, old_scale_vec); + result_7 = hn::Mul(result_7, old_scale_vec); + result_8 = hn::Mul(result_8, old_scale_vec); + result_9 = hn::Mul(result_9, old_scale_vec); + result_10 = hn::Mul(result_10, old_scale_vec); + result_11 = hn::Mul(result_11, old_scale_vec); + result_12 = hn::Mul(result_12, old_scale_vec); + result_13 = hn::Mul(result_13, old_scale_vec); + result_14 = hn::Mul(result_14, old_scale_vec); + result_15 = hn::Mul(result_15, old_scale_vec); + result_4 = + hn::MulAdd(hn::Load(df, att_in + 4 * kNF), new_scale_vec, result_4); + result_5 = + hn::MulAdd(hn::Load(df, att_in + 5 * kNF), new_scale_vec, result_5); + result_6 = + hn::MulAdd(hn::Load(df, att_in + 6 * kNF), new_scale_vec, result_6); + result_7 = + hn::MulAdd(hn::Load(df, att_in + 7 * kNF), new_scale_vec, result_7); + result_8 = + hn::MulAdd(hn::Load(df, att_in + 8 * kNF), new_scale_vec, result_8); + result_9 
= + hn::MulAdd(hn::Load(df, att_in + 9 * kNF), new_scale_vec, result_9); + result_10 = + hn::MulAdd(hn::Load(df, att_in + 10 * kNF), new_scale_vec, result_10); + result_11 = + hn::MulAdd(hn::Load(df, att_in + 11 * kNF), new_scale_vec, result_11); + result_12 = + hn::MulAdd(hn::Load(df, att_in + 12 * kNF), new_scale_vec, result_12); + result_13 = + hn::MulAdd(hn::Load(df, att_in + 13 * kNF), new_scale_vec, result_13); + result_14 = + hn::MulAdd(hn::Load(df, att_in + 14 * kNF), new_scale_vec, result_14); + result_15 = + hn::MulAdd(hn::Load(df, att_in + 15 * kNF), new_scale_vec, result_15); + } + } + hn::Store(result_0, df, att_out); + if constexpr (kNumNF > 1) { + hn::Store(result_1, df, att_out + kNF); + hn::Store(result_2, df, att_out + 2 * kNF); + hn::Store(result_3, df, att_out + 3 * kNF); + } + if constexpr (kNumNF == 16) { + hn::Store(result_4, df, att_out + 4 * kNF); + hn::Store(result_5, df, att_out + 5 * kNF); + hn::Store(result_6, df, att_out + 6 * kNF); + hn::Store(result_7, df, att_out + 7 * kNF); + hn::Store(result_8, df, att_out + 8 * kNF); + hn::Store(result_9, df, att_out + 9 * kNF); + hn::Store(result_10, df, att_out + 10 * kNF); + hn::Store(result_11, df, att_out + 11 * kNF); + hn::Store(result_12, df, att_out + 12 * kNF); + hn::Store(result_13, df, att_out + 13 * kNF); + hn::Store(result_14, df, att_out + 14 * kNF); + hn::Store(result_15, df, att_out + 15 * kNF); + } +} + +// Recombines results from split tasks, activations.att_out_reps -> +// activations.att_out. Instead of repeatedly calling MultiplyByConstAndAdd, +// which reads/writes the sum each time, the result is kept entirely in +// registers, and the task is split into 16NF, 4NF, and NF chunks, so that there +// are enough registers to hold the intermediate results. 
+void CombineSplitTasks(size_t qkv_dim, uint32_t tile_factor, + AttentionActivationsPtrs& activations, + ThreadingContext& ctx) { + GCPP_ZONE(ctx, 0, Zones::kFlashAttentionCombineSplit); + using DF = hn::ScalableTag; + const DF df; + const size_t kNF = hn::Lanes(df); + uint32_t num_16 = qkv_dim / (16 * kNF); + uint32_t num_4 = (qkv_dim - kNF * 16 * num_16) / (4 * kNF); + uint32_t num_1 = hwy::DivCeil(qkv_dim - kNF * (16 * num_16 + 4 * num_4), kNF); + uint32_t tasks_per_qkv = num_16 + num_4 + num_1; + ParallelFor( + Parallelism::kFlat, + activations.flash_params.size() * tasks_per_qkv * tile_factor, ctx, + /*cluster_idx=*/0, Callers::kFlashAttention, + [&](size_t p, size_t worker) { + uint32_t tile = p / tasks_per_qkv; + uint32_t p_idx = + activations.flash_params[tile / tile_factor].split_index; + const auto& param = activations.split_flash_params[p_idx]; + size_t remaining_params = activations.split_flash_params.size() - p_idx; + tile %= tile_factor; + if (tile >= param.v_tile_size) return; + int32_t qkv_task = p % tasks_per_qkv; + if (qkv_task < num_16) { + uint32_t qkv_offset = qkv_task * 16 * kNF; + CombineSplitTasks1416<16>( + hwy::Span(¶m, remaining_params), + tile, qkv_offset, activations); + } else if (qkv_task < num_16 + num_4) { + uint32_t qkv_offset = (num_16 * 16 + (qkv_task - num_16) * 4) * kNF; + CombineSplitTasks1416<4>( + hwy::Span(¶m, remaining_params), + tile, qkv_offset, activations); + } else { + uint32_t qkv_offset = + (num_16 * 16 + num_4 * 4 + (qkv_task - num_16 - num_4)) * kNF; + CombineSplitTasks1416<1>( + hwy::Span(¶m, remaining_params), + tile, qkv_offset, activations); + } + }); } // The nominal aim of attention is to combine 3 inputs Q[L,D], K[L,D], V[L,D] @@ -759,58 +1252,45 @@ size_t GetVTileSize(size_t kNF, size_t num_head_groups, size_t num_tokens, // the one row of O takes L(4D+3) reads and L(D+3) writes. // For the whole of Q, this is L^2(4D+3) reads and L^2(D+3) writes. 
// -// Flash attention fuses these operations together, and has 3 operating modes: -// 1. NF rows of the result computed using tiles of registers of shape NFx8. -// 2. 4 rows of the result computed using tiles of registers of shape 4xNF. -// 3. One row (of Q and the result) at a time. -// In all cases the intermediate result (Q.KT) is never stored to memory. -// NF is the number of float lanes in a register, being 16 for AVX3. The softmax -// is converted to streaming form using the algorithm from: -// https://courses.cs.washington.edu/courses/cse599m/23sp/notes/flashattn.pdf. -// Q is transposed to Q_T[D,L] to make the dot product computation efficient. -// -// In mode 1: -// QDotKTileFloat computes NF Q rows x 8 K timesteps of Q.K dot products in one -// go, reducing reads of Q by 8 and reads of K by NF. The streaming softmax is -// computed entirely in registers, and a further NF registers to accumulate the -// results of the product of the softmax and V, reduce the number of reads of V -// by NF, and the reads/writes of O by 8. -// The reads are thus reduced to 2DL^2(1/8+1/NF) and writes reduced to DL^2/8, -// which on AVX3 is an overall reduction by about a factor of 10. -// Mode 1 can only be accessed if there is a large Qbatch size, or in multi-turn -// prefill, since in other cases, there is either a single K timestep (prefill) -// or a single num_heads set of Q rows (decode). -// -// In mode 2, the 4 rows of Q are computed against NF K timesteps in a tile, -// reducing the reads of Q by NF, and the reads of K by 4. The softmax and -// accumulation of the result is done in registers, cutting the reads of V by 4. -// The reads/writes of O are reduced by a factor of NF. -// The overall reduction is limited by the need to use gather to load K. -// Transposing K would be possible, but is complicated by the wraparound. 
-// Mode 2 can be used in all cases when there are at least 4 attention heads, -// but it may be prefereable to use mode 3 when the batch size is small to -// maximise parallelism. -// -// In mode 3, a single row of Q is computed against a single K timestep at a -// time, using SingleFlashAttention. In this case there is no reduction in the -// reads of Q or K, or V, or O, but the reads/writes of the intermediate A are -// still eliminated. +// Flash attention fuses these operations together, and operates on tiles of +// n Q rows x NF K positions, accumulated in n registers, where n is in +// {1, 4, 8} and NF is the number of float lanes in a register, being 16 for +// AVX3. This reduces the number of reads of Q by NF and reads of K by n. The +// softmax is converted to streaming form using the algorithm from: +// https://courses.cs.washington.edu/courses/cse599m/23sp/notes/flashattn.pdf, +// which eliminates the need to store A to memory. The accumulated Q.KT result +// is passed via the streaming softmax directly to the A.V step. +// To make the dot product computation more efficient, Q, K, and V are stored +// as BF16 and K is transposed to shape: +// [seq_len / NF, layers * kv_heads * qkv_dim/2 * NF * 2], where the /2, *2 +// represents that pairs of qkv_dim elements are kept together to make best +// use of BF16 dot product instructions, where each pair of adjacent BF16 +// values from Q and K are mul-added into a single F32 result. // // A further complication is that real attention is not as simple as documented // in the paper and above. There are multiple query heads, differing KV, and -// different sequence lengths, so a lot of the work in FlashAttention is making -// sure that a collection of q rows with the same KV and sequence length are -// grouped together so that mode 1 or 2 can be used, and choosing which of the -// 3 modes to use for best efficiency. 
+// different sequence lengths, and the difference between prefill and decode, +// so a lot of the work in FlashAttention is making sure that a collection of q +// rows with the same KV and sequence length are grouped together so that the +// largest possible tiles can be used. This is dealt with by the +// ComputeAndSplitFlashParams() function. void FlashAttention(const size_t num_tokens, const size_t target_parallelism, const size_t layer_idx, const MatPtr& query_norm_scale, AttentionActivationsPtrs& activations, QBatch& qbatch, - ThreadingContext& ctx) { + ThreadingContext& ctx, AttentionImpl attention_impl) { GCPP_ZONE(ctx, 0, Zones::kFlashAttentionInclusive); RMSNormAndPositionalEncoding(num_tokens, qbatch, activations.q, query_norm_scale, layer_idx, activations, ctx); - const hwy::Divisor div_qbatch(qbatch.Size()); + const LayerConfig& layer_config = activations.config.layer_configs[layer_idx]; + const size_t qkv_dim = layer_config.qkv_dim; + const size_t seq_len = + static_cast(activations.div_seq_len.GetDivisor()); + + using DF = hn::ScalableTag; + const DF df; + const size_t kNF = hn::Lanes(df); // Compress q to q_bf. + // TODO(rays): Move this into RMSNormAndPositionalEncoding(). ParallelFor( Parallelism::kWithinCluster, activations.q.Rows(), ctx, /*cluster_idx=*/0, Callers::kFlashAttention, @@ -821,168 +1301,53 @@ void FlashAttention(const size_t num_tokens, const size_t target_parallelism, df, activations.q.Row(row), activations.q.Cols(), tls, MakeSpan(activations.q_bf.Row(row), activations.q_bf.Cols()), 0); }); - const LayerConfig& layer_config = activations.config.layer_configs[layer_idx]; - const size_t qkv_dim = layer_config.qkv_dim; - - // A "head group" in the context of GQA refers to a collection of query - // heads that share the same key and value heads. 
- const size_t kHeadGroups = layer_config.heads / layer_config.kv_heads; - const size_t cache_layer_size = layer_config.CacheLayerSize(); - const size_t seq_len = - static_cast(activations.div_seq_len.GetDivisor()); - const size_t token_batch = num_tokens * div_qbatch.GetDivisor(); - const size_t total_tasks = token_batch * layer_config.heads; - - using DF = hn::ScalableTag; - const DF df; - const size_t kNF = hn::Lanes(df); - constexpr size_t kMaxNF = hn::MaxLanes(df); - HWY_DASSERT(kNF <= kMaxNF); - const size_t kVTileSize = GetVTileSize(kNF, kHeadGroups, num_tokens, - total_tasks, target_parallelism); - // Only transpose Q if we are using tiling. - if (kVTileSize == kNF) { - size_t max_last = 0, min_start = std::numeric_limits::max(); - for (size_t qi = 0; qi < qbatch.Size(); ++qi) { - size_t pos = qbatch.Pos(qi); - const size_t start = StartPos(pos, activations.config, layer_idx); - pos += num_tokens - 1; - const size_t end = qbatch.PrefixEnd(qi); - if (end > 0 && end - 1 > pos) { - pos = end - 1; - } - max_last = std::max(max_last, pos); - min_start = std::min(min_start, start); - } - if (max_last - min_start + 1 >= kNFx8HTileSize) { - // q has shape [batch, qbatch][head, qkv_dim]. - // We transpose it to [qkv_dim][qbatch, head, batch] in order to make the - // maximum possible number of consecutive columns have the same KV - // matrices. Each thread will process a tile of NF columns of QT so the - // starting column index of QT is just the task index * kVTileSize. - TransposeQ(activations.q, activations.q_T, qbatch.Size(), ctx); - } - } - const size_t num_thread_tasks = hwy::DivCeil(total_tasks, kVTileSize); - const hwy::Divisor div_tokens(num_tokens); - // All layers should have the same number of heads. - HWY_DASSERT(activations.div_heads.GetDivisor() == layer_config.heads); + int tile_factor = + ComputeAndSplitFlashParams(kNF, num_tokens, target_parallelism, layer_idx, + activations, qbatch, ctx, attention_impl); + auto& params = tile_factor >= 1 ? 
activations.split_flash_params + : activations.flash_params; + size_t num_tasks = params.size(); // For each head/token/query, compute fused flash Q.K, softmax and weighted V. const auto func = [&](const size_t task, size_t worker) HWY_ATTR { GCPP_ZONE(ctx, worker, Zones::kFlashAttentionFlashAttention); - // Offsets into original Q for each row in the tile. - uint32_t q_offsets[kMaxNF]; - // Offsets into att_out for each row in the tile. - uint32_t out_offsets[kMaxNF]; - // Start positions for each row in the tile. - size_t start_positions[kMaxNF]; - // Last positions for each row in the tile. Inclusive. - uint32_t last_pos[kMaxNF]; - // min and max last positions across all rows in the tile determines when - // TileFlashAttention switches to single vector mode to handle the - // ragged sequence lengths. - size_t min_last_pos = std::numeric_limits::max(); - size_t max_last_pos = 0; - // Indices into the qbatch.KV for each row in the tile. - size_t qi_indices[kMaxNF]; - // Indices into the kv_cache for each row in the tile. - size_t kv_offsets[kMaxNF]; - // first_task is [qbatch, head, token]. - const size_t first_task = task * kVTileSize; - const size_t last_task = first_task + kVTileSize - 1; - bool use_tile_attention = kVTileSize > 1 && last_task < total_tasks; - for (size_t offset = 0; - offset < kVTileSize && first_task + offset < total_tasks; ++offset) { - const size_t batch_idx = div_tokens.Remainder(first_task + offset); - const size_t qh = div_tokens.Divide(first_task + offset); - const size_t head = activations.div_heads.Remainder(qh); - const size_t qi = activations.div_heads.Divide(qh); - const size_t tq_idx = div_qbatch.GetDivisor() * batch_idx + qi; - qi_indices[offset] = qi; - - // Find the token position in the query and calculate - // the range of cache positions to attend to. 
- const size_t pos = qbatch.Pos(qi) + batch_idx; - const size_t start_pos = StartPos(pos, activations.config, layer_idx); - start_positions[offset] = start_pos; - size_t last = pos; - const size_t prefix_end = qbatch.PrefixEnd(qi); - if (prefix_end > 0 && prefix_end - 1 > last) { - // last_pos in `TileFlashAttention` is inclusive. - last = prefix_end - 1; - } - last_pos[offset] = last; - min_last_pos = HWY_MIN(min_last_pos, last); - max_last_pos = HWY_MAX(max_last_pos, last); - q_offsets[offset] = activations.q_bf.Row(tq_idx) + head * qkv_dim - - activations.q_bf.Row(0); - out_offsets[offset] = activations.att_out.Row(tq_idx) + head * qkv_dim - - activations.att_out.Row(0); - const size_t kv_index = head / kHeadGroups; - const size_t head_offset = kv_index * qkv_dim * 2; - kv_offsets[offset] = layer_idx * cache_layer_size + head_offset; - // If any of the parameters in this if statement differ within this task, - // then we can't use TileFlashAttention. TileFlashAttention requires that - // all rows in the tile have the same K and V matrices, and Q starts at - // the same position. The end positions do not have to be the equal. - if (start_positions[offset] != start_positions[0] || - qi_indices[offset] != qi_indices[0] || - kv_offsets[offset] != kv_offsets[0]) { - use_tile_attention = false; - } - } - for (size_t offset = 0; - offset < kVTileSize && first_task + offset < total_tasks; ++offset) { - auto& kv_cache = qbatch.KV(qi_indices[offset]).kv_cache; - MatPtrT k("k_view", Extents2D(seq_len, qkv_dim)); - k.SetPtr(kv_cache.Row(0) + kv_offsets[offset], kv_cache.Stride()); - MatPtrT v("v_view", Extents2D(seq_len, qkv_dim)); - v.SetPtr(kv_cache.Row(0) + kv_offsets[offset] + qkv_dim, - kv_cache.Stride()); - if (use_tile_attention) { - // To avoid duplicating the code to setup K and V, the call to - // TileFlashAttention is inside the loop over tasks, even though it - // handles all rows in the task at once. 
- StridedView qT = - StridedView(activations.q_T.Row(0) + first_task, kVTileSize, - activations.q_T.Stride()); - if (kVTileSize == kNF) { - // We can still use TileFlashAttention even if we didn't transpose Q - // above. The condition used for transposing Q above is more general - // and easier to compute than the condition used within - // TileFlashAttention that min_last_pos - start_positions[offset] < - // kNFx8HTileSize. In this case, qT is never used. Some tasks might - // use qT and some might not, which is why the more general condition - // is used above to catch all cases where qT will be used. - TileFlashAttention(activations.q_bf, q_offsets, qT, k, - start_positions[offset], last_pos, min_last_pos, - max_last_pos, v, layer_idx, activations, - activations.att_out, out_offsets, ctx, worker); - } else if (kVTileSize == 4) { - TileFlashAttention4(activations.q_bf, q_offsets, k, - start_positions[offset], last_pos, min_last_pos, - max_last_pos, v, layer_idx, activations, - activations.att_out, out_offsets, ctx, worker); - } else { - HWY_UNREACHABLE; - } - break; - } else { - SingleFlashAttention(start_positions[offset], last_pos[offset], - activations.q_bf.Row(0) + q_offsets[offset], k, v, - layer_idx, activations, - activations.att_out.Row(0) + out_offsets[offset], - ctx, worker); - } + auto& param = params[task]; + auto& kv_cache = qbatch.KV(param.qi_index).kv_cache; + auto& kT_cache = qbatch.KV(param.qi_index).k_cache; + MatPtrT kT("k_T_view", Extents2D(hwy::DivCeil(seq_len, 2 * kNF), + qkv_dim * 2 * kNF)); + kT.SetPtr(kT_cache.Row(0) + param.kv_offset * kNF, kT_cache.Stride()); + MatPtrT v("v_view", Extents2D(seq_len, qkv_dim)); + v.SetPtr(kv_cache.Row(0) + param.kv_offset + qkv_dim, kv_cache.Stride()); + auto& vT_cache = qbatch.KV(param.qi_index).v_cache; + MatPtrT vT("v_T_view", Extents2D(hwy::DivCeil(seq_len, 2 * kNF), + qkv_dim * 2 * kNF)); + vT.SetPtr(vT_cache.Row(0) + param.kv_offset * kNF, vT_cache.Stride()); + MatPtrT& att_out = + param.i_of_n == 
0 ? activations.att_out : activations.att_out_reps; + if (param.v_tile_size == k8xNFVTileSize) { + param.end_state = TileFlashAttention148( + param, activations.q_bf, kT, vT, layer_idx, activations, att_out, + qkv_dim, ctx, worker, attention_impl); + } else if (param.v_tile_size == k4xNFVTileSize) { + param.end_state = TileFlashAttention148( + param, activations.q_bf, kT, vT, layer_idx, activations, att_out, + qkv_dim, ctx, worker, attention_impl); + } else { + param.end_state = TileFlashAttention148<1>( + param, activations.q_bf, kT, vT, layer_idx, activations, att_out, + qkv_dim, ctx, worker, attention_impl); } }; { PROFILER_ZONE("Gen.FlashAttention.ForkJoin"); // Full parallelism is helpful, SmallParallelFor is insufficient. - HierarchicalParallelFor(num_thread_tasks, ctx, Callers::kFlashAttention, - func); + HierarchicalParallelFor(num_tasks, ctx, Callers::kFlashAttention, func); + } + if (tile_factor >= 1) { + // Run the flash attention correction on the partial outputs. + CombineSplitTasks(qkv_dim, tile_factor, activations, ctx); } } diff --git a/gemma/flash_attention.h b/gemma/flash_attention.h index 8f3ec21..6466674 100644 --- a/gemma/flash_attention.h +++ b/gemma/flash_attention.h @@ -44,22 +44,13 @@ namespace gcpp { float* HWY_RESTRICT att_out, \ ThreadingContext& ctx, size_t worker); \ \ - Tile4FlashState TileFlashAttention4( \ - const MatPtrT& q, const uint32_t* HWY_RESTRICT q_offsets, \ - const MatPtrT& k, size_t start_pos, \ - const uint32_t* HWY_RESTRICT last_pos, size_t min_last_pos, \ - size_t max_last_pos, const MatPtrT& v, size_t layer_idx, \ - const LayerWeightsPtrs& layer, const AttentionActivations& activations, \ - MatPtrT& att_out, const uint32_t* HWY_RESTRICT out_offsets, \ - ThreadingContext& ctx, const size_t worker); \ - \ size_t GetVTileSize(size_t kNF, size_t num_head_groups, size_t num_tokens, \ size_t total_tasks, size_t target_parallelism); \ \ void FlashAttention(size_t num_tokens, size_t target_parallelism, \ size_t layer_idx, 
const MatPtr& query_norm_scale, \ AttentionActivationsPtrs& activations, QBatch& qbatch, \ - ThreadingContext& ctx); \ + ThreadingContext& ctx, AttentionImpl attention_impl); \ \ /* NOLINTNEXTLINE(google-readability-namespace-comments) */ \ } // namespace NAMESPACE diff --git a/gemma/flash_attention_test.cc b/gemma/flash_attention_test.cc index f0a90fa..85e73a8 100644 --- a/gemma/flash_attention_test.cc +++ b/gemma/flash_attention_test.cc @@ -62,16 +62,17 @@ namespace HWY_NAMESPACE { using FloatPtr = hwy::AlignedFreeUniquePtr; -void SetMat(const size_t offset, MatPtrT& mat) { +template +void SetMat(const size_t offset, MatPtrT& mat) { const size_t kOuter = mat.Extents().rows; const size_t kInner = mat.Extents().cols; const float i_scale = 1.0f / kInner; const float j_scale = 1.0f / kOuter; for (size_t i = 0; i < kOuter; ++i) { - float* row = mat.Row(i); + T* row = mat.Row(i); for (size_t j = 0; j < kInner; ++j) { - row[j] = - static_cast((i * kInner * i_scale + (j + offset) * j_scale)); + row[j] = hwy::ConvertScalarTo( + static_cast((i * kInner * i_scale + (j + offset) * j_scale))); } } } @@ -94,14 +95,15 @@ void AssertClose(const MatPtrT& a, const MatPtrT& b) { if (rel_abs_delta > 0.0f) { rel_abs_delta /= std::max(std::abs(a_row[c]), std::abs(b_row[c])); } - EXPECT_LT(rel_abs_delta, 1e-5) + EXPECT_LT(rel_abs_delta, 1e-3) << "a[" << r << "," << c << "]=" << a_row[c] << ", b[" << r << "," << c << "]=" << b_row[c]; } } } -void TestFlashAttention(size_t target_parallelism) { +void TestFlashAttention(size_t target_parallelism, + AttentionImpl attention_impl) { ThreadingArgs threading_args; ThreadingContext ctx(threading_args); constexpr size_t kOuter = 1024; @@ -112,7 +114,9 @@ void TestFlashAttention(size_t target_parallelism) { const LayerConfig& layer_config = config.layer_configs[0]; const LayerWeightsPtrs layers(0, layer_config, tensor_info_registry); InferenceArgs inference_args; - inference_args.attention_impl = "flash"; + // attention_impl must be old in order 
for the att intermediate to be + // allocated for the old attention. + inference_args.attention_impl = "old"; RuntimeConfig runtime_config; inference_args.CopyTo(runtime_config); KVCache kv_cache(config, inference_args, ctx.allocator); @@ -129,7 +133,8 @@ void TestFlashAttention(size_t target_parallelism) { const size_t batch_size = kOuter; std::vector> row_ptrs; AttentionActivations attention_storage(config, layer_config, batch_size, - kOuter, runtime_config, ctx.allocator, + kOuter, runtime_config, + ctx.pools.MaxWorkers(), ctx.allocator, row_ptrs); AttentionActivationsPtrs attention(config, kOuter, attention_storage); const size_t qkv_dim = layer_config.qkv_dim; @@ -140,7 +145,10 @@ void TestFlashAttention(size_t target_parallelism) { const size_t kHeadGroups = layer_config.heads / layer_config.kv_heads; const size_t seq_len = static_cast(attention.div_seq_len.GetDivisor()); + MaybeReshapeCache(qbatch.KV(0).kv_cache, qbatch.KV(0).k_cache); + MaybeReshapeCache(qbatch.KV(0).kv_cache, qbatch.KV(0).v_cache); auto& kvc = qbatch.KV(0).kv_cache; + const size_t kFloatsPerTile = 2 * FloatsPerVector(); for (size_t h = 0; h < layer_config.heads; ++h) { // Make strided views into the kv cache for // this query and head. 
@@ -151,6 +159,17 @@ void TestFlashAttention(size_t target_parallelism) { v.SetPtr(kvc.Row(0) + head_offset + qkv_dim, kvc.Stride()); SetMat(h + layer_config.heads, k); SetMat(h + layer_config.heads * 2, v); + for (size_t p = 0; p < tokens.size(); ++p) { + KV_t* HWY_RESTRICT k_src = k.Row(p); + KV_t* HWY_RESTRICT k_dest = qbatch.KV(0).k_cache.Row(p / kFloatsPerTile) + + head_offset * kFloatsPerTile / 2 + + p % kFloatsPerTile * 2; + KV_t* HWY_RESTRICT v_dest = qbatch.KV(0).v_cache.Row(p / kFloatsPerTile) + + head_offset * kFloatsPerTile / 2 + + p % kFloatsPerTile * kFloatsPerTile; + + TransposeKVCacheRow(k_src, k_dest, v_dest, qkv_dim); + } } SetMat(1, attention.q); DotSoftmaxWeightedSum(tokens.size(), 0, layers.query_norm_scale, attention, @@ -165,18 +184,19 @@ void TestFlashAttention(size_t target_parallelism) { tokens.size() * div_qbatch.GetDivisor() * layer_config.heads; const size_t kVTileSize = GetVTileSize(kNF, kHeadGroups, tokens.size(), total_tasks, target_parallelism); - printf("FlashAttention: target_parallelism=%zu, kNF=%zu, kVTileSize=%zu\n", - target_parallelism, kNF, kVTileSize); + printf("FlashAttention: parallelism=%zu, kNF=%zu, kVTileSize=%zu, mode %s\n", + target_parallelism, kNF, kVTileSize, + GetAttentionImplName(attention_impl).c_str()); FlashAttention(tokens.size(), target_parallelism, 0, layers.query_norm_scale, - attention, qbatch, ctx); + attention, qbatch, ctx, attention_impl); AssertClose(attention.att_out, *saved_att); ctx.profiler.PrintResults(); } void TestAttention() { - TestFlashAttention(8192); - TestFlashAttention(2048); - TestFlashAttention(256); + TestFlashAttention(8192, AttentionImpl::kFlash); + TestFlashAttention(2048, AttentionImpl::kFlash); + TestFlashAttention(256, AttentionImpl::kFlash); } // NOLINTNEXTLINE(google-readability-namespace-comments) diff --git a/gemma/flash_structs.h b/gemma/flash_structs.h index 73563fe..6e35a4d 100644 --- a/gemma/flash_structs.h +++ b/gemma/flash_structs.h @@ -2,11 +2,19 @@ #define 
THIRD_PARTY_GEMMA_CPP_GEMMA_FLASH_STRUCTS_H_ #include +#include #include namespace gcpp { +// The vertical tile size in flash attention when register lanes correspond to +// K-timesteps, and the number of registers is 4 for 4 Q-rows. +static constexpr size_t k4xNFVTileSize = 4; +// The vertical tile size in flash attention when register lanes correspond to +// K-timesteps, and the number of registers is 8 for 8 Q-rows. +static constexpr size_t k8xNFVTileSize = 8; + // State for computing softmax in a streaming ("online") manner, // avoiding large intermediate values by subtracting the running maximum. // For a sequence x_1, ..., x_n: @@ -20,10 +28,44 @@ struct OnlineSoftmaxState { float d = 0.0f; }; -static constexpr size_t kVTileSize4 = 4; - struct Tile4FlashState { - OnlineSoftmaxState row_states[kVTileSize4]; + OnlineSoftmaxState row_states[k8xNFVTileSize]; +}; + +// Parameters for a strip of tiles of flash attention. For processing a strip +// of tiles, each of 1, k4xNFVTileSize, or k8xNFVTileSize Q-rows, by NF +// k-positions. The total width of the strip might cover the entire sequence, +// or a part of it, depending on whether the strip has been split. +struct FlashAttentionParams { + // Vertical tile size gives the number used in the k8xNFVTileSize arrays. + // It is the number of Q rows in the tile. + uint32_t v_tile_size = 0; + // min start position across all rows in the tile determines the + // mask used for the tile. + uint32_t min_start_pos = std::numeric_limits::max(); + // max last position across all rows in the tile determines the mask + // used for the tile. + uint32_t max_last_pos = 0; + // Index into the qbatch.KV is the same for each row in the tile. + uint32_t qi_index; + // Index into the kv_cache is the same for each row in the tile. + uint32_t kv_offset; + // Index, among the split tasks, of the first split task of the original task. + uint32_t split_index = 0; + // The index of the split for running split attention.
+ uint32_t i_of_n = 0; + // Offsets into original Q for each row in the tile. + uint32_t q_offsets[k8xNFVTileSize]; + // Offsets into att_out for each row in the tile. + uint32_t out_offsets[k8xNFVTileSize]; + // Start k-positions for each row in the tile. + uint32_t start_pos[k8xNFVTileSize]; + // Last k-positions for each row in the tile. Inclusive. + uint32_t last_pos[k8xNFVTileSize]; + // Row index to att_out. + uint32_t tq_idx[k8xNFVTileSize]; + // Flash attention state for the tile. + Tile4FlashState end_state; }; } // namespace gcpp diff --git a/gemma/gemma.cc b/gemma/gemma.cc index 5a48d00..2450af8 100644 --- a/gemma/gemma.cc +++ b/gemma/gemma.cc @@ -83,9 +83,8 @@ void Attention(LayerAttentionType type, const size_t num_tokens, if (type == LayerAttentionType::kGemma) { // TODO: remove flag to enable FlashAttention. - GemmaAttention( - num_tokens, layer_idx, layer, activations.attention, qbatch, env, - AttentionImplToFlags(activations.attention_impl, HWY_NATIVE_DOT_BF16)); + GemmaAttention(num_tokens, layer_idx, layer, activations.attention, qbatch, + env, activations.attention_impl, /*flags=*/0); } } @@ -595,6 +594,9 @@ static void GenerateT(const ModelConfig& config, const size_t max_gen_steps = PrefillTBatchOrQBatch( config, runtime_config, weights, activations, qbatch, env, timing_info); + // No-op if the profiler is disabled, but useful to separate prefill and + // generate phases for profiling. + env.ctx.profiler.PrintResults(); hwy::BitSet4096<> non_eos; // indexed by qi diff --git a/gemma/kv_cache.cc b/gemma/kv_cache.cc index 49276f8..e241c34 100644 --- a/gemma/kv_cache.cc +++ b/gemma/kv_cache.cc @@ -43,6 +43,17 @@ static size_t CappedSeqLen(const ModelConfig& config, KVCache::KVCache(const Extents2D& kv_extents, const Allocator& allocator) : kv_cache("kv", kv_extents, allocator, MatPadding::kOdd), + // WARNING: the rows and cols of k_cache and v_cache will be modified + // before use! 
+ // The rows will be reduced by a factor of 2xkFloatsPerVector, and the + // cols will be increased by 2xkFloatsPerVector on first use. This is to + // avoid making KVCache another class that has to be duplicated for each + // machine architecture, since kFloatsPerVector is architecture dependent. + // The change in shape is safe only if the padding is kPacked. + k_cache("k", Extents2D(kv_extents.rows, kv_extents.cols / 2), allocator, + MatPadding::kPacked), + v_cache("v", Extents2D(kv_extents.rows, kv_extents.cols / 2), allocator, + MatPadding::kPacked), allocator_(allocator) {} KVCache::KVCache(const ModelConfig& config, const InferenceArgs& inference_args, @@ -55,6 +66,8 @@ KVCache KVCache::Copy() { KVCache copy(kv_cache.Extents(), allocator_); CopyMat(kv_cache, copy.kv_cache); + CopyMat(k_cache, copy.k_cache); + CopyMat(v_cache, copy.v_cache); return copy; } diff --git a/gemma/kv_cache.h b/gemma/kv_cache.h index fe6a1ff..3d5d821 100644 --- a/gemma/kv_cache.h +++ b/gemma/kv_cache.h @@ -30,7 +30,7 @@ namespace gcpp { -using KV_t = float; +using KV_t = BF16; // A non-owning view of a KVCache. struct KVCachePtr { @@ -38,6 +38,8 @@ struct KVCachePtr { size_t SeqLen() const; MatPtrT kv_cache; + MatPtrT k_cache; + MatPtrT v_cache; }; struct KVCache { @@ -52,10 +54,33 @@ struct KVCache { } MatStorageT kv_cache; // [seq_len, layers * kv_heads * qkv_dim * 2] + // The format of k_cache indicates that there are pairs of values from + // qkv_dim in groups of 2x kFloatsPerVector(=NF) elements from the sequence, + // in groups of qkv_dim/2 elements in groups of kv_heads elements. + // This enables sequential loading of the data when filling 2 vectors with + // NF sequence elements of pairs of BF16 qkv values. The next vector then + // continues reading the rest of qkv. + // [seq_len / 2NF, layers * kv_heads * qkv_dim/2 * 2NF * 2] + MatStorageT k_cache; + // v_cache is formatted to allow sequential access to V during scaling and + // update of att_out.
+ // Originally [seq_len, layers * kv_heads * qkv_dim] + // v_cache is transposed to: + // [layers, kv_heads, seq_len, qkv_dim], reshaped to: + // [layers, kv_heads, seq_len/(2NF), 2NF, qkv_dim/(2NF), 2NF] + // then transposed to: + // [seq_len/(2NF), layers, kv_heads, qkv_dim/(2NF), 2NF, 2NF] + // and finally packed in a 2D MatStorageT as: + // [seq_len/(2NF), layers * kv_heads * qkv_dim/(2NF) * 2NF * 2NF] + // This allows sequential reads of 2NF registers each of 2NF BF16 values, + // repeatedly until all of qkv_dim is read. + MatStorageT v_cache; KVCachePtr ToPtr() { return KVCachePtr{ .kv_cache = kv_cache, + .k_cache = k_cache, + .v_cache = v_cache, }; } diff --git a/ops/ops-inl.h b/ops/ops-inl.h index 0eeec31..c68b6c5 100644 --- a/ops/ops-inl.h +++ b/ops/ops-inl.h @@ -614,267 +614,6 @@ HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAdd(const float c, }); } -template , HWY_IF_V_SIZE_GT_D(DF, 63)> -HWY_INLINE HWY_MAYBE_UNUSED void Mul16(DF df, const VF scale, VF& sum0, - VF& sum1, VF& sum2, VF& sum3, VF& sum4, - VF& sum5, VF& sum6, VF& sum7, VF& sum8, - VF& sum9, VF& sum10, VF& sum11, - VF& sum12, VF& sum13, VF& sum14, - VF& sum15) { - sum0 = hn::Mul(sum0, hn::BroadcastLane<0>(scale)); - sum1 = hn::Mul(sum1, hn::BroadcastLane<1>(scale)); - sum2 = hn::Mul(sum2, hn::BroadcastLane<2>(scale)); - sum3 = hn::Mul(sum3, hn::BroadcastLane<3>(scale)); - sum4 = hn::Mul(sum4, hn::BroadcastLane<4>(scale)); - sum5 = hn::Mul(sum5, hn::BroadcastLane<5>(scale)); - sum6 = hn::Mul(sum6, hn::BroadcastLane<6>(scale)); - sum7 = hn::Mul(sum7, hn::BroadcastLane<7>(scale)); - sum8 = hn::Mul(sum8, hn::BroadcastLane<8>(scale)); - sum9 = hn::Mul(sum9, hn::BroadcastLane<9>(scale)); - sum10 = hn::Mul(sum10, hn::BroadcastLane<10>(scale)); - sum11 = hn::Mul(sum11, hn::BroadcastLane<11>(scale)); - sum12 = hn::Mul(sum12, hn::BroadcastLane<12>(scale)); - sum13 = hn::Mul(sum13, hn::BroadcastLane<13>(scale)); - sum14 = hn::Mul(sum14, hn::BroadcastLane<14>(scale)); - sum15 = hn::Mul(sum15, 
hn::BroadcastLane<15>(scale)); -} - -template , HWY_IF_V_SIZE_LE_D(DF, 63)> -HWY_INLINE HWY_MAYBE_UNUSED void Mul16(DF df, const VF scale, VF& sum0, - VF& sum1, VF& sum2, VF& sum3, VF& sum4, - VF& sum5, VF& sum6, VF& sum7, VF& sum8, - VF& sum9, VF& sum10, VF& sum11, - VF& sum12, VF& sum13, VF& sum14, - VF& sum15) {} - -template , HWY_IF_V_SIZE_GT_D(DF, 31)> -HWY_INLINE HWY_MAYBE_UNUSED void Mul8(DF df, const VF scale, VF& sum0, VF& sum1, - VF& sum2, VF& sum3, VF& sum4, VF& sum5, - VF& sum6, VF& sum7) { - sum0 = hn::Mul(sum0, hn::BroadcastLane<0>(scale)); - sum1 = hn::Mul(sum1, hn::BroadcastLane<1>(scale)); - sum2 = hn::Mul(sum2, hn::BroadcastLane<2>(scale)); - sum3 = hn::Mul(sum3, hn::BroadcastLane<3>(scale)); - sum4 = hn::Mul(sum4, hn::BroadcastLane<4>(scale)); - sum5 = hn::Mul(sum5, hn::BroadcastLane<5>(scale)); - sum6 = hn::Mul(sum6, hn::BroadcastLane<6>(scale)); - sum7 = hn::Mul(sum7, hn::BroadcastLane<7>(scale)); -} - -template , HWY_IF_V_SIZE_LE_D(DF, 31)> -HWY_INLINE HWY_MAYBE_UNUSED void Mul8(DF df, const VF scale, VF& sum0, VF& sum1, - VF& sum2, VF& sum3, VF& sum4, VF& sum5, - VF& sum6, VF& sum7) {} - -template , HWY_IF_V_SIZE_GT_D(DF, 63)> -HWY_INLINE HWY_MAYBE_UNUSED void MulAdd16( - DF df, const VF common, const VF split, VF& sum0, VF& sum1, VF& sum2, - VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7, VF& sum8, VF& sum9, - VF& sum10, VF& sum11, VF& sum12, VF& sum13, VF& sum14, VF& sum15) { - sum0 = hn::MulAdd(common, hn::BroadcastLane<0>(split), sum0); - sum1 = hn::MulAdd(common, hn::BroadcastLane<1>(split), sum1); - sum2 = hn::MulAdd(common, hn::BroadcastLane<2>(split), sum2); - sum3 = hn::MulAdd(common, hn::BroadcastLane<3>(split), sum3); - sum4 = hn::MulAdd(common, hn::BroadcastLane<4>(split), sum4); - sum5 = hn::MulAdd(common, hn::BroadcastLane<5>(split), sum5); - sum6 = hn::MulAdd(common, hn::BroadcastLane<6>(split), sum6); - sum7 = hn::MulAdd(common, hn::BroadcastLane<7>(split), sum7); - sum8 = hn::MulAdd(common, hn::BroadcastLane<8>(split), 
sum8); - sum9 = hn::MulAdd(common, hn::BroadcastLane<9>(split), sum9); - sum10 = hn::MulAdd(common, hn::BroadcastLane<10>(split), sum10); - sum11 = hn::MulAdd(common, hn::BroadcastLane<11>(split), sum11); - sum12 = hn::MulAdd(common, hn::BroadcastLane<12>(split), sum12); - sum13 = hn::MulAdd(common, hn::BroadcastLane<13>(split), sum13); - sum14 = hn::MulAdd(common, hn::BroadcastLane<14>(split), sum14); - sum15 = hn::MulAdd(common, hn::BroadcastLane<15>(split), sum15); -} - -template , HWY_IF_V_SIZE_LE_D(DF, 63)> -HWY_INLINE HWY_MAYBE_UNUSED void MulAdd16( - DF df, const VF common, const VF split, VF& sum0, VF& sum1, VF& sum2, - VF& sum3, VF& sum4, VF& sum5, VF& sum6, VF& sum7, VF& sum8, VF& sum9, - VF& sum10, VF& sum11, VF& sum12, VF& sum13, VF& sum14, VF& sum15) {} - -template , HWY_IF_V_SIZE_GT_D(DF, 31)> -HWY_INLINE HWY_MAYBE_UNUSED void MulAdd8(DF df, const VF common, const VF split, - VF& sum0, VF& sum1, VF& sum2, VF& sum3, - VF& sum4, VF& sum5, VF& sum6, - VF& sum7) { - sum0 = hn::MulAdd(common, hn::BroadcastLane<0>(split), sum0); - sum1 = hn::MulAdd(common, hn::BroadcastLane<1>(split), sum1); - sum2 = hn::MulAdd(common, hn::BroadcastLane<2>(split), sum2); - sum3 = hn::MulAdd(common, hn::BroadcastLane<3>(split), sum3); - sum4 = hn::MulAdd(common, hn::BroadcastLane<4>(split), sum4); - sum5 = hn::MulAdd(common, hn::BroadcastLane<5>(split), sum5); - sum6 = hn::MulAdd(common, hn::BroadcastLane<6>(split), sum6); - sum7 = hn::MulAdd(common, hn::BroadcastLane<7>(split), sum7); -} - -template , HWY_IF_V_SIZE_LE_D(DF, 31)> -HWY_INLINE HWY_MAYBE_UNUSED void MulAdd8(DF df, const VF common, const VF split, - VF& sum0, VF& sum1, VF& sum2, VF& sum3, - VF& sum4, VF& sum5, VF& sum6, - VF& sum7) {} - -template > -HWY_INLINE HWY_MAYBE_UNUSED void MulAdd4(DF df, const VF common, const VF split, - VF& sum0, VF& sum1, VF& sum2, - VF& sum3) { - sum0 = hn::MulAdd(common, hn::BroadcastLane<0>(split), sum0); - sum1 = hn::MulAdd(common, hn::BroadcastLane<1>(split), sum1); - sum2 = 
hn::MulAdd(common, hn::BroadcastLane<2>(split), sum2); - sum3 = hn::MulAdd(common, hn::BroadcastLane<3>(split), sum3); -} - -// For an 8xNF tile of float values in 8xNF-lane registers, multiplies 8 rows -// of V by the corresponding values in c0-c7 and adds them to NF rows of out, -// after first prescaling out by scale. -// The depth (size) must be a multiple of NF. -template > -HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddTile( - DF df, const VF scale, const VF c0, const VF c1, const VF c2, const VF c3, - const VF c4, const VF c5, const VF c6, const VF c7, const MatPtrT& v, - const size_t* HWY_RESTRICT pos, float* HWY_RESTRICT out, - const uint32_t* HWY_RESTRICT out_offsets, const size_t size) { - namespace hn = hwy::HWY_NAMESPACE; - HWY_LANES_CONSTEXPR size_t NF = hn::Lanes(df); - - size_t i = 0; - while (i + NF <= size) { - if HWY_LANES_CONSTEXPR (NF == 16) { - VF out0, out1, out2, out3, out4, out5, out6, out7; - VF out8, out9, out10, out11, out12, out13, out14, out15; - out0 = hn::Load(df, out + i + out_offsets[0]); - out1 = hn::Load(df, out + i + out_offsets[1]); - out2 = hn::Load(df, out + i + out_offsets[2]); - out3 = hn::Load(df, out + i + out_offsets[3]); - out4 = hn::Load(df, out + i + out_offsets[4]); - out5 = hn::Load(df, out + i + out_offsets[5]); - out6 = hn::Load(df, out + i + out_offsets[6]); - out7 = hn::Load(df, out + i + out_offsets[7]); - out8 = hn::Load(df, out + i + out_offsets[8]); - out9 = hn::Load(df, out + i + out_offsets[9]); - out10 = hn::Load(df, out + i + out_offsets[10]); - out11 = hn::Load(df, out + i + out_offsets[11]); - out12 = hn::Load(df, out + i + out_offsets[12]); - out13 = hn::Load(df, out + i + out_offsets[13]); - out14 = hn::Load(df, out + i + out_offsets[14]); - out15 = hn::Load(df, out + i + out_offsets[15]); - Mul16(df, scale, out0, out1, out2, out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - VF x0 = hn::Load(df, v.Row(pos[0]) + i); - MulAdd16(df, x0, c0, out0, out1, out2, 
out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - VF x1 = hn::Load(df, v.Row(pos[1]) + i); - MulAdd16(df, x1, c1, out0, out1, out2, out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - VF x2 = hn::Load(df, v.Row(pos[2]) + i); - MulAdd16(df, x2, c2, out0, out1, out2, out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - VF x3 = hn::Load(df, v.Row(pos[3]) + i); - MulAdd16(df, x3, c3, out0, out1, out2, out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - VF x4 = hn::Load(df, v.Row(pos[4]) + i); - MulAdd16(df, x4, c4, out0, out1, out2, out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - VF x5 = hn::Load(df, v.Row(pos[5]) + i); - MulAdd16(df, x5, c5, out0, out1, out2, out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - VF x6 = hn::Load(df, v.Row(pos[6]) + i); - MulAdd16(df, x6, c6, out0, out1, out2, out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - VF x7 = hn::Load(df, v.Row(pos[7]) + i); - MulAdd16(df, x7, c7, out0, out1, out2, out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - hn::Store(out0, df, out + i + out_offsets[0]); - hn::Store(out1, df, out + i + out_offsets[1]); - hn::Store(out2, df, out + i + out_offsets[2]); - hn::Store(out3, df, out + i + out_offsets[3]); - hn::Store(out4, df, out + i + out_offsets[4]); - hn::Store(out5, df, out + i + out_offsets[5]); - hn::Store(out6, df, out + i + out_offsets[6]); - hn::Store(out7, df, out + i + out_offsets[7]); - hn::Store(out8, df, out + i + out_offsets[8]); - hn::Store(out9, df, out + i + out_offsets[9]); - hn::Store(out10, df, out + i + out_offsets[10]); - hn::Store(out11, df, out + i + out_offsets[11]); - hn::Store(out12, df, out + i + out_offsets[12]); - hn::Store(out13, df, out + i + out_offsets[13]); - hn::Store(out14, 
df, out + i + out_offsets[14]); - hn::Store(out15, df, out + i + out_offsets[15]); - } - if HWY_LANES_CONSTEXPR (NF == 8) { - VF out0, out1, out2, out3, out4, out5, out6, out7; - out0 = hn::Load(df, out + i + out_offsets[0]); - out1 = hn::Load(df, out + i + out_offsets[1]); - out2 = hn::Load(df, out + i + out_offsets[2]); - out3 = hn::Load(df, out + i + out_offsets[3]); - out4 = hn::Load(df, out + i + out_offsets[4]); - out5 = hn::Load(df, out + i + out_offsets[5]); - out6 = hn::Load(df, out + i + out_offsets[6]); - out7 = hn::Load(df, out + i + out_offsets[7]); - Mul8(df, scale, out0, out1, out2, out3, out4, out5, out6, out7); - VF x0 = hn::Load(df, v.Row(pos[0]) + i); - MulAdd8(df, x0, c0, out0, out1, out2, out3, out4, out5, out6, out7); - VF x1 = hn::Load(df, v.Row(pos[1]) + i); - MulAdd8(df, x1, c1, out0, out1, out2, out3, out4, out5, out6, out7); - VF x2 = hn::Load(df, v.Row(pos[2]) + i); - MulAdd8(df, x2, c2, out0, out1, out2, out3, out4, out5, out6, out7); - VF x3 = hn::Load(df, v.Row(pos[3]) + i); - MulAdd8(df, x3, c3, out0, out1, out2, out3, out4, out5, out6, out7); - VF x4 = hn::Load(df, v.Row(pos[4]) + i); - MulAdd8(df, x4, c4, out0, out1, out2, out3, out4, out5, out6, out7); - VF x5 = hn::Load(df, v.Row(pos[5]) + i); - MulAdd8(df, x5, c5, out0, out1, out2, out3, out4, out5, out6, out7); - VF x6 = hn::Load(df, v.Row(pos[6]) + i); - MulAdd8(df, x6, c6, out0, out1, out2, out3, out4, out5, out6, out7); - VF x7 = hn::Load(df, v.Row(pos[7]) + i); - MulAdd8(df, x7, c7, out0, out1, out2, out3, out4, out5, out6, out7); - hn::Store(out0, df, out + i + out_offsets[0]); - hn::Store(out1, df, out + i + out_offsets[1]); - hn::Store(out2, df, out + i + out_offsets[2]); - hn::Store(out3, df, out + i + out_offsets[3]); - hn::Store(out4, df, out + i + out_offsets[4]); - hn::Store(out5, df, out + i + out_offsets[5]); - hn::Store(out6, df, out + i + out_offsets[6]); - hn::Store(out7, df, out + i + out_offsets[7]); - } - if HWY_LANES_CONSTEXPR (NF == 4) { - VF out0, out1, 
out2, out3; - out0 = hn::Load(df, out + i + out_offsets[0]); - out1 = hn::Load(df, out + i + out_offsets[1]); - out2 = hn::Load(df, out + i + out_offsets[2]); - out3 = hn::Load(df, out + i + out_offsets[3]); - out0 = hn::Mul(out0, hn::BroadcastLane<0>(scale)); - out1 = hn::Mul(out1, hn::BroadcastLane<1>(scale)); - out2 = hn::Mul(out2, hn::BroadcastLane<2>(scale)); - out3 = hn::Mul(out3, hn::BroadcastLane<3>(scale)); - VF x0 = hn::Load(df, v.Row(pos[0]) + i); - MulAdd4(df, x0, c0, out0, out1, out2, out3); - VF x1 = hn::Load(df, v.Row(pos[1]) + i); - MulAdd4(df, x1, c1, out0, out1, out2, out3); - VF x2 = hn::Load(df, v.Row(pos[2]) + i); - MulAdd4(df, x2, c2, out0, out1, out2, out3); - VF x3 = hn::Load(df, v.Row(pos[3]) + i); - MulAdd4(df, x3, c3, out0, out1, out2, out3); - VF x4 = hn::Load(df, v.Row(pos[4]) + i); - MulAdd4(df, x4, c4, out0, out1, out2, out3); - VF x5 = hn::Load(df, v.Row(pos[5]) + i); - MulAdd4(df, x5, c5, out0, out1, out2, out3); - VF x6 = hn::Load(df, v.Row(pos[6]) + i); - MulAdd4(df, x6, c6, out0, out1, out2, out3); - VF x7 = hn::Load(df, v.Row(pos[7]) + i); - MulAdd4(df, x7, c7, out0, out1, out2, out3); - hn::Store(out0, df, out + i + out_offsets[0]); - hn::Store(out1, df, out + i + out_offsets[1]); - hn::Store(out2, df, out + i + out_offsets[2]); - hn::Store(out3, df, out + i + out_offsets[3]); - } - i += NF; - } - HWY_DASSERT(size == i); -} - template > HWY_INLINE HWY_MAYBE_UNUSED void MulAdd4(DF df, const VF common, const VF c0, const VF c1, const VF c2, const VF c3, @@ -887,240 +626,134 @@ HWY_INLINE HWY_MAYBE_UNUSED void MulAdd4(DF df, const VF common, const VF c0, } template > -HWY_INLINE HWY_MAYBE_UNUSED void MulAdd4Lanes(DF df, const MatPtrT& v, - const size_t* HWY_RESTRICT pos, - const size_t offset, const VF c0, - const VF c1, const VF c2, - const VF c3, VF& sum0, VF& sum1, - VF& sum2, VF& sum3) { - // TODO(rays): Check whether a transpose of c0-c3 is applicable and faster. 
- VF x0 = hn::Load(df, v.Row(pos[0]) + offset); - MulAdd4(df, x0, hn::BroadcastLane<0>(c0), hn::BroadcastLane<0>(c1), - hn::BroadcastLane<0>(c2), hn::BroadcastLane<0>(c3), sum0, sum1, sum2, - sum3); - VF x1 = hn::Load(df, v.Row(pos[1]) + offset); - MulAdd4(df, x1, hn::BroadcastLane<1>(c0), hn::BroadcastLane<1>(c1), - hn::BroadcastLane<1>(c2), hn::BroadcastLane<1>(c3), sum0, sum1, sum2, - sum3); - VF x2 = hn::Load(df, v.Row(pos[2]) + offset); - MulAdd4(df, x2, hn::BroadcastLane<2>(c0), hn::BroadcastLane<2>(c1), - hn::BroadcastLane<2>(c2), hn::BroadcastLane<2>(c3), sum0, sum1, sum2, - sum3); - VF x3 = hn::Load(df, v.Row(pos[3]) + offset); - MulAdd4(df, x3, hn::BroadcastLane<3>(c0), hn::BroadcastLane<3>(c1), - hn::BroadcastLane<3>(c2), hn::BroadcastLane<3>(c3), sum0, sum1, sum2, - sum3); +HWY_INLINE HWY_MAYBE_UNUSED void MulAddNLanesVT4( + DF df, const BF16* HWY_RESTRICT v, const float* HWY_RESTRICT c, + const size_t num_lanes, VF& sum0a, VF& sum1a, VF& sum2a, VF& sum3a, + VF& sum0b, VF& sum1b, VF& sum2b, VF& sum3b) { + using DBF = hn::ScalableTag; + const DBF dbf; + using VBF = hn::Vec; + const size_t kNF = hn::Lanes(df); + for (size_t lane = 0; lane < num_lanes; ++lane, v += 2 * kNF) { + VBF v0 = hn::Load(dbf, v); + VF c0 = hn::Set(df, *c++); + VF c1 = hn::Set(df, *c++); + VF c2 = hn::Set(df, *c++); + VF c3 = hn::Set(df, *c++); + VF v0a = hn::PromoteLowerTo(df, v0); + VF v0b = hn::PromoteUpperTo(df, v0); + MulAdd4(df, v0a, c0, c1, c2, c3, sum0a, sum1a, sum2a, sum3a); + MulAdd4(df, v0b, c0, c1, c2, c3, sum0b, sum1b, sum2b, sum3b); + } } -template , HWY_IF_V_SIZE_GT_D(DF, 31)> -HWY_INLINE HWY_MAYBE_UNUSED void MulAddSecond4Lanes( - DF df, const MatPtrT& v, const size_t* HWY_RESTRICT pos, - const size_t offset, const VF c0, const VF c1, const VF c2, const VF c3, - VF& sum0, VF& sum1, VF& sum2, VF& sum3) { - VF x4 = hn::Load(df, v.Row(pos[4]) + offset); - MulAdd4(df, x4, hn::BroadcastLane<4>(c0), hn::BroadcastLane<4>(c1), - hn::BroadcastLane<4>(c2), 
hn::BroadcastLane<4>(c3), sum0, sum1, sum2, - sum3); - VF x5 = hn::Load(df, v.Row(pos[5]) + offset); - MulAdd4(df, x5, hn::BroadcastLane<5>(c0), hn::BroadcastLane<5>(c1), - hn::BroadcastLane<5>(c2), hn::BroadcastLane<5>(c3), sum0, sum1, sum2, - sum3); - VF x6 = hn::Load(df, v.Row(pos[6]) + offset); - MulAdd4(df, x6, hn::BroadcastLane<6>(c0), hn::BroadcastLane<6>(c1), - hn::BroadcastLane<6>(c2), hn::BroadcastLane<6>(c3), sum0, sum1, sum2, - sum3); - VF x7 = hn::Load(df, v.Row(pos[7]) + offset); - MulAdd4(df, x7, hn::BroadcastLane<7>(c0), hn::BroadcastLane<7>(c1), - hn::BroadcastLane<7>(c2), hn::BroadcastLane<7>(c3), sum0, sum1, sum2, - sum3); -} - -template , HWY_IF_V_SIZE_LE_D(DF, 31)> -HWY_INLINE HWY_MAYBE_UNUSED void MulAddSecond4Lanes( - DF df, const MatPtrT& v, const size_t* HWY_RESTRICT pos, - const size_t offset, const VF c0, const VF c1, const VF c2, const VF c3, - VF& sum0, VF& sum1, VF& sum2, VF& sum3) {} - -template , HWY_IF_V_SIZE_GT_D(DF, 63)> -HWY_INLINE HWY_MAYBE_UNUSED void MulAddSecond8Lanes( - DF df, const MatPtrT& v, const size_t* HWY_RESTRICT pos, - const size_t offset, const VF c0, const VF c1, const VF c2, const VF c3, - VF& sum0, VF& sum1, VF& sum2, VF& sum3) { - VF x8 = hn::Load(df, v.Row(pos[8]) + offset); - MulAdd4(df, x8, hn::BroadcastLane<8>(c0), hn::BroadcastLane<8>(c1), - hn::BroadcastLane<8>(c2), hn::BroadcastLane<8>(c3), sum0, sum1, sum2, - sum3); - VF x9 = hn::Load(df, v.Row(pos[9]) + offset); - MulAdd4(df, x9, hn::BroadcastLane<9>(c0), hn::BroadcastLane<9>(c1), - hn::BroadcastLane<9>(c2), hn::BroadcastLane<9>(c3), sum0, sum1, sum2, - sum3); - VF x10 = hn::Load(df, v.Row(pos[10]) + offset); - MulAdd4(df, x10, hn::BroadcastLane<10>(c0), hn::BroadcastLane<10>(c1), - hn::BroadcastLane<10>(c2), hn::BroadcastLane<10>(c3), sum0, sum1, - sum2, sum3); - VF x11 = hn::Load(df, v.Row(pos[11]) + offset); - MulAdd4(df, x11, hn::BroadcastLane<11>(c0), hn::BroadcastLane<11>(c1), - hn::BroadcastLane<11>(c2), hn::BroadcastLane<11>(c3), sum0, sum1, - 
sum2, sum3); - VF x12 = hn::Load(df, v.Row(pos[12]) + offset); - MulAdd4(df, x12, hn::BroadcastLane<12>(c0), hn::BroadcastLane<12>(c1), - hn::BroadcastLane<12>(c2), hn::BroadcastLane<12>(c3), sum0, sum1, - sum2, sum3); - VF x13 = hn::Load(df, v.Row(pos[13]) + offset); - MulAdd4(df, x13, hn::BroadcastLane<13>(c0), hn::BroadcastLane<13>(c1), - hn::BroadcastLane<13>(c2), hn::BroadcastLane<13>(c3), sum0, sum1, - sum2, sum3); - VF x14 = hn::Load(df, v.Row(pos[14]) + offset); - MulAdd4(df, x14, hn::BroadcastLane<14>(c0), hn::BroadcastLane<14>(c1), - hn::BroadcastLane<14>(c2), hn::BroadcastLane<14>(c3), sum0, sum1, - sum2, sum3); - VF x15 = hn::Load(df, v.Row(pos[15]) + offset); - MulAdd4(df, x15, hn::BroadcastLane<15>(c0), hn::BroadcastLane<15>(c1), - hn::BroadcastLane<15>(c2), hn::BroadcastLane<15>(c3), sum0, sum1, - sum2, sum3); -} - -template , HWY_IF_V_SIZE_LE_D(DF, 63)> -HWY_INLINE HWY_MAYBE_UNUSED void MulAddSecond8Lanes( - DF df, const MatPtrT& v, const size_t* HWY_RESTRICT pos, - const size_t offset, const VF c0, const VF c1, const VF c2, const VF c3, - VF& sum0, VF& sum1, VF& sum2, VF& sum3) {} - -// For an NFx4 tile of float values in 4xNF-lane registers, multiplies NF rows -// of V by the corresponding values in c0-c3 and adds them to NF rows of out, +// For a 2NFx4 tile of float values in 8xNF-lane registers, multiplies 2NF rows +// of V by the corresponding values in c00-c31 and adds them to 2NF rows of out, // after first prescaling out by scale. -// The depth (size) must be a multiple of NF. +// The depth (size) must be a multiple of 2NF. 
template > -HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddTile4( - DF df, const float* HWY_RESTRICT scales, const VF c0, const VF c1, - const VF c2, const VF c3, const MatPtrT& v, - const size_t* HWY_RESTRICT pos, float* HWY_RESTRICT out, +HWY_INLINE HWY_MAYBE_UNUSED void MulByConstAndAddVT4Mem( + DF df, const float* HWY_RESTRICT scales, const VF c00, const VF c01, + const VF c10, const VF c11, const VF c20, const VF c21, const VF c30, + const VF c31, const MatPtrT& v, const size_t* HWY_RESTRICT pos, + size_t num_lanes, float* HWY_RESTRICT out, const uint32_t* HWY_RESTRICT out_offsets, const size_t size) { namespace hn = hwy::HWY_NAMESPACE; HWY_LANES_CONSTEXPR size_t NF = hn::Lanes(df); + constexpr size_t kMaxNF = hn::MaxLanes(df); + const BF16* HWY_RESTRICT v_bf = v.Row(pos[0] / (2 * NF)); + HWY_DASSERT(pos[0] % (2 * NF) == 0); + HWY_ALIGN float c_mem[8 * kMaxNF]; + hn::StoreInterleaved4(c00, c10, c20, c30, df, c_mem); + hn::StoreInterleaved4(c01, c11, c21, c31, df, c_mem + 4 * NF); size_t i = 0; - while (i + NF <= size) { - VF out0, out1, out2, out3; - out0 = hn::Load(df, out + i + out_offsets[0]); - out1 = hn::Load(df, out + i + out_offsets[1]); - out2 = hn::Load(df, out + i + out_offsets[2]); - out3 = hn::Load(df, out + i + out_offsets[3]); - out0 = hn::Mul(out0, hn::Set(df, scales[0])); - out1 = hn::Mul(out1, hn::Set(df, scales[1])); - out2 = hn::Mul(out2, hn::Set(df, scales[2])); - out3 = hn::Mul(out3, hn::Set(df, scales[3])); - MulAdd4Lanes(df, v, pos, i, c0, c1, c2, c3, out0, out1, out2, out3); - if HWY_LANES_CONSTEXPR (NF >= 8) { - MulAddSecond4Lanes(df, v, pos, i, c0, c1, c2, c3, out0, out1, out2, out3); - if HWY_LANES_CONSTEXPR (NF >= 16) { - MulAddSecond8Lanes(df, v, pos, i, c0, c1, c2, c3, out0, out1, out2, - out3); - } - } - hn::Store(out0, df, out + i + out_offsets[0]); - hn::Store(out1, df, out + i + out_offsets[1]); - hn::Store(out2, df, out + i + out_offsets[2]); - hn::Store(out3, df, out + i + out_offsets[3]); - i += NF; + while (i + NF * 2 
<= size) { + VF out0a, out1a, out2a, out3a, out0b, out1b, out2b, out3b; + out0a = hn::Load(df, out + i + out_offsets[0]); + out1a = hn::Load(df, out + i + out_offsets[1]); + out2a = hn::Load(df, out + i + out_offsets[2]); + out3a = hn::Load(df, out + i + out_offsets[3]); + VF scale0 = hn::Set(df, scales[0]); + VF scale1 = hn::Set(df, scales[1]); + VF scale2 = hn::Set(df, scales[2]); + VF scale3 = hn::Set(df, scales[3]); + out0a = hn::Mul(out0a, scale0); + out1a = hn::Mul(out1a, scale1); + out2a = hn::Mul(out2a, scale2); + out3a = hn::Mul(out3a, scale3); + out0b = hn::Load(df, out + i + NF + out_offsets[0]); + out1b = hn::Load(df, out + i + NF + out_offsets[1]); + out2b = hn::Load(df, out + i + NF + out_offsets[2]); + out3b = hn::Load(df, out + i + NF + out_offsets[3]); + out0b = hn::Mul(out0b, scale0); + out1b = hn::Mul(out1b, scale1); + out2b = hn::Mul(out2b, scale2); + out3b = hn::Mul(out3b, scale3); + MulAddNLanesVT4(df, v_bf, c_mem, HWY_MIN(num_lanes, 2 * NF), out0a, out1a, + out2a, out3a, out0b, out1b, out2b, out3b); + hn::Store(out0a, df, out + i + out_offsets[0]); + hn::Store(out1a, df, out + i + out_offsets[1]); + hn::Store(out2a, df, out + i + out_offsets[2]); + hn::Store(out3a, df, out + i + out_offsets[3]); + hn::Store(out0b, df, out + i + NF + out_offsets[0]); + hn::Store(out1b, df, out + i + NF + out_offsets[1]); + hn::Store(out2b, df, out + i + NF + out_offsets[2]); + hn::Store(out3b, df, out + i + NF + out_offsets[3]); + i += NF * 2; + v_bf += 4 * NF * NF; } HWY_DASSERT(size == i); } -// Prescales NF rows of out by scale, then multiplies 1 row of V by the -// corresponding values in c0 and adds them to the NF rows of out. -// The depth (size) must be a multiple of NF. 
template > -HWY_NOINLINE HWY_MAYBE_UNUSED void MulByConstAndAddVector( - DF df, const VF scale, const VF c0, const MatPtrT& v, - const size_t pos, float* HWY_RESTRICT out, - const uint32_t* HWY_RESTRICT out_offsets, const size_t size) { +HWY_INLINE HWY_MAYBE_UNUSED void MulAddNLanesVT1(DF df, + const BF16* HWY_RESTRICT v, + const float* HWY_RESTRICT c, + const size_t num_lanes, + VF& sum0a, VF& sum0b) { + using DBF = hn::ScalableTag; + const DBF dbf; + using VBF = hn::Vec; + const size_t kNF = hn::Lanes(df); + for (size_t lane = 0; lane < num_lanes; ++lane, v += 2 * kNF) { + VBF v0 = hn::Load(dbf, v); + VF c0 = hn::Set(df, *c++); + VF v0a = hn::PromoteLowerTo(df, v0); + VF v0b = hn::PromoteUpperTo(df, v0); + sum0a = hn::MulAdd(v0a, c0, sum0a); + sum0b = hn::MulAdd(v0b, c0, sum0b); + } +} + +template > +HWY_INLINE HWY_MAYBE_UNUSED void MulByConstAndAddVT1Mem( + DF df, const float* HWY_RESTRICT scales, const VF c00, const VF c01, + const MatPtrT& v, const size_t* HWY_RESTRICT pos, size_t num_lanes, + float* HWY_RESTRICT out, const uint32_t* HWY_RESTRICT out_offsets, + const size_t size) { namespace hn = hwy::HWY_NAMESPACE; HWY_LANES_CONSTEXPR size_t NF = hn::Lanes(df); + constexpr size_t kMaxNF = hn::MaxLanes(df); + const BF16* HWY_RESTRICT v_bf = v.Row(pos[0] / (2 * NF)); + HWY_DASSERT(pos[0] % (2 * NF) == 0); + HWY_ALIGN float c_mem[2 * kMaxNF]; + hn::Store(c00, df, c_mem); + hn::Store(c01, df, c_mem + NF); size_t i = 0; - while (i + NF <= size) { - if HWY_LANES_CONSTEXPR (NF == 16) { - VF out0, out1, out2, out3, out4, out5, out6, out7; - VF out8, out9, out10, out11, out12, out13, out14, out15; - out0 = hn::Load(df, out + i + out_offsets[0]); - out1 = hn::Load(df, out + i + out_offsets[1]); - out2 = hn::Load(df, out + i + out_offsets[2]); - out3 = hn::Load(df, out + i + out_offsets[3]); - out4 = hn::Load(df, out + i + out_offsets[4]); - out5 = hn::Load(df, out + i + out_offsets[5]); - out6 = hn::Load(df, out + i + out_offsets[6]); - out7 = hn::Load(df, out + i + 
out_offsets[7]); - out8 = hn::Load(df, out + i + out_offsets[8]); - out9 = hn::Load(df, out + i + out_offsets[9]); - out10 = hn::Load(df, out + i + out_offsets[10]); - out11 = hn::Load(df, out + i + out_offsets[11]); - out12 = hn::Load(df, out + i + out_offsets[12]); - out13 = hn::Load(df, out + i + out_offsets[13]); - out14 = hn::Load(df, out + i + out_offsets[14]); - out15 = hn::Load(df, out + i + out_offsets[15]); - Mul16(df, scale, out0, out1, out2, out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - VF x0 = hn::Load(df, v.Row(pos) + i); - MulAdd16(df, x0, c0, out0, out1, out2, out3, out4, out5, out6, out7, out8, - out9, out10, out11, out12, out13, out14, out15); - hn::Store(out0, df, out + i + out_offsets[0]); - hn::Store(out1, df, out + i + out_offsets[1]); - hn::Store(out2, df, out + i + out_offsets[2]); - hn::Store(out3, df, out + i + out_offsets[3]); - hn::Store(out4, df, out + i + out_offsets[4]); - hn::Store(out5, df, out + i + out_offsets[5]); - hn::Store(out6, df, out + i + out_offsets[6]); - hn::Store(out7, df, out + i + out_offsets[7]); - hn::Store(out8, df, out + i + out_offsets[8]); - hn::Store(out9, df, out + i + out_offsets[9]); - hn::Store(out10, df, out + i + out_offsets[10]); - hn::Store(out11, df, out + i + out_offsets[11]); - hn::Store(out12, df, out + i + out_offsets[12]); - hn::Store(out13, df, out + i + out_offsets[13]); - hn::Store(out14, df, out + i + out_offsets[14]); - hn::Store(out15, df, out + i + out_offsets[15]); - } - if HWY_LANES_CONSTEXPR (NF == 8) { - VF out0, out1, out2, out3, out4, out5, out6, out7; - out0 = hn::Load(df, out + i + out_offsets[0]); - out1 = hn::Load(df, out + i + out_offsets[1]); - out2 = hn::Load(df, out + i + out_offsets[2]); - out3 = hn::Load(df, out + i + out_offsets[3]); - out4 = hn::Load(df, out + i + out_offsets[4]); - out5 = hn::Load(df, out + i + out_offsets[5]); - out6 = hn::Load(df, out + i + out_offsets[6]); - out7 = hn::Load(df, out + i + out_offsets[7]); - 
Mul8(df, scale, out0, out1, out2, out3, out4, out5, out6, out7); - VF x0 = hn::Load(df, v.Row(pos) + i); - MulAdd8(df, x0, c0, out0, out1, out2, out3, out4, out5, out6, out7); - hn::Store(out0, df, out + i + out_offsets[0]); - hn::Store(out1, df, out + i + out_offsets[1]); - hn::Store(out2, df, out + i + out_offsets[2]); - hn::Store(out3, df, out + i + out_offsets[3]); - hn::Store(out4, df, out + i + out_offsets[4]); - hn::Store(out5, df, out + i + out_offsets[5]); - hn::Store(out6, df, out + i + out_offsets[6]); - hn::Store(out7, df, out + i + out_offsets[7]); - } - if HWY_LANES_CONSTEXPR (NF == 4) { - VF out0, out1, out2, out3; - out0 = hn::Load(df, out + i + out_offsets[0]); - out1 = hn::Load(df, out + i + out_offsets[1]); - out2 = hn::Load(df, out + i + out_offsets[2]); - out3 = hn::Load(df, out + i + out_offsets[3]); - out0 = hn::Mul(out0, hn::BroadcastLane<0>(scale)); - out1 = hn::Mul(out1, hn::BroadcastLane<1>(scale)); - out2 = hn::Mul(out2, hn::BroadcastLane<2>(scale)); - out3 = hn::Mul(out3, hn::BroadcastLane<3>(scale)); - VF x0 = hn::Load(df, v.Row(pos) + i); - MulAdd4(df, x0, c0, out0, out1, out2, out3); - hn::Store(out0, df, out + i + out_offsets[0]); - hn::Store(out1, df, out + i + out_offsets[1]); - hn::Store(out2, df, out + i + out_offsets[2]); - hn::Store(out3, df, out + i + out_offsets[3]); - } - i += NF; + while (i + NF * 2 <= size) { + VF out0a, out0b; + out0a = hn::Load(df, out + i + out_offsets[0]); + VF scale0 = hn::Set(df, scales[0]); + out0a = hn::Mul(out0a, scale0); + out0b = hn::Load(df, out + i + NF + out_offsets[0]); + out0b = hn::Mul(out0b, scale0); + MulAddNLanesVT1(df, v_bf, c_mem, HWY_MIN(num_lanes, 2 * NF), out0a, out0b); + hn::Store(out0a, df, out + i + out_offsets[0]); + hn::Store(out0b, df, out + i + NF + out_offsets[0]); + i += NF * 2; + v_bf += 4 * NF * NF; } HWY_DASSERT(size == i); } diff --git a/util/mat.h b/util/mat.h index 25f2cb2..e157473 100644 --- a/util/mat.h +++ b/util/mat.h @@ -202,6 +202,17 @@ class MatPtr : public 
IFields { override_rows_ = static_cast(rows); } + // Changes the number of rows and columns without reallocating the memory. + // Increases cols by factor and reduces rows by factor. + // The rows must be divisible by factor and the matrix must be packed. + void ReshapePackedRowsToCols(size_t factor) { + HWY_ASSERT(IsPacked()); + HWY_ASSERT(private_rows_ % factor == 0); + private_rows_ /= factor; + cols_ *= factor; + stride_ *= factor; + } + // Offset by which to advance pointers to the next row. size_t Stride() const { return stride_; } diff --git a/util/test_util.h b/util/test_util.h index 19342e4..443990f 100644 --- a/util/test_util.h +++ b/util/test_util.h @@ -106,7 +106,8 @@ template void FillMatPtrT(MatPtrT& mat) { for (int i = 0; i < mat.Rows(); ++i) { for (int j = 0; j < mat.Cols(); ++j) { - mat.Row(i)[j] = hwy::Unpredictable1() * 0.01f * (i + j + 1); + mat.Row(i)[j] = + hwy::ConvertScalarTo(hwy::Unpredictable1() * 0.01f * (i + j + 1)); } } } diff --git a/util/zones.cc b/util/zones.cc index 6480b96..edcddfb 100644 --- a/util/zones.cc +++ b/util/zones.cc @@ -17,14 +17,14 @@ const char* ZoneName(Zones zone) { return "FlashAttention.Inclusive"; case Zones::kFlashAttentionRmsNormAndPositionalEncoding: return "FlashAttention.RMSNormAndPositionalEncoding"; - case Zones::kFlashAttentionSingleFlashAttention: - return "FlashAttention.SingleFlashAttention"; - case Zones::kFlashAttentionTileFlashAttention: - return "FlashAttention.TileFlashAttention"; + case Zones::kFlashAttentionTileFlashAttention1: + return "FlashAttention.TileFlashAttention1"; case Zones::kFlashAttentionTileFlashAttention4: return "FlashAttention.TileFlashAttention4"; - case Zones::kFlashAttentionTransposeQ: - return "FlashAttention.TransposeQ"; + case Zones::kFlashAttentionTileFlashAttention8: + return "FlashAttention.TileFlashAttention8"; + case Zones::kFlashAttentionCombineSplit: + return "FlashAttention.CombineSplit"; case Zones::kGenActivation: return "Gen.Activation"; case 
Zones::kGenActivationFused: diff --git a/util/zones.h b/util/zones.h index ac96ad0..e53065e 100644 --- a/util/zones.h +++ b/util/zones.h @@ -14,10 +14,10 @@ enum class Zones { // Keep sorted kFlashAttentionFlashAttention, kFlashAttentionInclusive, kFlashAttentionRmsNormAndPositionalEncoding, - kFlashAttentionSingleFlashAttention, - kFlashAttentionTileFlashAttention, + kFlashAttentionTileFlashAttention1, kFlashAttentionTileFlashAttention4, - kFlashAttentionTransposeQ, + kFlashAttentionTileFlashAttention8, + kFlashAttentionCombineSplit, kGenActivation, kGenActivationFused, kGenAttention,