mirror of https://github.com/google/gemma.cpp.git
Use more parallelism in the final output of the attention block.
We use MatVec instead of MatVecLoop for the per-head dense layers,
because we can parallelize more on the rows of the matrix than
on the number of heads. This will be even more efficient after
we rearrange the weights and can have a single MatVec operation.
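As a rough illustration of the parallelism argument, here is a small standalone
sketch (hypothetical ParallelFor / MatVecLoopSketch / MatVecSketch helpers, not
gemma.cpp's Highway-based MatVec/MatVecLoop API): when kHeads is smaller than
the thread count, splitting each head's kModelDim rows across all threads keeps
every thread busy, whereas dispatching whole heads to threads does not.
```
#include <cstddef>
#include <thread>
#include <vector>

// Hypothetical helper (not part of gemma.cpp): run fn(i) for i in [0, n)
// across num_threads worker threads.
template <class Fn>
void ParallelFor(size_t n, size_t num_threads, const Fn& fn) {
  std::vector<std::thread> workers;
  for (size_t t = 0; t < num_threads; ++t) {
    workers.emplace_back([&fn, t, n, num_threads] {
      for (size_t i = t; i < n; i += num_threads) fn(i);
    });
  }
  for (auto& w : workers) w.join();
}

// Single-threaded dense layer: out[r] = dot(row r of mat, vec).
// Stand-in for the role MatVecLoop played (one whole head per thread).
void MatVecLoopSketch(const float* mat, const float* vec, size_t rows,
                      size_t cols, float* out) {
  for (size_t r = 0; r < rows; ++r) {
    float sum = 0.0f;
    for (size_t c = 0; c < cols; ++c) sum += mat[r * cols + c] * vec[c];
    out[r] = sum;
  }
}

// Row-parallel dense layer: all threads cooperate on one matrix by splitting
// its rows. Stand-in for the role MatVec plays after this change.
void MatVecSketch(const float* mat, const float* vec, size_t rows, size_t cols,
                  size_t num_threads, float* out) {
  ParallelFor(rows, num_threads, [=](size_t r) {
    float sum = 0.0f;
    for (size_t c = 0; c < cols; ++c) sum += mat[r * cols + c] * vec[c];
    out[r] = sum;
  });
}

int main() {
  constexpr size_t kHeads = 8, kQKVDim = 256, kModelDim = 2048, kThreads = 32;
  std::vector<float> w(kHeads * kModelDim * kQKVDim, 0.01f);  // per-head weights
  std::vector<float> att_out(kHeads * kQKVDim, 1.0f);         // per-head inputs
  std::vector<float> head_out(kHeads * kModelDim);            // per-head outputs

  // BEFORE (conceptually): parallelize over heads; at most kHeads threads busy.
  ParallelFor(kHeads, kThreads, [&](size_t h) {
    MatVecLoopSketch(w.data() + h * kModelDim * kQKVDim,
                     att_out.data() + h * kQKVDim, kModelDim, kQKVDim,
                     head_out.data() + h * kModelDim);
  });

  // AFTER (conceptually): loop over heads, but split each head's kModelDim rows
  // across all threads, so parallelism is kModelDim-way instead of kHeads-way.
  for (size_t h = 0; h < kHeads; ++h) {
    MatVecSketch(w.data() + h * kModelDim * kQKVDim,
                 att_out.data() + h * kQKVDim, kModelDim, kQKVDim, kThreads,
                 head_out.data() + h * kModelDim);
  }
  return 0;
}
```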
Benchmark results (summarization with 1600 tokens for prefill
and essay writing with 500 tokens for generation):
```
              Prefill speed           Generation speed
Num threads   BEFORE      AFTER      BEFORE      AFTER
32            58.24 t/s   61.79 t/s  32.11 t/s   32.62 t/s
64            83.62 t/s   92.00 t/s  41.10 t/s   41.80 t/s
```
parent 12fb2f05cf
commit 0afa480d90
```
@@ -741,22 +741,6 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
         float* HWY_RESTRICT v2 = kv_cache.kv_cache.get() + cache_offset + kQKVDim;
         MulByConstAndAdd(head_att[pos2], v2, att_out, kQKVDim);
       }
-      // linear projection from kQKVDim back to kModelDim, sum projections
-      // across heads
-      float* HWY_RESTRICT head_out =
-          head == 0
-              ? activations.att_post2.data() + batch_idx * kModelDim
-              : activations.att_post1.data() + head * kBatchSize * kModelDim;
-      float* even_odd = activations.even_odd.data() + thread * kQKVDim;
-      if (head == 0) {
-        MatVecAddLoop<TConfig::kSoftmaxAttnOutputBiases, kModelDim, kQKVDim>(
-            layer_weights->attn_vec_einsum_w, head * kModelDim * kQKVDim, att_out,
-            layer_weights->attention_output_biases.data(), even_odd, head_out);
-      } else {
-        MatVecLoop<kModelDim, kQKVDim>(layer_weights->attn_vec_einsum_w,
-                                       head * kModelDim * kQKVDim, att_out,
-                                       even_odd, head_out);
-      }
     };

     if constexpr (kHeads == kKVHeads) {

@@ -810,11 +794,24 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
       });
     }

-    // accumulate output across all heads into att_post2. head 0 already wrote
-    // directly to att_post2.
-    for (size_t head = 1; head < kHeads; ++head) {
-      AddFrom(activations.att_post1.data() + head * kBatchSize * kModelDim,
-              activations.att_post2.data() + batch_idx * kModelDim, kModelDim);
+    // TODO(szabadka) Use a single MatVecAdd like in GriffinRecurrent() after
+    // rearranging the weights.
+    float* HWY_RESTRICT att_out =
+        activations.att_out.data() + batch_idx * kHeads * kQKVDim;
+    float* HWY_RESTRICT layer_out =
+        activations.att_post2.data() + batch_idx * kModelDim;
+    MatVecAdd<TConfig::kSoftmaxAttnOutputBiases, kModelDim, kQKVDim>(
+        layer_weights->attn_vec_einsum_w, 0, att_out,
+        layer_weights->attention_output_biases.data(),
+        activations.even_odd.data(), layer_out, pool);
+    for (size_t head = 1; head < kHeads; ++head) {
+      float* HWY_RESTRICT head_out =
+          activations.att_post1.data() + head * kBatchSize * kModelDim;
+      MatVec<kModelDim, kQKVDim>(
+          layer_weights->attn_vec_einsum_w, head * kModelDim * kQKVDim,
+          att_out + head * kQKVDim,
+          activations.even_odd.data(), head_out, pool);
+      AddFrom(head_out, layer_out, kModelDim);
     }
   }
```
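The TODO about rearranging the weights corresponds to the last sentence of the
commit message: summing the per-head projections is the same as one
matrix-vector product once the per-head matrices are interleaved into a single
kModelDim x (kHeads*kQKVDim) matrix. A minimal standalone sketch of that
equivalence (assumed row-major layouts and toy sizes, not gemma.cpp's actual
weight storage):
```
#include <cstddef>
#include <vector>

int main() {
  constexpr size_t kHeads = 2, kQKVDim = 3, kModelDim = 4;
  // Assumed current layout: kHeads matrices of shape kModelDim x kQKVDim.
  std::vector<float> w(kHeads * kModelDim * kQKVDim);
  for (size_t i = 0; i < w.size(); ++i) w[i] = 0.001f * static_cast<float>(i);
  // Concatenated per-head attention outputs, kHeads * kQKVDim values.
  std::vector<float> att_out(kHeads * kQKVDim, 1.0f);

  // Per-head MatVec plus accumulation, as in this commit.
  std::vector<float> sum_per_head(kModelDim, 0.0f);
  for (size_t h = 0; h < kHeads; ++h) {
    const float* wh = w.data() + h * kModelDim * kQKVDim;
    for (size_t r = 0; r < kModelDim; ++r) {
      for (size_t c = 0; c < kQKVDim; ++c) {
        sum_per_head[r] += wh[r * kQKVDim + c] * att_out[h * kQKVDim + c];
      }
    }
  }

  // Rearranged layout: one kModelDim x (kHeads*kQKVDim) matrix whose row r is
  // the concatenation of row r from every head's matrix.
  std::vector<float> w_single(kModelDim * kHeads * kQKVDim);
  for (size_t h = 0; h < kHeads; ++h) {
    for (size_t r = 0; r < kModelDim; ++r) {
      for (size_t c = 0; c < kQKVDim; ++c) {
        w_single[r * kHeads * kQKVDim + h * kQKVDim + c] =
            w[h * kModelDim * kQKVDim + r * kQKVDim + c];
      }
    }
  }

  // A single mat-vec over the rearranged matrix gives the same result, so the
  // whole output projection could become one MatVecAdd over att_out.
  std::vector<float> single(kModelDim, 0.0f);
  for (size_t r = 0; r < kModelDim; ++r) {
    for (size_t c = 0; c < kHeads * kQKVDim; ++c) {
      single[r] += w_single[r * kHeads * kQKVDim + c] * att_out[c];
    }
  }
  // single[r] == sum_per_head[r] for every r (up to float rounding).
  return 0;
}
```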