From 0afa480d90f4e54a132b21db0f680da62508d84e Mon Sep 17 00:00:00 2001
From: Zoltan Szabadka
Date: Thu, 2 May 2024 09:30:07 +0000
Subject: [PATCH] Use more parallelism in the final output of the attention
 block.

We use MatVec instead of MatVecLoop for the per-head dense layers,
because we can parallelize more on the rows of the matrix than on the
number of heads.

This will be even more efficient after we rearrange the weights and can
have a single MatVec operation.

Benchmark results (summarization with 1600 tokens for prefill and essay
writing with 500 tokens for generation):

```
              Prefill speed            Generation speed
Num threads   BEFORE       AFTER       BEFORE       AFTER
     32       58.24 t/s    61.79 t/s   32.11 t/s    32.62 t/s
     64       83.62 t/s    92.00 t/s   41.10 t/s    41.80 t/s
```
---
 gemma/gemma.cc | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/gemma/gemma.cc b/gemma/gemma.cc
index a92d835..6d80741 100644
--- a/gemma/gemma.cc
+++ b/gemma/gemma.cc
@@ -741,22 +741,6 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
       float* HWY_RESTRICT v2 = kv_cache.kv_cache.get() + cache_offset + kQKVDim;
       MulByConstAndAdd(head_att[pos2], v2, att_out, kQKVDim);
     }
-    // linear projection from kQKVDim back to kModelDim, sum projections
-    // across heads
-    float* HWY_RESTRICT head_out =
-        head == 0
-            ? activations.att_post2.data() + batch_idx * kModelDim
-            : activations.att_post1.data() + head * kBatchSize * kModelDim;
-    float* even_odd = activations.even_odd.data() + thread * kQKVDim;
-    if (head == 0) {
-      MatVecAddLoop(
-          layer_weights->attn_vec_einsum_w, head * kModelDim * kQKVDim, att_out,
-          layer_weights->attention_output_biases.data(), even_odd, head_out);
-    } else {
-      MatVecLoop(layer_weights->attn_vec_einsum_w,
-                 head * kModelDim * kQKVDim, att_out,
-                 even_odd, head_out);
-    }
   };
 
   if constexpr (kHeads == kKVHeads) {
@@ -810,11 +794,24 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
     });
   }
 
-  // accumulate output across all heads into att_post2. head 0 already wrote
-  // directly to att_post2.
-  for (size_t head = 1; head < kHeads; ++head) {
-    AddFrom(activations.att_post1.data() + head * kBatchSize * kModelDim,
-            activations.att_post2.data() + batch_idx * kModelDim, kModelDim);
+  // TODO(szabadka) Use a single MatVecAdd like in GriffinRecurrent() after
+  // rearranging the weights.
+  float* HWY_RESTRICT att_out =
+      activations.att_out.data() + batch_idx * kHeads * kQKVDim;
+  float* HWY_RESTRICT layer_out =
+      activations.att_post2.data() + batch_idx * kModelDim;
+  MatVecAdd(
+      layer_weights->attn_vec_einsum_w, 0, att_out,
+      layer_weights->attention_output_biases.data(),
+      activations.even_odd.data(), layer_out, pool);
+  for (size_t head = 1; head < kHeads; ++head) {
+    float* HWY_RESTRICT head_out =
+        activations.att_post1.data() + head * kBatchSize * kModelDim;
+    MatVec(
+        layer_weights->attn_vec_einsum_w, head * kModelDim * kQKVDim,
+        att_out + head * kQKVDim,
+        activations.even_odd.data(), head_out, pool);
+    AddFrom(head_out, layer_out, kModelDim);
   }
 }
 
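Illustration (not part of the patch): the rationale in the commit message is that the output projection offers only kHeads units of head-level parallelism, while each per-head matrix has kModelDim rows that can be split across all threads. The self-contained sketch below shows that contrast with plain std::thread; the sizes, the RowParallelMatVec helper, and the chunking scheme are assumptions made for the example and are not the Highway-based MatVec/MatVecLoop implementations from the repository.

```
// Standalone sketch (not the gemma.cpp/Highway implementation): contrasts
// per-head loop parallelism with row-level parallelism inside one mat-vec.
#include <cstddef>
#include <thread>
#include <vector>

// Illustrative sizes; the real values come from the model config.
constexpr size_t kHeads = 8;        // few heads -> little head-level parallelism
constexpr size_t kModelDim = 2048;  // many rows -> plenty of row-level parallelism
constexpr size_t kQKVDim = 256;

// Plain mat-vec over a row range: out[r] = sum_c mat[r * cols + c] * vec[c].
void MatVecRows(const float* mat, const float* vec, float* out,
                size_t row_begin, size_t row_end, size_t cols) {
  for (size_t r = row_begin; r < row_end; ++r) {
    float sum = 0.0f;
    for (size_t c = 0; c < cols; ++c) sum += mat[r * cols + c] * vec[c];
    out[r] = sum;
  }
}

// "MatVec"-style: all threads cooperate on the rows of a single matrix.
void RowParallelMatVec(const float* mat, const float* vec, float* out,
                       size_t rows, size_t cols, size_t num_threads) {
  std::vector<std::thread> workers;
  const size_t chunk = (rows + num_threads - 1) / num_threads;
  for (size_t t = 0; t < num_threads; ++t) {
    const size_t begin = t * chunk;
    const size_t end = begin + chunk < rows ? begin + chunk : rows;
    if (begin >= end) break;
    workers.emplace_back(MatVecRows, mat, vec, out, begin, end, cols);
  }
  for (auto& w : workers) w.join();
}

int main() {
  std::vector<float> w(kHeads * kModelDim * kQKVDim, 0.01f);
  std::vector<float> att_out(kHeads * kQKVDim, 1.0f);
  std::vector<float> head_out(kModelDim);
  std::vector<float> layer_out(kModelDim, 0.0f);

  // "MatVecLoop"-style parallelism would give each head to one thread: at
  // most kHeads-way parallelism. Here each head's projection instead uses
  // all threads across its kModelDim rows, then results are accumulated.
  for (size_t head = 0; head < kHeads; ++head) {
    RowParallelMatVec(w.data() + head * kModelDim * kQKVDim,
                      att_out.data() + head * kQKVDim, head_out.data(),
                      kModelDim, kQKVDim, /*num_threads=*/32);
    for (size_t r = 0; r < kModelDim; ++r) layer_out[r] += head_out[r];
  }
  return 0;
}
```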
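Illustration (not part of the patch): the TODO about using a single MatVecAdd after rearranging the weights relies on the identity that summing the per-head projections W_h * att_out_h equals one matrix-vector product with the head blocks concatenated along the rows, i.e. a kModelDim x (kHeads * kQKVDim) matrix applied to the concatenated att_out. The sketch below checks that identity numerically; the layouts and loop structure are assumptions made for the example, not the repository's actual weight format or MatVecAdd signature.

```
// Standalone sketch (assumed layouts): why rearranging the per-head weights
// allows a single mat-vec for the whole attention output projection.
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

int main() {
  constexpr size_t kHeads = 2, kModelDim = 3, kQKVDim = 4;

  // Per-head layout, as suggested by attn_vec_einsum_w indexing in the patch:
  // kHeads x kModelDim x kQKVDim.
  std::vector<float> per_head(kHeads * kModelDim * kQKVDim);
  std::vector<float> att_out(kHeads * kQKVDim);  // concat of per-head outputs
  for (size_t i = 0; i < per_head.size(); ++i) per_head[i] = 0.1f * i;
  for (size_t i = 0; i < att_out.size(); ++i) att_out[i] = 1.0f + i;

  // Reference: per-head mat-vecs accumulated into the layer output.
  std::vector<float> expected(kModelDim, 0.0f);
  for (size_t h = 0; h < kHeads; ++h)
    for (size_t r = 0; r < kModelDim; ++r)
      for (size_t c = 0; c < kQKVDim; ++c)
        expected[r] += per_head[(h * kModelDim + r) * kQKVDim + c] *
                       att_out[h * kQKVDim + c];

  // Rearranged layout: kModelDim x (kHeads * kQKVDim), head blocks
  // concatenated along each row. One mat-vec reproduces the sum over heads.
  const size_t cols = kHeads * kQKVDim;
  std::vector<float> rearranged(kModelDim * cols);
  for (size_t h = 0; h < kHeads; ++h)
    for (size_t r = 0; r < kModelDim; ++r)
      for (size_t c = 0; c < kQKVDim; ++c)
        rearranged[r * cols + h * kQKVDim + c] =
            per_head[(h * kModelDim + r) * kQKVDim + c];

  std::vector<float> single(kModelDim, 0.0f);
  for (size_t r = 0; r < kModelDim; ++r)
    for (size_t c = 0; c < cols; ++c)
      single[r] += rearranged[r * cols + c] * att_out[c];

  for (size_t r = 0; r < kModelDim; ++r)
    assert(std::fabs(single[r] - expected[r]) < 1e-3f);
  return 0;
}
```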