mirror of https://github.com/google/gemma.cpp.git
Use more parallelism in the final output of the attention block.
We use MatVec instead of MatVecLoop for the per-head dense layers,
because we can parallelize more on the rows of the matrix than
on the number of heads. This will be even more efficient after
we rearrange the weights and can have a single MatVec operation.
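As a rough illustration of the parallelism argument, here is a small standalone
sketch (hypothetical ParallelFor / MatVecLoopSketch / MatVecSketch helpers, not
gemma.cpp's Highway-based MatVec/MatVecLoop API): when kHeads is smaller than
the thread count, splitting each head's kModelDim rows across all threads keeps
every thread busy, whereas dispatching whole heads to threads does not.
```
#include <cstddef>
#include <thread>
#include <vector>

// Hypothetical helper (not part of gemma.cpp): run fn(i) for i in [0, n)
// across num_threads worker threads.
template <class Fn>
void ParallelFor(size_t n, size_t num_threads, const Fn& fn) {
  std::vector<std::thread> workers;
  for (size_t t = 0; t < num_threads; ++t) {
    workers.emplace_back([&fn, t, n, num_threads] {
      for (size_t i = t; i < n; i += num_threads) fn(i);
    });
  }
  for (auto& w : workers) w.join();
}

// Single-threaded dense layer: out[r] = dot(row r of mat, vec).
// Stand-in for the role MatVecLoop played (one whole head per thread).
void MatVecLoopSketch(const float* mat, const float* vec, size_t rows,
                      size_t cols, float* out) {
  for (size_t r = 0; r < rows; ++r) {
    float sum = 0.0f;
    for (size_t c = 0; c < cols; ++c) sum += mat[r * cols + c] * vec[c];
    out[r] = sum;
  }
}

// Row-parallel dense layer: all threads cooperate on one matrix by splitting
// its rows. Stand-in for the role MatVec plays after this change.
void MatVecSketch(const float* mat, const float* vec, size_t rows, size_t cols,
                  size_t num_threads, float* out) {
  ParallelFor(rows, num_threads, [=](size_t r) {
    float sum = 0.0f;
    for (size_t c = 0; c < cols; ++c) sum += mat[r * cols + c] * vec[c];
    out[r] = sum;
  });
}

int main() {
  constexpr size_t kHeads = 8, kQKVDim = 256, kModelDim = 2048, kThreads = 32;
  std::vector<float> w(kHeads * kModelDim * kQKVDim, 0.01f);  // per-head weights
  std::vector<float> att_out(kHeads * kQKVDim, 1.0f);         // per-head inputs
  std::vector<float> head_out(kHeads * kModelDim);            // per-head outputs

  // BEFORE (conceptually): parallelize over heads; at most kHeads threads busy.
  ParallelFor(kHeads, kThreads, [&](size_t h) {
    MatVecLoopSketch(w.data() + h * kModelDim * kQKVDim,
                     att_out.data() + h * kQKVDim, kModelDim, kQKVDim,
                     head_out.data() + h * kModelDim);
  });

  // AFTER (conceptually): loop over heads, but split each head's kModelDim rows
  // across all threads, so parallelism is kModelDim-way instead of kHeads-way.
  for (size_t h = 0; h < kHeads; ++h) {
    MatVecSketch(w.data() + h * kModelDim * kQKVDim,
                 att_out.data() + h * kQKVDim, kModelDim, kQKVDim, kThreads,
                 head_out.data() + h * kModelDim);
  }
  return 0;
}
```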
Benchmark results (summarization with 1600 tokens for prefill
and essay writing with 500 tokens for generation):
```
              Prefill speed           Generation speed
Num threads   BEFORE      AFTER      BEFORE      AFTER
32            58.24 t/s   61.79 t/s  32.11 t/s   32.62 t/s
64            83.62 t/s   92.00 t/s  41.10 t/s   41.80 t/s
```
parent 12fb2f05cf
commit 0afa480d90
```
@@ -741,22 +741,6 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
         float* HWY_RESTRICT v2 = kv_cache.kv_cache.get() + cache_offset + kQKVDim;
         MulByConstAndAdd(head_att[pos2], v2, att_out, kQKVDim);
       }
-      // linear projection from kQKVDim back to kModelDim, sum projections
-      // across heads
-      float* HWY_RESTRICT head_out =
-          head == 0
-              ? activations.att_post2.data() + batch_idx * kModelDim
-              : activations.att_post1.data() + head * kBatchSize * kModelDim;
-      float* even_odd = activations.even_odd.data() + thread * kQKVDim;
-      if (head == 0) {
-        MatVecAddLoop<TConfig::kSoftmaxAttnOutputBiases, kModelDim, kQKVDim>(
-            layer_weights->attn_vec_einsum_w, head * kModelDim * kQKVDim, att_out,
-            layer_weights->attention_output_biases.data(), even_odd, head_out);
-      } else {
-        MatVecLoop<kModelDim, kQKVDim>(layer_weights->attn_vec_einsum_w,
-                                       head * kModelDim * kQKVDim, att_out,
-                                       even_odd, head_out);
-      }
     };

     if constexpr (kHeads == kKVHeads) {

@@ -810,11 +794,24 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
       });
     }

-    // accumulate output across all heads into att_post2. head 0 already wrote
-    // directly to att_post2.
-    for (size_t head = 1; head < kHeads; ++head) {
-      AddFrom(activations.att_post1.data() + head * kBatchSize * kModelDim,
-              activations.att_post2.data() + batch_idx * kModelDim, kModelDim);
+    // TODO(szabadka) Use a single MatVecAdd like in GriffinRecurrent() after
+    // rearranging the weights.
+    float* HWY_RESTRICT att_out =
+        activations.att_out.data() + batch_idx * kHeads * kQKVDim;
+    float* HWY_RESTRICT layer_out =
+        activations.att_post2.data() + batch_idx * kModelDim;
+    MatVecAdd<TConfig::kSoftmaxAttnOutputBiases, kModelDim, kQKVDim>(
+        layer_weights->attn_vec_einsum_w, 0, att_out,
+        layer_weights->attention_output_biases.data(),
+        activations.even_odd.data(), layer_out, pool);
+    for (size_t head = 1; head < kHeads; ++head) {
+      float* HWY_RESTRICT head_out =
+          activations.att_post1.data() + head * kBatchSize * kModelDim;
+      MatVec<kModelDim, kQKVDim>(
+          layer_weights->attn_vec_einsum_w, head * kModelDim * kQKVDim,
+          att_out + head * kQKVDim,
+          activations.even_odd.data(), head_out, pool);
+      AddFrom(head_out, layer_out, kModelDim);
     }
   }
```
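The TODO about rearranging the weights corresponds to the last sentence of the
commit message: summing the per-head projections is the same as one
matrix-vector product once the per-head matrices are interleaved into a single
kModelDim x (kHeads*kQKVDim) matrix. A minimal standalone sketch of that
equivalence (assumed row-major layouts and toy sizes, not gemma.cpp's actual
weight storage):
```
#include <cstddef>
#include <vector>

int main() {
  constexpr size_t kHeads = 2, kQKVDim = 3, kModelDim = 4;
  // Assumed current layout: kHeads matrices of shape kModelDim x kQKVDim.
  std::vector<float> w(kHeads * kModelDim * kQKVDim);
  for (size_t i = 0; i < w.size(); ++i) w[i] = 0.001f * static_cast<float>(i);
  // Concatenated per-head attention outputs, kHeads * kQKVDim values.
  std::vector<float> att_out(kHeads * kQKVDim, 1.0f);

  // Per-head MatVec plus accumulation, as in this commit.
  std::vector<float> sum_per_head(kModelDim, 0.0f);
  for (size_t h = 0; h < kHeads; ++h) {
    const float* wh = w.data() + h * kModelDim * kQKVDim;
    for (size_t r = 0; r < kModelDim; ++r) {
      for (size_t c = 0; c < kQKVDim; ++c) {
        sum_per_head[r] += wh[r * kQKVDim + c] * att_out[h * kQKVDim + c];
      }
    }
  }

  // Rearranged layout: one kModelDim x (kHeads*kQKVDim) matrix whose row r is
  // the concatenation of row r from every head's matrix.
  std::vector<float> w_single(kModelDim * kHeads * kQKVDim);
  for (size_t h = 0; h < kHeads; ++h) {
    for (size_t r = 0; r < kModelDim; ++r) {
      for (size_t c = 0; c < kQKVDim; ++c) {
        w_single[r * kHeads * kQKVDim + h * kQKVDim + c] =
            w[h * kModelDim * kQKVDim + r * kQKVDim + c];
      }
    }
  }

  // A single mat-vec over the rearranged matrix gives the same result, so the
  // whole output projection could become one MatVecAdd over att_out.
  std::vector<float> single(kModelDim, 0.0f);
  for (size_t r = 0; r < kModelDim; ++r) {
    for (size_t c = 0; c < kHeads * kQKVDim; ++c) {
      single[r] += w_single[r * kHeads * kQKVDim + c] * att_out[c];
    }
  }
  // single[r] == sum_per_head[r] for every r (up to float rounding).
  return 0;
}
```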