mirror of https://github.com/google/gemma.cpp.git
Simplify FFW by using MatMul_4x4_Batch_Add.
Affects only the Griffin model, where prefill TPS improves by about 70%.

PiperOrigin-RevId: 652878176
parent 48b900b1b9
commit ff34370aac
@@ -76,6 +76,8 @@ class CompressedArray {
  public:
  using value_type = MatT;

+  // Note that whenever you access data(), you have to consider a scale() that
+  // may be different from 1.0f.
  MatT* data() { return data_.data(); }
  const MatT* data() const { return data_.data(); }

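The added comment documents a contract that is easy to miss: data() returns the raw stored values, and the separate per-tensor scale() must still be applied by the caller. A minimal sketch of that contract follows; the Weights struct and Dot function are illustrative stand-ins, not the actual CompressedArray API beyond data()/scale().

#include <cstddef>

// Illustrative stand-in for a compressed tensor exposing data() and scale().
struct Weights {
  const float* data() const { return raw; }  // raw values, scale NOT applied
  float scale() const { return s; }          // per-tensor scale factor
  const float* raw;
  float s;
};

// Hypothetical caller: any result computed from data() is only correct after
// multiplying by scale() exactly once.
float Dot(const Weights& w, const float* x, size_t n) {
  float sum = 0.0f;
  for (size_t i = 0; i < n; ++i) sum += w.data()[i] * x[i];
  return sum * w.scale();
}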
@@ -59,6 +59,7 @@ struct Activations {

  // For bf16/f32 vectors * bf16 matrix: faster to unpack once beforehand, into
  // per-thread storage.
+  // TODO: only used for MatVec, remove once that is gone.
  std::array<float, kModelDim * kMaxThreads> even_odd;

  // Griffin layer internal activations
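The new TODO notes that even_odd only exists to serve the remaining MatVec path. The array is sized kModelDim * kMaxThreads so that each worker thread can use a disjoint slice as scratch; a hedged sketch of the indexing this layout implies (names are illustrative, not the MatVec implementation):

#include <cstddef>

// Assumed layout: thread t owns elements [t * kModelDim, (t + 1) * kModelDim),
// so threads never touch each other's scratch and need no synchronization.
inline float* ThreadScratch(float* even_odd, size_t k_model_dim,
                            size_t thread) {
  return even_odd + thread * k_model_dim;
}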
@@ -68,11 +68,11 @@ HWY_NOINLINE void GriffinRecurrent(
  PROFILER_ZONE("Gen.Griffin");
  static_assert(kQueryBatchSize == 1,
                "Griffin does not support batched queries.");
-  HWY_DASSERT(num_queries == 1);  // TODO: add batch query support for Griffin.
+  HWY_ASSERT(num_queries == 1);  // TODO: add batch query support for Griffin.
  KVCache& kv_cache = *kv_caches[0];
  namespace hn = hwy::HWY_NAMESPACE;
  using D = hn::ScalableTag<float>;
-  HWY_DASSERT(num_tokens <= kBatchSize);
+  HWY_ASSERT(num_tokens <= kBatchSize);
  static constexpr size_t kModelDim =
      gcpp::Activations<TConfig, kBatchSize * kQueryBatchSize>::kModelDim;
  static constexpr size_t kConv1dWidth = TConfig::kConv1dWidth;
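The switch from HWY_DASSERT to HWY_ASSERT makes these precondition checks unconditional: HWY_DASSERT compiles away outside debug builds, while HWY_ASSERT also aborts in optimized builds, so an unsupported batched-query call now fails loudly instead of silently proceeding with only the first query's KV cache. A simplified sketch of the distinction (keyed off NDEBUG here; Highway uses its own debug-build detection):

#include <cstdio>
#include <cstdlib>

// Always checked, even in optimized builds (analogous to HWY_ASSERT).
#define MY_ASSERT(cond)                                \
  do {                                                 \
    if (!(cond)) {                                     \
      fprintf(stderr, "Assert failed: %s\n", #cond);   \
      abort();                                         \
    }                                                  \
  } while (0)

// Checked only in debug builds (analogous to HWY_DASSERT).
#ifdef NDEBUG
#define MY_DASSERT(cond) ((void)0)
#else
#define MY_DASSERT(cond) MY_ASSERT(cond)
#endif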
@@ -397,64 +397,46 @@ HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
                      size_t num_tokens,
                      const CompressedLayer<TConfig>* layer_weights,
                      hwy::ThreadPool& pool) {
+  PROFILER_ZONE("Gen.FFW");
  HWY_DASSERT(num_tokens <= kBatchSize);
  constexpr size_t kModelDim = TConfig::kModelDim;
  constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
-  float* HWY_RESTRICT even_odd = activations.even_odd.data();

-  // TODO: MatMul does not yet support adding another matrix to the result.
-  if constexpr (!TConfig::kFFBiases) {
-    PROFILER_ZONE("Gen.FFW.GatedGELU");
-
-    // MatMul expects col-major B, which is what we have: kModelDim consecutive
-    // elements in memory, repeated kFFHiddenDim times.
-    const auto b1 = layer_weights->gating_einsum_w.data();
-    constexpr size_t kColsA = kModelDim;
-    constexpr size_t kColsB = kFFHiddenDim;
-    const auto b2 = b1 + kColsA * kColsB;
-    auto A = activations.bf_pre_ffw_rms_out.data();
-    // Will go through GELU.
-    MatMul_4x4_Batch<kColsA, kColsB>(num_tokens, A, b1, activations.C1.data(),
-                                     pool);
-    // What to multiply by.
-    MatMul_4x4_Batch<kColsA, kColsB>(num_tokens, A, b2, activations.C2.data(),
-                                     pool);
-
-    // Activation (Gelu) and multiply by gate.
-    Activation<TConfig>(activations.C1.data(), activations.C2.data(),
-                        kFFHiddenDim * num_tokens);
-
-    MatMul_4x4_Batch<kFFHiddenDim, kModelDim>(num_tokens, activations.C1.data(),
-                                              layer_weights->linear_w.data(),
-                                              activations.ffw_out.data(), pool);
-  } else {  // TConfig::kFFBiases == true
-    for (size_t batch_idx = 0; batch_idx < num_tokens; ++batch_idx) {
-      const size_t hidden_offset = batch_idx * kFFHiddenDim * 2;
-      const hwy::bfloat16_t* HWY_RESTRICT vec =
-          activations.bf_pre_ffw_rms_out.data() + batch_idx * kModelDim;
-      float* HWY_RESTRICT out = activations.ffw_hidden.data() + hidden_offset;
-      float* HWY_RESTRICT out_mul = out + kFFHiddenDim;
-
-      PROFILER_ZONE("Gen.FFW.GatedGELU");
-      // Same matrix, first and second half of rows. Could fuse into one MatVec.
-      MatVecT</*kAdd=*/true, kFFHiddenDim, kModelDim>(
-          layer_weights->gating_einsum_w, kFFHiddenDim * kModelDim, vec,
-          layer_weights->ffw_gating_biases.data() + kFFHiddenDim, even_odd,
-          out_mul, pool);
-      // Gate, will go through the nonlinearity.
-      MatVecT</*kAdd=*/true, kFFHiddenDim, kModelDim>(
-          layer_weights->gating_einsum_w, 0, vec,
-          layer_weights->ffw_gating_biases.data(), even_odd, out, pool);
-
-      Activation<TConfig>(out, out_mul, kFFHiddenDim);
-
-      MatVecT</*kAdd=*/true, kModelDim, kFFHiddenDim>(
-          layer_weights->linear_w, 0,
-          activations.ffw_hidden.data() + hidden_offset,
-          layer_weights->ffw_output_biases.data(), even_odd,
-          activations.ffw_out.data() + batch_idx * kModelDim, pool);
-    }
-  }
+  // MatMul expects col-major B, which is what we have: kModelDim consecutive
+  // elements in memory, repeated kFFHiddenDim times.
+  constexpr size_t kColsA = kModelDim;
+  constexpr size_t kColsB = kFFHiddenDim;
+  const auto A = activations.bf_pre_ffw_rms_out.data();
+  const auto B1 = layer_weights->gating_einsum_w.data();
+  const auto B2 = B1 + kColsA * kColsB;
+  auto C1 = activations.C1.data();
+  auto C2 = activations.C2.data();
+  constexpr bool kAddBias = TConfig::kFFBiases;
+  const auto bias = layer_weights->ffw_gating_biases.data();
+
+  // Will go through GELU.
+  MatMul_4x4_Batch_Add<kColsA, kColsB, kAddBias>(num_tokens, A, B1, C1,
+                                                 bias, pool);
+  // What to multiply by.
+  MatMul_4x4_Batch_Add<kColsA, kColsB, kAddBias>(num_tokens, A, B2, C2,
+                                                 bias + kFFHiddenDim, pool);
+
+  // Activation (Gelu) and multiply by gate. Store activations in C1.
+  Activation<TConfig>(activations.C1.data(), activations.C2.data(),
+                      kFFHiddenDim * num_tokens);
+
+  // linear_w may have a scale value different from 1, apply that here.
+  // We multiply all activations by the scale value to compensate for the
+  // missing scale value in the weights.
+  if (layer_weights->linear_w.scale() != 1.0f) {
+    MulByConst(layer_weights->linear_w.scale(), C1, kFFHiddenDim * num_tokens);
+  }
+
+  // Hidden layer -> output layer.
+  MatMul_4x4_Batch_Add<kFFHiddenDim, kModelDim, kAddBias>(
+      num_tokens, C1, layer_weights->linear_w.data(),
+      activations.ffw_out.data(), layer_weights->ffw_output_biases.data(),
+      pool);
 }

 template <class TConfig, size_t kBatchSize>
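For readers following the FFW change: both call sites rely on the fused kernel computing C = A * B plus an optional per-output-column bias, with B stored column-major as described in the comment (kColsA consecutive elements per output column). Below is a scalar reference of those semantics, for illustration only; float throughout and plain loops, whereas the real kernel operates on bf16/compressed inputs with SIMD and thread parallelism. Because the raw linear_w values are fed to the kernel directly, the MulByConst step beforehand folds linear_w.scale() into C1 so the final output is correctly scaled.

#include <cstddef>

// Scalar reference for the semantics the call sites above assume:
//   C[r * kColsB + c] = dot(A row r, B column c) + (kAdd ? bias[c] : 0)
// where B column c is the kColsA contiguous values starting at B + c * kColsA.
template <size_t kColsA, size_t kColsB, bool kAdd>
void MatMulAddRef(size_t num_rows, const float* A, const float* B, float* C,
                  const float* bias) {
  for (size_t r = 0; r < num_rows; ++r) {
    for (size_t c = 0; c < kColsB; ++c) {
      float sum = kAdd ? bias[c] : 0.0f;
      for (size_t k = 0; k < kColsA; ++k) {
        sum += A[r * kColsA + k] * B[c * kColsA + k];
      }
      C[r * kColsB + c] = sum;
    }
  }
}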