Compare commits

...

13 Commits

Author SHA1 Message Date
Phil Culliton 28ca001d5e Matmul and test functions
PiperOrigin-RevId: 630373984
2024-05-03 06:39:36 -07:00
Copybara-Service 6eeef2e2d9 Merge pull request #166 from samkaufman:deinterleave-vecs
PiperOrigin-RevId: 630360778
2024-05-03 05:23:31 -07:00
Copybara-Service 2a71333c8a Merge pull request #176 from szabadka:gemma3
PiperOrigin-RevId: 630131001
2024-05-02 11:41:05 -07:00
Zoltan Szabadka 9a2682d544 Use more parallelism in the QKV projections of the MHA block.
We compute all three projections with one MatVec and then copy
the kv part to the cache.

Benchmark results for 7b-it model that uses MHA blocks (summarization with
1600 tokens for prefill and essay writing with 500 tokens for generation):

```
                   Prefill speed                Generation speed
Num threads      BEFORE       AFTER            BEFORE       AFTER
32               13.75 t/s    14.80 t/s       9.22 t/s     9.77 t/s
64               19.89 t/s    24.83 t/s      12.46 t/s    13.66 t/s
```
2024-05-02 13:46:45 +00:00
Sam Kaufman 4a6173d929 Remove unused vars. 2024-05-02 00:41:44 -07:00
Sam Kaufman 564937ede6 Merge branch 'dev' into deinterleave-vecs 2024-04-30 16:23:04 -07:00
Sam Kaufman 2829ef17ad Check for HWY_NATIVE_DOT_BF16. 2024-04-30 15:19:28 -07:00
Sam Kaufman 59ebecce22 Fix: specialized MatVecAdd was never called. 2024-04-30 15:17:27 -07:00
Sam Kaufman 6a78a23f4c Abstracted some MatVecAdd spec. dupes. 2024-04-29 16:23:38 -07:00
Sam Kaufman f608337fef Remove Bf16ToF32EO and use PromoteEvenTo and PromoteOddTo. 2024-04-29 14:13:07 -07:00
Sam Kaufman aa0b113214 (VecT*) to static_cast<VecT*>. 2024-04-29 12:53:47 -07:00
Sam Kaufman 5cb63346aa supports_eo -> kSupportsEvenOdd 2024-04-29 12:51:35 -07:00
Sam Kaufman 0816a1070d Even-odd layout MatVecs for bf16 weights. 2024-04-28 20:09:25 -07:00
4 changed files with 320 additions and 107 deletions

View File

@ -58,6 +58,7 @@ struct CompressTraits {};
template <>
struct CompressTraits<float> {
using MatT = float;
static constexpr bool kSupportsEvenOdd = false;
template <class DF, HWY_IF_F32_D(DF)>
static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in,
@ -111,6 +112,7 @@ struct CompressTraits<float> {
template <>
struct CompressTraits<hwy::bfloat16_t> {
using MatT = hwy::bfloat16_t;
static constexpr bool kSupportsEvenOdd = true;
template <class DF, HWY_IF_F32_D(DF)>
static HWY_INLINE void Compress(DF df, const float* HWY_RESTRICT in,
@ -219,11 +221,60 @@ struct CompressTraits<hwy::bfloat16_t> {
// bf16*bf16.
return hn::Dot::Compute<kAssumptions>(d_vec, vec_aligned, in + in_ofs, num);
}
// Computes the dot product of an even-odd deinterleaved, f32 `vec_aligned`
// and a column-major matrix `in`, reading `num` bf16 elements of `in` starting
// at `in_ofs`. `vec_aligned` should be aligned and alternate even-indexed
// `hn::Lanes(df32)` elements followed by odd-indexed `hn::Lanes(df32)`
// elements.
//
// `num` must be a non-zero multiple of `2 * hn::Lanes(df32)`, i.e. of one bf16
// vector. The main loop consumes two bf16 vectors per iteration; when `num` is
// an odd multiple of the bf16 vector length, the final vector is handled by a
// separate tail step so that nothing beyond `num` elements is read.
template <class DF, HWY_IF_F32_D(DF)>
static HWY_INLINE float DotEO(
    const DF df32, const hwy::bfloat16_t* HWY_RESTRICT in, size_t in_ofs,
    const float* HWY_RESTRICT vec_aligned, size_t num) {
  HWY_DASSERT(num >= (hn::Lanes(df32) * 2) && (num % (hn::Lanes(df32) * 2)) == 0);
  HWY_DASSERT((in_ofs % (hn::Lanes(df32) * 2)) == 0);
  HWY_DASSERT(hn::IsAligned(df32, vec_aligned));
  const hn::Repartition<hwy::bfloat16_t, DF> dbf16;
  using VF32 = decltype(Zero(df32));
  // Number of bf16 lanes per vector; N / 2 is the number of f32 lanes.
  const size_t N = Lanes(dbf16);
  // Four independent accumulators, summed once after the loop.
  VF32 sum0 = Zero(df32);
  VF32 sum1 = Zero(df32);
  VF32 sum2 = Zero(df32);
  VF32 sum3 = Zero(df32);
  size_t i = 0;
  // Main loop: two bf16 vectors (2 * N elements) per iteration.
  for (; i + 2 * N <= num; i += 2 * N) {
    const auto interleaved0 = hn::LoadU(dbf16, in + in_ofs + i);
    const VF32 ae0 = Load(df32, vec_aligned + i);
    const VF32 ao0 = Load(df32, vec_aligned + i + (N / 2));
    sum0 = hn::MulAdd(ae0, hn::PromoteEvenTo(df32, interleaved0), sum0);
    sum1 = hn::MulAdd(ao0, hn::PromoteOddTo(df32, interleaved0), sum1);
    const auto interleaved1 = hn::LoadU(dbf16, in + in_ofs + i + N);
    const VF32 ae1 = Load(df32, vec_aligned + i + N);
    const VF32 ao1 = Load(df32, vec_aligned + i + N + (N / 2));
    sum2 = hn::MulAdd(ae1, hn::PromoteEvenTo(df32, interleaved1), sum2);
    sum3 = hn::MulAdd(ao1, hn::PromoteOddTo(df32, interleaved1), sum3);
  }
  // Tail: exactly one bf16 vector remains when `num` is an odd multiple of N.
  if (i < num) {
    const auto interleaved = hn::LoadU(dbf16, in + in_ofs + i);
    const VF32 ae = Load(df32, vec_aligned + i);
    const VF32 ao = Load(df32, vec_aligned + i + (N / 2));
    sum0 = hn::MulAdd(ae, hn::PromoteEvenTo(df32, interleaved), sum0);
    sum1 = hn::MulAdd(ao, hn::PromoteOddTo(df32, interleaved), sum1);
  }
  sum0 = Add(sum0, sum1);
  sum2 = Add(sum2, sum3);
  sum0 = Add(sum0, sum2);
  return ReduceSum(df32, sum0);
}
};
template <>
struct CompressTraits<SfpStream> {
using MatT = SfpStream;
static constexpr bool kSupportsEvenOdd = false;
template <class DF, HWY_IF_F32_D(DF)>
static HWY_INLINE void Compress(DF df, const float* in, size_t num,
@ -273,6 +324,7 @@ struct CompressTraits<SfpStream> {
template <>
struct CompressTraits<NuqStream> {
using MatT = NuqStream;
static constexpr bool kSupportsEvenOdd = false;
template <class DF, HWY_IF_F32_D(DF)>
static HWY_INLINE void Compress(DF df, const float* in, size_t num,
@ -425,16 +477,22 @@ HWY_INLINE float Dot(DF df, const ArrayT& compressed, size_t compressed_ofs,
}
// Returns dot product with `vec_aligned` of length `num`.
template <class DF, typename MatT, size_t kCapacity, typename VecT>
template <bool kVecEO, class DF, typename MatT, size_t kCapacity, typename VecT>
HWY_INLINE float Dot(DF df, const CompressedArray<MatT, kCapacity>& compressed,
size_t compressed_ofs, const VecT* vec_aligned,
size_t num) {
HWY_DASSERT(compressed_ofs + num <= compressed.size());
HWY_DASSERT(hn::IsAligned(df, vec_aligned));
using Traits = CompressTraits<MatT>;
return (compressed.scale() * Traits::Dot(df, compressed.size(),
compressed.data(), compressed_ofs,
vec_aligned, num));
float dot_result;
if constexpr (kVecEO) {
dot_result = Traits::DotEO(df, compressed.data(), compressed_ofs,
vec_aligned, num);
} else {
dot_result = Traits::Dot(df, compressed.size(), compressed.data(),
compressed_ofs, vec_aligned, num);
}
return compressed.scale() * dot_result;
}
// Callback used by ForeachTensor.

View File

@ -402,10 +402,11 @@ struct Activations {
static constexpr size_t kCacheLayerSize = kKVHeads * kQKVDim * 2;
static constexpr size_t kCachePosSize =
TConfig::kGemmaLayers * kCacheLayerSize;
static constexpr size_t kQDim = kHeads == kKVHeads ? kQKVDim * 3 : kQKVDim;
std::array<float, kBatchSize * kModelDim> x; // input
std::array<float, kBatchSize * kModelDim> pre_att_rms_out;
std::array<float, kBatchSize * kHeads * kQKVDim> q; // query vector
std::array<float, kBatchSize * kHeads * kQDim> q; // query vector
std::array<float, kBatchSize * kHeads * TConfig::kSeqLen>
att; // attention vector
std::array<float, kBatchSize * kHeads * kQKVDim> att_out; // attention output
@ -710,10 +711,9 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
float* x = activations.pre_att_rms_out.data() + batch_idx * kModelDim;
auto Attn = [&](uint64_t head, size_t head_offset, size_t thread) HWY_ATTR {
auto Attn = [&](float* q, uint64_t head, size_t head_offset,
size_t thread) HWY_ATTR {
// Calculate scores
float* HWY_RESTRICT q =
activations.q.data() + head * kQKVDim + batch_idx * kHeads * kQKVDim;
float* HWY_RESTRICT head_att = activations.att.data() +
head * TConfig::kSeqLen +
batch_idx * kHeads * kQKVDim;
@ -745,34 +745,23 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
if constexpr (kHeads == kKVHeads) {
// Multi-Head Attention
static_assert(TConfig::kInterleaveQKV);
float* HWY_RESTRICT qkv =
activations.q.data() + batch_idx * kHeads * kQKVDim * 3;
MatVec<kHeads * kQKVDim * 3, kModelDim>(
layer_weights->qkv_einsum_w, 0, x, activations.even_odd.data(), qkv,
pool);
pool.Run(0, kHeads, [&](const uint64_t head, size_t thread) HWY_ATTR {
// linear projections to QKV
const size_t head_offset = TConfig::kInterleaveQKV
? 3 * kQKVDim * kModelDim
: kQKVDim * kModelDim;
const size_t mat_offset =
TConfig::kInterleaveQKV ? kQKVDim * kModelDim : kModelDim * kModelDim;
const size_t q_offset = head * head_offset + 0 * mat_offset;
const size_t k_offset = head * head_offset + 1 * mat_offset;
const size_t v_offset = head * head_offset + 2 * mat_offset;
// ProjQ
float* HWY_RESTRICT q =
activations.q.data() + head * kQKVDim + batch_idx * kHeads * kQKVDim;
MatVecLoop<kQKVDim, kModelDim>(
layer_weights->qkv_einsum_w, q_offset + 0 * kQKVDim * kModelDim, x,
activations.even_odd.data() + thread * kModelDim, q);
// ProjKV
float* HWY_RESTRICT q = qkv + head * kQKVDim * 3;
const size_t kv_offset = cache_pos * kCachePosSize +
layer * kCacheLayerSize + head * kQKVDim * 2;
float* HWY_RESTRICT k = kv_cache.kv_cache.get() + kv_offset;
float* HWY_RESTRICT v = k + kQKVDim;
TwoOfsMatVecLoop<kQKVDim, kModelDim>(layer_weights->qkv_einsum_w,
k_offset, v_offset, x, k, v);
Rope(k, TConfig::kUseHalfRope ? kQKVDim / 2 : kQKVDim, pos);
float* HWY_RESTRICT kv = kv_cache.kv_cache.get() + kv_offset;
Attn(head, head * kQKVDim * 2, thread);
memcpy(kv, q + kQKVDim, 2 * kQKVDim * sizeof(float));
Rope(kv, TConfig::kUseHalfRope ? kQKVDim / 2 : kQKVDim, pos);
Attn(q, head, head * kQKVDim * 2, thread);
});
} else {
// Multi-Query Attention
@ -790,7 +779,7 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
Rope(kv, TConfig::kUseHalfRope ? kQKVDim / 2 : kQKVDim, pos);
pool.Run(0, kHeads, [&](const uint64_t head, size_t thread) HWY_ATTR {
Attn(head, 0, thread);
Attn(q + head * kQKVDim, head, 0, thread);
});
}

View File

@ -25,6 +25,7 @@
#include <random>
#include <type_traits> // std::enable_if_t
#include "compression/compress.h" // CompressedArray
#include "hwy/base.h"
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/profiler.h"
@ -92,6 +93,52 @@ HWY_INLINE constexpr size_t RowsPerStrip() {
return kRowsPerStrip;
}
// Accumulates the product of two row-major matrices into `out`:
// out[i][j] += sum over k of a[i][k] * b[k][j], where `a` is kM x kN, `b` is
// kN x kK and `out` is kM x kK. Because the result is added to the existing
// contents of `out`, the caller must zero-initialize `out` to obtain the plain
// product.
//
// Largely unoptimized; the reordered innermost loops (i, k, j instead of
// i, j, k) net a ~5-10X speedup on ops_test across instruction sets.
template <size_t kM, size_t kN, size_t kK>
HWY_INLINE void MatMul(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
                       float* HWY_RESTRICT out) {
  // size_t indices match the unsigned template dimensions.
  for (size_t i = 0; i < kM; ++i) {
    for (size_t k = 0; k < kN; ++k) {
      // Invariant across the innermost loop.
      const float a_ik = a[i * kN + k];
      for (size_t j = 0; j < kK; ++j) {
        out[i * kK + j] += a_ik * b[k * kK + j];
      }
    }
  }
}
// Converts `size` bf16 values from `vec_aligned` to f32 and writes them to
// `out` in even-odd order: for every group of `hn::Lanes(dbf16)` inputs, the
// even-indexed elements are stored first, followed by the odd-indexed ones.
// `size` must be a multiple of the bf16 vector length.
HWY_INLINE void ToEvenOddF32(const hwy::bfloat16_t* HWY_RESTRICT vec_aligned,
                             const size_t size, float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> df;
  const hn::Repartition<hwy::bfloat16_t, decltype(df)> dbf16;
  const size_t lanes_f32 = hn::Lanes(df);
  const size_t lanes_bf16 = hn::Lanes(dbf16);
  HWY_DASSERT(size % lanes_bf16 == 0);
  HWY_DASSERT(hn::IsAligned(df, vec_aligned));
  for (size_t ofs = 0; ofs < size; ofs += lanes_bf16) {
    const auto packed = hn::LoadU(dbf16, vec_aligned + ofs);
    float* const dst = out + ofs;
    // First half of the group: even lanes; second half: odd lanes.
    hn::Store(hn::PromoteEvenTo(df, packed), df, dst);
    hn::Store(hn::PromoteOddTo(df, packed), df, dst + lanes_f32);
  }
}
// Rearranges `size` f32 values from `vec_aligned` into even-odd order in
// `out`: for every group of `2 * hn::Lanes(df)` inputs, the even-indexed
// elements are stored first, followed by the odd-indexed ones. `size` must be
// a multiple of twice the f32 vector length.
HWY_INLINE void ToEvenOddF32(const float* HWY_RESTRICT vec_aligned,
                             const size_t size, float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> df;
  using VF = hn::Vec<decltype(df)>;
  const size_t lanes = hn::Lanes(df);
  const size_t group = lanes * 2;
  HWY_DASSERT(size % group == 0);
  HWY_DASSERT(hn::IsAligned(df, vec_aligned));
  for (size_t ofs = 0; ofs < size; ofs += group) {
    VF evens, odds;
    // Splits one interleaved group into its even and odd elements.
    hn::LoadInterleaved2(df, vec_aligned + ofs, evens, odds);
    hn::Store(evens, df, out + ofs);
    hn::Store(odds, df, out + ofs + lanes);
  }
}
// Simple version without tiling nor threading.
// even_odd is precomputed for the current thread.
template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
@ -104,12 +151,6 @@ HWY_INLINE void MatVecAddLoop(const ArrayT& mat, const size_t mat_ofs,
PROFILER_ZONE("MatVecAddLoop");
const hn::ScalableTag<float> df;
// Sanity check: we can write without race conditions.
if (HWY_IS_TSAN) {
even_odd[0] = hwy::ConvertScalarTo<float>(vec_aligned[0]);
even_odd[kInner - 1] = -even_odd[0];
}
for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
const size_t row_ofs = mat_ofs + idx_row * kInner;
if constexpr (kAdd) {
@ -121,14 +162,40 @@ HWY_INLINE void MatVecAddLoop(const ArrayT& mat, const size_t mat_ofs,
}
}
#if !defined(HWY_NATIVE_DOT_BF16) || !HWY_NATIVE_DOT_BF16
// Overload for bf16 matrices, only compiled when HWY_NATIVE_DOT_BF16 is
// undefined or zero. Converts `vec_aligned` once into even-odd f32 order
// (written to `even_odd`, kInner elements), then stores into `out[r]` the
// even-odd dot product of row r with that buffer, plus `add[r]` if kAdd.
template <bool kAdd, size_t kOuter, size_t kInner, typename VecT, typename AddT,
          size_t kCapacity>
HWY_INLINE void MatVecAddLoop(
    const CompressedArray<hwy::bfloat16_t, kCapacity>& mat,
    const size_t mat_ofs, const VecT* HWY_RESTRICT vec_aligned,
    const AddT* HWY_RESTRICT add, float* HWY_RESTRICT even_odd,
    float* HWY_RESTRICT out) {
  PROFILER_ZONE("MatVecAddLoop");
  const hn::ScalableTag<float> df;
  ToEvenOddF32(vec_aligned, kInner, even_odd);
  size_t row_ofs = mat_ofs;
  for (size_t row = 0; row < kOuter; ++row, row_ofs += kInner) {
    const float dot =
        Dot</*kVecEO=*/true>(df, mat, row_ofs, even_odd, kInner);
    if constexpr (kAdd) {
      out[row] = hwy::ConvertScalarTo<float>(add[row]) + dot;
    } else {
      out[row] = dot;
    }
  }
}
#endif
// even_odd is precomputed for the current thread.
template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
HWY_INLINE void MatVecLoop(const ArrayT& mat, const size_t mat_ofs,
const VecT* HWY_RESTRICT vec_aligned,
float* HWY_RESTRICT even_odd,
float* HWY_RESTRICT out) {
MatVecAddLoop</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
mat, mat_ofs, vec_aligned, /*add=*/nullptr, even_odd, out);
MatVecAddLoop</*kAdd=*/false, kOuter, kInner>(
mat, mat_ofs, vec_aligned, /*add=*/static_cast<VecT*>(nullptr), even_odd,
out);
}
// Simple version without tiling nor threading, but two offsets/outputs.
@ -176,20 +243,23 @@ namespace detail {
// For each i = [0, num_rows), compute partial (length `num_cols`) dot product
// of row i with `vec_aligned` and add into `out[i]`. The upper-left coordinate
// of the tile is r0, c0.
template <class DF, typename ArrayT, typename VecT>
template <bool kVecEO, class DF, typename ArrayT, typename VecT>
HWY_INLINE void AccumulatePartialDotProducts(
DF df, const ArrayT& mat, size_t mat_ofs, size_t mat_stride, size_t r0,
size_t c0, size_t num_rows, size_t num_cols,
const VecT* HWY_RESTRICT vec_aligned, float* HWY_RESTRICT out) {
for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
out[idx_row] += Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
out[idx_row] +=
Dot<kVecEO>(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
}
}
// Same as above, but sets out[i] to the first partial dot product +
// init (if kInit), which avoids having to zero-initialize and accumulate.
template <bool kInit, class DF, typename ArrayT, typename VecT, typename InitT>
// Same as AccumulatePartialDotProducts, but sets out[i] to the first partial
// dot product + init (if kInit), which avoids having to zero-initialize and
// accumulate.
template <bool kVecEO, bool kInit, class DF, typename ArrayT, typename VecT,
typename InitT>
HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat,
size_t mat_ofs, size_t mat_stride,
size_t r0, size_t c0,
@ -200,10 +270,12 @@ HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat,
for (size_t idx_row = 0; idx_row < num_rows; ++idx_row) {
const size_t row_ofs = mat_ofs + (r0 + idx_row) * mat_stride;
if constexpr (kInit) {
out[idx_row] = hwy::ConvertScalarTo<float>(init[idx_row + r0]) +
Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
out[idx_row] =
hwy::ConvertScalarTo<float>(init[idx_row + r0]) +
Dot<kVecEO>(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
} else {
out[idx_row] = Dot(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
out[idx_row] =
Dot<kVecEO>(df, mat, row_ofs + c0, vec_aligned + c0, num_cols);
}
}
}
@ -212,7 +284,8 @@ HWY_INLINE void SetFirstPartialDotProducts(DF df, const ArrayT& mat,
// horizontal strip of the entire matrix); the result is the full dot product
// for rows r in [r0, r0 + num_rows) + optionally the add vector, which we store
// into out[r - r0].
template <bool kAdd, class DF, typename ArrayT, typename VecT, typename AddT>
template <bool kVecEO, bool kAdd, class DF, typename ArrayT, typename VecT,
typename AddT>
HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat,
size_t mat_ofs, size_t mat_stride,
size_t r0, size_t num_rows,
@ -221,42 +294,37 @@ HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat,
float* HWY_RESTRICT out) {
// Tall and skinny: set `out` to the single dot product.
if (mat_stride < MaxCols()) {
SetFirstPartialDotProducts<kAdd>(df, mat, mat_ofs, mat_stride, r0, 0,
num_rows, mat_stride, vec_aligned, add,
out);
SetFirstPartialDotProducts<kVecEO, kAdd>(df, mat, mat_ofs, mat_stride, r0,
0, num_rows, mat_stride,
vec_aligned, add, out);
return;
}
// We have at least MaxCols, so start by setting `out` to that:
SetFirstPartialDotProducts<kAdd>(df, mat, mat_ofs, mat_stride, r0, 0,
num_rows, MaxCols(), vec_aligned, add, out);
SetFirstPartialDotProducts<kVecEO, kAdd>(df, mat, mat_ofs, mat_stride, r0, 0,
num_rows, MaxCols(), vec_aligned,
add, out);
// For further multiples of MaxCols, accumulate. Remainders handled below.
size_t c0 = MaxCols();
for (; c0 <= mat_stride - MaxCols(); c0 += MaxCols()) {
AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, num_rows,
MaxCols(), vec_aligned, out);
AccumulatePartialDotProducts<kVecEO>(df, mat, mat_ofs, mat_stride, r0, c0,
num_rows, MaxCols(), vec_aligned, out);
}
if (c0 < mat_stride) { // Final cols
AccumulatePartialDotProducts(df, mat, mat_ofs, mat_stride, r0, c0, num_rows,
mat_stride - c0, vec_aligned, out);
AccumulatePartialDotProducts<kVecEO>(df, mat, mat_ofs, mat_stride, r0, c0,
num_rows, mat_stride - c0, vec_aligned,
out);
}
}
} // namespace detail
// Stores dot products of rows with `vec_aligned` + add the values from `add`
// (if kAdd), then stores them to `out`.
// `even_odd` has kInner elements for each thread.
template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
typename VecT, typename AddT>
HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
const VecT* HWY_RESTRICT const vec_aligned,
const AddT* HWY_RESTRICT const add,
float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
hwy::ThreadPool& pool) {
PROFILER_ZONE("MatVecAdd");
template <bool kVecIsEvenOdd, bool kAdd, size_t kOuter, size_t kInner,
typename ArrayT, typename VecT, typename AddT>
HWY_INLINE void MatVecAddInner(const ArrayT& mat, const size_t mat_ofs,
const VecT* HWY_RESTRICT const vec_aligned,
const AddT* HWY_RESTRICT const add,
float* HWY_RESTRICT even_odd,
float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
const hn::ScalableTag<float> df;
constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
@ -274,9 +342,9 @@ HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
PROFILER_ZONE("MatVec.lambda");
const size_t r0 = strip * kRowsPerStrip;
detail::FullDotProductsForStrip<kAdd>(df, mat, mat_ofs, kInner, r0,
kRowsPerStrip, vec_aligned, add,
out + r0);
detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
df, mat, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add,
out + r0);
});
// Remaining rows
@ -284,18 +352,47 @@ HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
if (r0 < kOuter) {
PROFILER_ZONE("MatVec remainder");
const size_t num_rows = kOuter - r0;
detail::FullDotProductsForStrip<kAdd>(df, mat, mat_ofs, kInner, r0,
num_rows, vec_aligned, add, out + r0);
detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
df, mat, mat_ofs, kInner, r0, num_rows, vec_aligned, add, out + r0);
}
}
} // namespace detail
// Stores dot products of rows with `vec_aligned` + add the values from `add`
// (if kAdd), then stores them to `out`.
//
// Unless HWY_NATIVE_DOT_BF16 is defined and non-zero, matrices whose
// CompressTraits report kSupportsEvenOdd take the even-odd path when VecT is
// float or bf16: `vec_aligned` is first converted into `even_odd` (kInner
// floats) and that buffer is then used as the vector operand. All other
// combinations use `vec_aligned` directly.
template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
          typename VecT, typename AddT>
HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
                          const VecT* HWY_RESTRICT const vec_aligned,
                          const AddT* HWY_RESTRICT const add,
                          float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
                          hwy::ThreadPool& pool) {
  PROFILER_ZONE("MatVecAdd");
#if !defined(HWY_NATIVE_DOT_BF16) || !HWY_NATIVE_DOT_BF16
  using MatT = typename ArrayT::value_type;
  constexpr bool kUseEvenOdd =
      CompressTraits<MatT>::kSupportsEvenOdd &&
      hwy::IsSameEither<VecT, float, hwy::bfloat16_t>();
  if constexpr (kUseEvenOdd) {
    ToEvenOddF32(vec_aligned, kInner, even_odd);
    // The deinterleaved copy replaces `vec_aligned` as the vector operand.
    detail::MatVecAddInner</*kVecIsEvenOdd=*/true, kAdd, kOuter, kInner>(
        mat, mat_ofs, even_odd, add, even_odd, out, pool);
    return;
  }
#endif
  detail::MatVecAddInner</*kVecIsEvenOdd=*/false, kAdd, kOuter, kInner>(
      mat, mat_ofs, vec_aligned, add, even_odd, out, pool);
}
template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
const VecT* HWY_RESTRICT const vec_aligned,
float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
hwy::ThreadPool& pool) {
MatVecAdd</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
mat, mat_ofs, vec_aligned, /*add=*/nullptr, even_odd, out, pool);
MatVecAdd</*kAdd=*/false, kOuter, kInner>(mat, mat_ofs, vec_aligned,
/*add=*/static_cast<VecT*>(nullptr),
even_odd, out, pool);
}
template <class D, HWY_IF_F32_D(D)>
@ -417,17 +514,18 @@ HWY_NOINLINE void TwoMatVecAdd(
const hn::ScalableTag<float> df;
constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
constexpr bool kVecIsEvenOdd = false;
// For each entire strip.
pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
PROFILER_ZONE("TwoMatVec.lambda");
const size_t r0 = strip * kRowsPerStrip;
detail::FullDotProductsForStrip<kAdd>(df, mat0, mat_ofs, kInner, r0,
kRowsPerStrip, vec_aligned, add0,
out0 + r0);
detail::FullDotProductsForStrip<kAdd>(df, mat1, mat_ofs, kInner, r0,
kRowsPerStrip, vec_aligned, add1,
out1 + r0);
detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
df, mat0, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add0,
out0 + r0);
detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
df, mat1, mat_ofs, kInner, r0, kRowsPerStrip, vec_aligned, add1,
out1 + r0);
});
// Remaining rows
@ -435,9 +533,9 @@ HWY_NOINLINE void TwoMatVecAdd(
if (r0 < kOuter) {
PROFILER_ZONE("TwoMatVec remainder");
const size_t num_rows = kOuter - r0;
detail::FullDotProductsForStrip<kAdd>(
detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
df, mat0, mat_ofs, kInner, r0, num_rows, vec_aligned, add0, out0 + r0);
detail::FullDotProductsForStrip<kAdd>(
detail::FullDotProductsForStrip<kVecIsEvenOdd, kAdd>(
df, mat1, mat_ofs, kInner, r0, num_rows, vec_aligned, add1, out1 + r0);
}
}

View File

@ -17,6 +17,8 @@
#define HWY_DISABLED_TARGETS HWY_SCALAR
#endif
#include <stddef.h>
#include <algorithm>
#include <array>
#include <random>
@ -376,6 +378,25 @@ CompressedArray<float, kOuter * kInner> GenerateMat(size_t offset) {
return mat;
}
// Returns a kOuter x kInner compressed float matrix whose source content is
// all zeros, with its scale set to 1. `offset` is unused.
template <size_t kOuter, size_t kInner>
CompressedArray<float, kOuter * kInner> GenerateZeroMat(size_t offset) {
  // Two fewer threads than the hardware reports, clamped to [1, 4].
  const int num_workers = std::clamp(
      static_cast<int>(std::thread::hardware_concurrency()) - 2, 1, 4);
  hwy::ThreadPool pool(static_cast<size_t>(num_workers));
  gcpp::CompressWorkingSet ws;
  CompressedArray<float, kOuter * kInner> mat;
  std::array<float, kOuter * kInner> content;
  // Each task clears one row.
  pool.Run(0, kOuter, [&](const size_t i, size_t thread) {
    std::fill_n(content.data() + i * kInner, kInner, 0.0f);
  });
  Compress(content, ws, mat, pool);
  mat.set_scale(1.0f);
  return mat;
}
template <size_t length>
hwy::AlignedFreeUniquePtr<float[]> GenerateVec(size_t offset) {
hwy::AlignedFreeUniquePtr<float[]> vec = hwy::AllocateAligned<float>(length);
@ -386,6 +407,25 @@ hwy::AlignedFreeUniquePtr<float[]> GenerateVec(size_t offset) {
return vec;
}
// A simple matrix multiplication. No optimization / tiling.
// Treats `a` as a row-major kM x kN matrix and `b` as a row-major kN x kK
// matrix, and returns a newly allocated, zero-initialized-then-accumulated
// row-major kM x kK product.
template <size_t kM, size_t kN, size_t kK>
hwy::AlignedFreeUniquePtr<float[]> SimpleMatMul(
    const hwy::AlignedFreeUniquePtr<float[]>& a,
    const hwy::AlignedFreeUniquePtr<float[]>& b) {
  hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(kM * kK);
  hwy::ZeroBytes(out.get(), kM * kK * sizeof(float));
  // size_t indices match the unsigned template dimensions.
  for (size_t i = 0; i < kM; ++i) {
    for (size_t j = 0; j < kK; ++j) {
      for (size_t k = 0; k < kN; ++k) {
        out[i * kK + j] += a[i * kN + k] * b[k * kK + j];
      }
    }
  }
  return out;
}
template <size_t kOuter, size_t kInner>
hwy::AlignedFreeUniquePtr<float[]> SimpleMatVecAdd(
const CompressedArray<float, kOuter * kInner>& mat,
@ -417,6 +457,52 @@ void AssertClose(const hwy::AlignedFreeUniquePtr<float[]>& a,
}
}
// Checks that the first `num` elements of `actual` match `expected` within a
// relative tolerance of 20 / 2^MantissaBits<MatT> of the expected value's
// magnitude. On the first mismatch, prints both values to stderr and aborts
// via HWY_ASSERT. An expected value of exactly zero yields a zero tolerance,
// i.e. requires an exact match.
template <typename MatT>
void AssertClose(const hwy::AlignedFreeUniquePtr<MatT[]>& expected,
                 const hwy::AlignedFreeUniquePtr<MatT[]>& actual, size_t num) {
  for (size_t idx = 0; idx < num; idx++) {
    const double expected_value = hwy::ConvertScalarTo<double>(expected[idx]);
    const double actual_value = hwy::ConvertScalarTo<double>(actual[idx]);
    // Scale by the magnitude: a negative tolerance would make the interval
    // below empty and reject even exact matches for negative expected values.
    const double magnitude =
        expected_value < 0.0 ? -expected_value : expected_value;
    const double tolerance =
        magnitude * 20 * 1.0 / (1ULL << hwy::MantissaBits<MatT>());
    // Written as a negated conjunction so that NaN values also fail.
    if (!(expected_value - tolerance <= actual_value &&
          actual_value <= expected_value + tolerance)) {
      fprintf(stderr, "expected[%zu]: %f, actual[%zu]: %f\n", idx,
              expected_value, idx, actual_value);
      HWY_ASSERT(0);
    }
  }
}
void TestMatMul() {
hwy::ThreadPool pool(0);
constexpr size_t kM = 128 * 3; // 384
constexpr size_t kK = 128 * 5; // 640
constexpr size_t kN = 128 * 6; // 768
CompressedArray<float, kM * kN> a1 = GenerateMat<kM, kN>(0);
CompressedArray<float, kN * kK> b1 = GenerateMat<kN, kK>(0);
hwy::AlignedFreeUniquePtr<float[]> a = hwy::AllocateAligned<float>(kM * kN);
Decompress(a1, 0, a.get(), kM * kN);
hwy::AlignedFreeUniquePtr<float[]> b = hwy::AllocateAligned<float>(kN * kK);
Decompress(b1, 0, b.get(), kN * kK);
hwy::AlignedFreeUniquePtr<float[]> expected_out1 =
SimpleMatMul<kM, kN, kK>(a, b);
CompressedArray<float, kM * kK> compressed_c = GenerateZeroMat<kM, kK>(0);
hwy::AlignedFreeUniquePtr<float[]> c = hwy::AllocateAligned<float>(kM * kK);
Decompress(compressed_c, 0, c.get(), kM * kK);
MatMul<kM, kN, kK>(a.get(), b.get(), c.get());
AssertClose(expected_out1, c, kM * kK);
}
void TestMatVecAdd() {
hwy::ThreadPool pool(0);
constexpr size_t kOuter = 128 * 3;
@ -436,24 +522,6 @@ void TestMatVecAdd() {
AssertClose<kOuter>(actual_out, expected_out);
}
// Verifies MatVecAddLoop (with kAdd = true) against the SimpleMatVecAdd
// reference for a 384 x 640 generated matrix.
void TestMatVecAddLoop() {
constexpr size_t kOuter = 128 * 3;
constexpr size_t kInner = 128 * 5;
// Generated inputs: matrix, vector of kInner elements, addend of kOuter.
CompressedArray<float, kOuter * kInner> mat = GenerateMat<kOuter, kInner>(0);
hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
// Scratch buffer of kInner floats passed as the `even_odd` argument.
hwy::AlignedFreeUniquePtr<float[]> even_odd =
hwy::AllocateAligned<float>(kInner);
// Reference result.
hwy::AlignedFreeUniquePtr<float[]> expected_out =
SimpleMatVecAdd<kOuter, kInner>(mat, vec, add);
hwy::AlignedFreeUniquePtr<float[]> actual_out =
hwy::AllocateAligned<float>(kOuter);
// All allocations must have succeeded before they are dereferenced.
HWY_ASSERT(vec && add && even_odd && expected_out && actual_out);
MatVecAddLoop<true, kOuter, kInner>(mat, 0, vec.get(), add.get(),
even_odd.get(), actual_out.get());
AssertClose<kOuter>(actual_out, expected_out);
}
void TestTwoMatVecAdd() {
hwy::ThreadPool pool(0);
constexpr size_t kOuter = 128 * 3;
@ -536,8 +604,8 @@ HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConst);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstAndAdd);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSoftmax);
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllCreateDistribution);
HWY_EXPORT_AND_TEST_P(OpsTest, TestMatMul);
HWY_EXPORT_AND_TEST_P(OpsTest, TestMatVecAdd);
HWY_EXPORT_AND_TEST_P(OpsTest, TestMatVecAddLoop);
HWY_EXPORT_AND_TEST_P(OpsTest, TestTwoMatVecAdd);
HWY_EXPORT_AND_TEST_P(OpsTest, TestTwoOfsMatVecAddLoop);
HWY_EXPORT_AND_TEST_P(OpsTest, TestSigmoid);