Add per-thread even_odd storage for #166.

Also inline the ProjQ and ProjKV lambdas and add missing includes/deps
for ops_test.

PiperOrigin-RevId: 629460608
Author:    Jan Wassenberg (committed by Copybara-Service)
Date:      2024-04-30 10:41:52 -07:00
Commit:    12fb2f05cf
Parent:    8f04a8346d
Changed:   6 files, 109 additions and 65 deletions
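Sketch of the idea behind the change (not part of the commit): the new `even_odd` argument is a flat scratch buffer with room for `kInner` floats per pool worker, and each worker only writes its own slice; the TSAN-only writes added in `MatVecAdd` exercise exactly this indexing. The names `kInner`, `kNumWorkers` and `ThreadSlice` below are illustrative only.

// Minimal sketch of the per-thread scratch indexing convention, assuming a
// buffer of kInner * kNumWorkers floats (cf. even_odd in Activations).
#include <cstddef>
#include <cstdio>
#include <vector>

constexpr std::size_t kInner = 8;       // elements needed per worker
constexpr std::size_t kNumWorkers = 4;  // pool size (bounded by kMaxThreads)

// Worker `thread` may only touch [thread * kInner, (thread + 1) * kInner).
float* ThreadSlice(float* even_odd, std::size_t thread) {
  return even_odd + thread * kInner;
}

int main() {
  std::vector<float> even_odd(kInner * kNumWorkers);
  for (std::size_t thread = 0; thread < kNumWorkers; ++thread) {
    float* scratch = ThreadSlice(even_odd.data(), thread);
    // Mirrors the TSAN sanity check: write the first and last element of this
    // worker's slice; overlapping slices would race under a thread sanitizer.
    scratch[0] = -static_cast<float>(thread);
    scratch[kInner - 1] = static_cast<float>(thread);
  }
  std::printf("even_odd holds %zu floats\n", even_odd.size());
}

In the commit itself, Activations owns one such buffer (kModelDim * kMaxThreads floats) and passes activations.even_odd.data(), plus a per-thread offset for the *Loop variants, into the MatVec/MatVecAdd calls shown in the diff below.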


@@ -46,8 +46,10 @@ cc_test(
     deps = [
         ":ops",
         "@googletest//:gtest_main",
+        "//compression:compress",
         "@hwy//:hwy",
         "@hwy//:hwy_test_util",
+        "@hwy//:thread_pool",
     ],
 )


@@ -28,6 +28,11 @@
 #define GEMMA_TOPK 1
 #endif  // !GEMMA_TOPK
 
+// Allow changing upper bound on threads as a compiler flag
+#ifndef GEMMA_MAX_THREADS
+#define GEMMA_MAX_THREADS 128
+#endif  // !GEMMA_MAX_THREADS
+
 #include <stddef.h>
 
 #include <array>
@@ -45,6 +50,7 @@ namespace gcpp {
 static constexpr size_t kSeqLen = GEMMA_MAX_SEQLEN;
 static constexpr size_t kTopK = GEMMA_TOPK;
+static constexpr size_t kMaxThreads = GEMMA_MAX_THREADS;
 
 enum class LayerAttentionType {
   kGemma,


@@ -421,6 +421,10 @@ struct Activations {
   std::array<float, kBatchSize * kModelDim> ffw_out;
   std::array<float, kBatchSize * TConfig::kVocabSize> logits;
 
+  // For bf16/f32 vectors * bf16 matrix: faster to unpack once beforehand, into
+  // per-thread storage.
+  std::array<float, kModelDim * kMaxThreads> even_odd;
+
   // Griffin layer internal activations
   static constexpr size_t kGriffinDim =
       TConfig::kGriffinLayers > 0 ? kModelDim : 0;
@@ -575,13 +579,14 @@ HWY_NOINLINE void GriffinRecurrent(
       gcpp::Activations<TConfig, kBatchSize>::kModelDim;
   static constexpr size_t kConv1dWidth = TConfig::kConv1dWidth;
   static constexpr size_t kHeads = TConfig::kHeads;
+  static constexpr bool kAdd = true;
   const size_t batch_offset = batch_idx * kModelDim;
   const size_t pos = batch_start + batch_idx;
 
   // X / Y linear layers.
   float* HWY_RESTRICT y = activations.griffin_y.data() + batch_offset;
   float* HWY_RESTRICT x = activations.griffin_x.data() + batch_offset;
-  TwoMatVecAdd<true, kModelDim, kModelDim>(
+  TwoMatVecAdd<kAdd, kModelDim, kModelDim>(
      layer_weights->griffin.linear_x_w, layer_weights->griffin.linear_y_w, 0,
      activations.pre_att_rms_out.data() + batch_offset,
      /*add0=*/layer_weights->griffin.linear_x_biases.data(),
@@ -631,7 +636,7 @@ HWY_NOINLINE void GriffinRecurrent(
     constexpr size_t kHeadDim = kModelDim / kHeads;
     constexpr size_t kMatrixSize = kHeadDim * kHeadDim;
     size_t head_offset = head * kHeadDim;
-    TwoOfsMatVecAddLoop<true, kHeadDim, kHeadDim>(
+    TwoOfsMatVecAddLoop<kAdd, kHeadDim, kHeadDim>(
        layer_weights->griffin.gate_w, kMatrixSize * head,
        kMatrixSize * (kHeads + head), x + head_offset,
        /*add0=*/layer_weights->griffin.gate_biases.data() + head_offset,
@@ -670,9 +675,10 @@ HWY_NOINLINE void GriffinRecurrent(
 
   // Final linear layer.
   float* out_ptr = activations.att_post2.data() + batch_idx * kModelDim;
-  MatVecAdd<true, kModelDim, kModelDim>(
+  MatVecAdd<kAdd, kModelDim, kModelDim>(
      layer_weights->griffin.linear_out_w, 0, x,
-     layer_weights->griffin.linear_out_biases.data(), out_ptr, pool);
+     layer_weights->griffin.linear_out_biases.data(),
+     activations.even_odd.data(), out_ptr, pool);
 }
 
 template <size_t kBatchSize, typename LayerT, class TConfig>
@@ -704,26 +710,7 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
   float* x = activations.pre_att_rms_out.data() + batch_idx * kModelDim;
 
-  auto ProjQ = [&](uint64_t head, size_t head_offset) HWY_ATTR {
-    float* HWY_RESTRICT q =
-        activations.q.data() + head * kQKVDim + batch_idx * kHeads * kQKVDim;
-    MatVecLoop<kQKVDim, kModelDim>(layer_weights->qkv_einsum_w,
-                                   head_offset + 0 * kQKVDim * kModelDim, x, q);
-  };
-
-  auto ProjKV = [&](size_t k_offset, size_t v_offset,
-                    size_t kv_offset) HWY_ATTR {
-    float* HWY_RESTRICT k = kv_cache.kv_cache.get() + kv_offset;
-    float* HWY_RESTRICT v = k + kQKVDim;
-
-    TwoOfsMatVecLoop<kQKVDim, kModelDim>(layer_weights->qkv_einsum_w, k_offset,
-                                         v_offset, x, k, v);
-
-    Rope(k, TConfig::kUseHalfRope ? kQKVDim / 2 : kQKVDim, pos);
-  };
-
-  auto Attn = [&](uint64_t head, size_t head_offset) HWY_ATTR {
+  auto Attn = [&](uint64_t head, size_t head_offset, size_t thread) HWY_ATTR {
     // Calculate scores
     float* HWY_RESTRICT q =
         activations.q.data() + head * kQKVDim + batch_idx * kHeads * kQKVDim;
@@ -760,20 +747,21 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
         head == 0
            ? activations.att_post2.data() + batch_idx * kModelDim
            : activations.att_post1.data() + head * kBatchSize * kModelDim;
+    float* even_odd = activations.even_odd.data() + thread * kQKVDim;
     if (head == 0) {
       MatVecAddLoop<TConfig::kSoftmaxAttnOutputBiases, kModelDim, kQKVDim>(
          layer_weights->attn_vec_einsum_w, head * kModelDim * kQKVDim, att_out,
-         layer_weights->attention_output_biases.data(), head_out);
+         layer_weights->attention_output_biases.data(), even_odd, head_out);
     } else {
       MatVecLoop<kModelDim, kQKVDim>(layer_weights->attn_vec_einsum_w,
                                      head * kModelDim * kQKVDim, att_out,
-                                     head_out);
+                                     even_odd, head_out);
     }
   };
 
   if constexpr (kHeads == kKVHeads) {
     // Multi-Head Attention
-    pool.Run(0, kHeads, [&](const uint64_t head, size_t /*thread*/) HWY_ATTR {
+    pool.Run(0, kHeads, [&](const uint64_t head, size_t thread) HWY_ATTR {
       // linear projections to QKV
       const size_t head_offset = TConfig::kInterleaveQKV
                                      ? 3 * kQKVDim * kModelDim
@@ -784,32 +772,41 @@ HWY_NOINLINE void Attention(size_t batch_start, size_t batch_idx, size_t layer,
       const size_t k_offset = head * head_offset + 1 * mat_offset;
       const size_t v_offset = head * head_offset + 2 * mat_offset;
 
-      ProjQ(head, q_offset);
+      // ProjQ
+      float* HWY_RESTRICT q =
+          activations.q.data() + head * kQKVDim + batch_idx * kHeads * kQKVDim;
+      MatVecLoop<kQKVDim, kModelDim>(
+          layer_weights->qkv_einsum_w, q_offset + 0 * kQKVDim * kModelDim, x,
+          activations.even_odd.data() + thread * kModelDim, q);
 
-      const size_t kv_offset =
-          cache_pos * kCachePosSize + layer * kCacheLayerSize +
-          head * kQKVDim * 2;
+      // ProjKV
+      const size_t kv_offset = cache_pos * kCachePosSize +
+                               layer * kCacheLayerSize + head * kQKVDim * 2;
+      float* HWY_RESTRICT k = kv_cache.kv_cache.get() + kv_offset;
+      float* HWY_RESTRICT v = k + kQKVDim;
+      TwoOfsMatVecLoop<kQKVDim, kModelDim>(layer_weights->qkv_einsum_w,
+                                           k_offset, v_offset, x, k, v);
+      Rope(k, TConfig::kUseHalfRope ? kQKVDim / 2 : kQKVDim, pos);
 
-      ProjKV(k_offset, v_offset, kv_offset);
-      Attn(head, head * kQKVDim * 2);
+      Attn(head, head * kQKVDim * 2, thread);
     });
   } else {
     // Multi-Query Attention
     float* HWY_RESTRICT q = activations.q.data() + batch_idx * kHeads * kQKVDim;
-    MatVec<kHeads * kQKVDim, kModelDim>(layer_weights->qkv_einsum_w, 0, x, q,
-                                        pool);
+    MatVec<kHeads * kQKVDim, kModelDim>(layer_weights->qkv_einsum_w, 0, x,
+                                        activations.even_odd.data(), q, pool);
     float* HWY_RESTRICT kv = kv_cache.kv_cache.get() +
                              cache_pos * kCachePosSize +
                              layer * kCacheLayerSize;
     MatVec<kQKVDim * 2, kModelDim>(layer_weights->qkv_einsum_w,
-                                   kHeads * kQKVDim * kModelDim, x, kv, pool);
+                                   kHeads * kQKVDim * kModelDim, x,
+                                   activations.even_odd.data(), kv, pool);
     Rope(kv, TConfig::kUseHalfRope ? kQKVDim / 2 : kQKVDim, pos);
-    pool.Run(0, kHeads, [&](const uint64_t head, size_t /*thread*/) HWY_ATTR {
-      Attn(head, 0);
+    pool.Run(0, kHeads, [&](const uint64_t head, size_t thread) HWY_ATTR {
+      Attn(head, 0, thread);
    });
  }
@@ -829,6 +826,7 @@ HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
   static constexpr size_t kModelDim = TConfig::kModelDim;
   static constexpr size_t kFFHiddenDim = TConfig::kFFHiddenDim;
   const size_t hidden_offset = batch_idx * kFFHiddenDim * 2;
+  float* HWY_RESTRICT even_odd = activations.even_odd.data();
 
   {
     PROFILER_ZONE("Gen.FFW.GatedGELU");
@@ -837,15 +835,15 @@ HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
     float* HWY_RESTRICT out = activations.ffw_hidden.data() + hidden_offset;
     float* HWY_RESTRICT out_mul = out + kFFHiddenDim;
 
-    // Same matrix, first and second half of rows. Could fuse into one MatVec,
-    // but separating them could help on NUMA e.g. multiple sockets.
+    // Same matrix, first and second half of rows. Could fuse into one MatVec.
     MatVecAdd<TConfig::kFFBiases, kFFHiddenDim, kModelDim>(
         layer_weights->gating_einsum_w, kFFHiddenDim * kModelDim, vec,
-        layer_weights->ffw_gating_biases.data() + kFFHiddenDim, out_mul, pool);
+        layer_weights->ffw_gating_biases.data() + kFFHiddenDim, even_odd,
+        out_mul, pool);
 
     // Gate, will go through the nonlinearity.
     MatVecAdd<TConfig::kFFBiases, kFFHiddenDim, kModelDim>(
         layer_weights->gating_einsum_w, 0, vec,
-        layer_weights->ffw_gating_biases.data(), out, pool);
+        layer_weights->ffw_gating_biases.data(), even_odd, out, pool);
 
     namespace hn = hwy::HWY_NAMESPACE;
     using DF = hn::ScalableTag<float>;
@@ -858,7 +856,7 @@ HWY_NOINLINE void FFW(Activations<TConfig, kBatchSize>& activations,
   PROFILER_ZONE("Gen.FFW\\GatedGELU");
   MatVecAdd<TConfig::kFFBiases, kModelDim, kFFHiddenDim>(
       layer_weights->linear_w, 0, activations.ffw_hidden.data() + hidden_offset,
-      layer_weights->ffw_output_biases.data(),
+      layer_weights->ffw_output_biases.data(), even_odd,
      activations.ffw_out.data() + batch_idx * kModelDim, pool);
 }
@@ -1110,9 +1108,9 @@ void GenerateImpl(GemmaImpl<TConfig>& gemma, size_t max_tokens,
     if (is_generating_phase) {
       PROFILER_ZONE("Gen.Embedding");
       // Generation phase
-      MatVec<kVocabSize, TConfig::kModelDim>(weights.embedder_input_embedding,
-                                             0, final_activation,
-                                             activations.logits.data(), pool);
+      MatVec<kVocabSize, TConfig::kModelDim>(
+          weights.embedder_input_embedding, 0, final_activation,
+          activations.even_odd.data(), activations.logits.data(), pool);
       // Barrier: must have all logits so we can subtract max.
       Softmax(activations.logits.data(), kVocabSize);
       token = SampleTopK<TConfig::kTopK>(activations.logits.data(), kVocabSize,
@@ -1193,9 +1191,9 @@ float ComputeCrossEntropyImpl(GemmaImpl<TConfig>& gemma, size_t max_tokens,
     }
     Transformer(token, pos, weights, activations, kv_cache, pool,
                 /*layers_output=*/nullptr);
-    MatVec<kVocabSize, kModelDim>(weights.embedder_input_embedding, 0,
-                                  activations.x.data(),
-                                  activations.logits.data(), pool);
+    MatVec<kVocabSize, kModelDim>(
+        weights.embedder_input_embedding, 0, activations.x.data(),
+        activations.even_odd.data(), activations.logits.data(), pool);
     LogitsSoftCap(30.0f, activations.logits.data(), kVocabSize);
     memcpy(logits.data(), activations.logits.data(),
            kVocabSize * sizeof(logits[0]));


@@ -93,15 +93,23 @@ HWY_INLINE constexpr size_t RowsPerStrip() {
 }
 
 // Simple version without tiling nor threading.
+// even_odd is precomputed for the current thread.
 template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
           typename VecT, typename AddT>
 HWY_INLINE void MatVecAddLoop(const ArrayT& mat, const size_t mat_ofs,
                               const VecT* HWY_RESTRICT vec_aligned,
                               const AddT* HWY_RESTRICT add,
+                              float* HWY_RESTRICT even_odd,
                               float* HWY_RESTRICT out) {
   PROFILER_ZONE("MatVecAddLoop");
   const hn::ScalableTag<float> df;
 
+  // Sanity check: we can write without race conditions.
+  if (HWY_IS_TSAN) {
+    even_odd[0] = hwy::ConvertScalarTo<float>(vec_aligned[0]);
+    even_odd[kInner - 1] = -even_odd[0];
+  }
+
   for (size_t idx_row = 0; idx_row < kOuter; ++idx_row) {
     const size_t row_ofs = mat_ofs + idx_row * kInner;
     if constexpr (kAdd) {
@@ -113,12 +121,14 @@ HWY_INLINE void MatVecAddLoop(const ArrayT& mat, const size_t mat_ofs,
   }
 }
 
+// even_odd is precomputed for the current thread.
 template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
 HWY_INLINE void MatVecLoop(const ArrayT& mat, const size_t mat_ofs,
                            const VecT* HWY_RESTRICT vec_aligned,
+                           float* HWY_RESTRICT even_odd,
                            float* HWY_RESTRICT out) {
-  MatVecAddLoop<false, kOuter, kInner, ArrayT, VecT, VecT>(
-      mat, mat_ofs, vec_aligned, /*add=*/nullptr, out);
+  MatVecAddLoop</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+      mat, mat_ofs, vec_aligned, /*add=*/nullptr, even_odd, out);
 }
 
 // Simple version without tiling nor threading, but two offsets/outputs.
@@ -156,7 +166,7 @@ HWY_INLINE void TwoOfsMatVecLoop(const ArrayT& mat, const size_t mat_ofs0,
                                  const VecT* HWY_RESTRICT vec_aligned,
                                  float* HWY_RESTRICT out0,
                                  float* HWY_RESTRICT out1) {
-  TwoOfsMatVecAddLoop<false, kOuter, kInner, ArrayT, VecT, VecT>(
+  TwoOfsMatVecAddLoop</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
      mat, mat_ofs0, mat_ofs1, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
      out0, out1);
 }
@@ -237,19 +247,29 @@ HWY_INLINE void FullDotProductsForStrip(DF df, const ArrayT& mat,
 
 // Stores dot products of rows with `vec_aligned` + add the values from `add`
 // (if kAdd), then stores them to `out`.
-//
+// `even_odd` has kInner elements for each thread.
 template <bool kAdd, size_t kOuter, size_t kInner, typename ArrayT,
           typename VecT, typename AddT>
 HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
                           const VecT* HWY_RESTRICT const vec_aligned,
                           const AddT* HWY_RESTRICT const add,
-                          float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
+                          float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
+                          hwy::ThreadPool& pool) {
   PROFILER_ZONE("MatVecAdd");
   const hn::ScalableTag<float> df;
   constexpr size_t kRowsPerStrip = RowsPerStrip<kOuter>();
   constexpr size_t kNumStrips = kOuter / kRowsPerStrip;
 
+  // Sanity check: each thread can write without race conditions.
+  if (HWY_IS_TSAN) {
+    pool.Run(
+        0, pool.NumWorkers(), [even_odd](uint64_t /*task*/, size_t thread) {
+          even_odd[thread * kInner] = -static_cast<float>(thread);
+          even_odd[thread * kInner + kInner - 1] = static_cast<float>(thread);
+        });
+  }
+
   // For each entire strip.
   pool.Run(0, kNumStrips, [&](const uint64_t strip, size_t thread) HWY_ATTR {
     PROFILER_ZONE("MatVec.lambda");
@@ -272,9 +292,10 @@ HWY_INLINE void MatVecAdd(const ArrayT& mat, const size_t mat_ofs,
 template <size_t kOuter, size_t kInner, typename ArrayT, typename VecT>
 HWY_INLINE void MatVec(const ArrayT& mat, const size_t mat_ofs,
                        const VecT* HWY_RESTRICT const vec_aligned,
-                       float* HWY_RESTRICT out, hwy::ThreadPool& pool) {
-  MatVecAdd<false, kOuter, kInner, ArrayT, VecT, VecT>(
-      mat, mat_ofs, vec_aligned, /*add=*/nullptr, out, pool);
+                       float* HWY_RESTRICT even_odd, float* HWY_RESTRICT out,
+                       hwy::ThreadPool& pool) {
+  MatVecAdd</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
+      mat, mat_ofs, vec_aligned, /*add=*/nullptr, even_odd, out, pool);
 }
 
 template <class D, HWY_IF_F32_D(D)>
@@ -427,7 +448,7 @@ HWY_NOINLINE void TwoMatVec(const ArrayT& mat0, const ArrayT& mat1,
                             const VecT* HWY_RESTRICT vec_aligned,
                             float* HWY_RESTRICT out0, float* HWY_RESTRICT out1,
                             hwy::ThreadPool& pool) {
-  TwoMatVecAdd<false, kOuter, kInner, ArrayT, VecT, VecT>(
+  TwoMatVecAdd</*kAdd=*/false, kOuter, kInner, ArrayT, VecT, VecT>(
      mat0, mat1, mat_ofs, vec_aligned, /*add0=*/nullptr, /*add1=*/nullptr,
      out0, out1, pool);
 }


@@ -17,11 +17,15 @@
 #define HWY_DISABLED_TARGETS HWY_SCALAR
 #endif
 
+#include <algorithm>
 #include <array>
 #include <random>
+#include <vector>
 
+#include "compression/compress.h"
 #include "hwy/aligned_allocator.h"
 #include "hwy/base.h"
+#include "hwy/contrib/thread_pool/thread_pool.h"
 
 // clang-format off
 #undef HWY_TARGET_INCLUDE
@@ -375,6 +379,7 @@ CompressedArray<float, kOuter * kInner> GenerateMat(size_t offset) {
 template <size_t length>
 hwy::AlignedFreeUniquePtr<float[]> GenerateVec(size_t offset) {
   hwy::AlignedFreeUniquePtr<float[]> vec = hwy::AllocateAligned<float>(length);
+  HWY_ASSERT(vec);
   for (size_t idx = 0; idx < length; idx++) {
     vec[idx] = static_cast<float>(idx + offset);
   }
@@ -388,8 +393,9 @@ hwy::AlignedFreeUniquePtr<float[]> SimpleMatVecAdd(
     const hwy::AlignedFreeUniquePtr<float[]>& add) {
   hwy::AlignedFreeUniquePtr<float[]> uncompressed_mat =
       hwy::AllocateAligned<float>(kOuter * kInner);
-  Decompress(mat, 0, uncompressed_mat.get(), kOuter * kInner);
   hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(kOuter);
+  HWY_ASSERT(uncompressed_mat && out);
+  Decompress(mat, 0, uncompressed_mat.get(), kOuter * kInner);
   for (size_t idx_row = 0; idx_row < kOuter; idx_row++) {
     out[idx_row] = add[idx_row];
     for (size_t idx_col = 0; idx_col < kInner; idx_col++) {
@@ -418,12 +424,15 @@ void TestMatVecAdd() {
   CompressedArray<float, kOuter * kInner> mat = GenerateMat<kOuter, kInner>(0);
   hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
   hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
+  hwy::AlignedFreeUniquePtr<float[]> even_odd =
+      hwy::AllocateAligned<float>(kInner * pool.NumWorkers());
   hwy::AlignedFreeUniquePtr<float[]> expected_out =
       SimpleMatVecAdd<kOuter, kInner>(mat, vec, add);
   hwy::AlignedFreeUniquePtr<float[]> actual_out =
       hwy::AllocateAligned<float>(kOuter);
-  MatVecAdd<true, kOuter, kInner>(mat, 0, vec.get(), add.get(),
-                                  actual_out.get(), pool);
+  HWY_ASSERT(vec && add && even_odd && expected_out && actual_out);
+  MatVecAdd</*kAdd=*/true, kOuter, kInner>(
+      mat, 0, vec.get(), add.get(), even_odd.get(), actual_out.get(), pool);
   AssertClose<kOuter>(actual_out, expected_out);
 }
@@ -433,12 +442,15 @@ void TestMatVecAddLoop() {
   CompressedArray<float, kOuter * kInner> mat = GenerateMat<kOuter, kInner>(0);
   hwy::AlignedFreeUniquePtr<float[]> vec = GenerateVec<kInner>(0);
   hwy::AlignedFreeUniquePtr<float[]> add = GenerateVec<kOuter>(0);
+  hwy::AlignedFreeUniquePtr<float[]> even_odd =
+      hwy::AllocateAligned<float>(kInner);
   hwy::AlignedFreeUniquePtr<float[]> expected_out =
       SimpleMatVecAdd<kOuter, kInner>(mat, vec, add);
   hwy::AlignedFreeUniquePtr<float[]> actual_out =
       hwy::AllocateAligned<float>(kOuter);
+  HWY_ASSERT(vec && add && even_odd && expected_out && actual_out);
   MatVecAddLoop<true, kOuter, kInner>(mat, 0, vec.get(), add.get(),
-                                      actual_out.get());
+                                      even_odd.get(), actual_out.get());
   AssertClose<kOuter>(actual_out, expected_out);
 }
@@ -459,6 +471,8 @@ void TestTwoMatVecAdd() {
       hwy::AllocateAligned<float>(kOuter);
   hwy::AlignedFreeUniquePtr<float[]> actual_out1 =
       hwy::AllocateAligned<float>(kOuter);
+  HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
+             expected_out1 && actual_out1);
   TwoMatVecAdd<true, kOuter, kInner>(mat0, mat1, 0, vec.get(), add0.get(),
                                      add1.get(), actual_out0.get(),
                                      actual_out1.get(), pool);
@@ -481,6 +495,8 @@ void TestTwoOfsMatVecAddLoop() {
       hwy::AllocateAligned<float>(kOuter);
   hwy::AlignedFreeUniquePtr<float[]> actual_out1 =
       hwy::AllocateAligned<float>(kOuter);
+  HWY_ASSERT(vec && add0 && add1 && expected_out0 && actual_out0 &&
+             expected_out1 && actual_out1);
   TwoOfsMatVecAddLoop<true, kOuter, kInner>(mat, 0, 0, vec.get(), add0.get(),
                                             add1.get(), actual_out0.get(),
                                             actual_out1.get());


@@ -96,8 +96,9 @@ class AppArgs : public ArgsBase<AppArgs> {
   }
 
   static inline size_t GetSupportedThreadCount() {
-    return static_cast<size_t>(std::clamp(
-        static_cast<int>(std::thread::hardware_concurrency()) - 2, 1, 18));
+    return static_cast<size_t>(
+        std::clamp(static_cast<int>(std::thread::hardware_concurrency()) - 2, 1,
+                   HWY_MIN(static_cast<int>(kMaxThreads), 18)));
   }
 
   Path log;  // output
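For reference, a standalone sketch (not from the commit) of the resulting thread-count bound; `SupportedThreadCount` is an illustrative name, and `std::min` stands in for `HWY_MIN` so the snippet compiles without Highway.

#include <algorithm>
#include <cstddef>
#include <thread>

#ifndef GEMMA_MAX_THREADS
#define GEMMA_MAX_THREADS 128  // compile-time upper bound, overridable via -D
#endif

// Mirrors GetSupportedThreadCount above: subtract 2 from the hardware
// concurrency, then clamp to [1, min(GEMMA_MAX_THREADS, 18)].
std::size_t SupportedThreadCount() {
  const int hw = static_cast<int>(std::thread::hardware_concurrency());
  return static_cast<std::size_t>(
      std::clamp(hw - 2, 1, std::min(static_cast<int>(GEMMA_MAX_THREADS), 18)));
}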