mirror of https://github.com/google/gemma.cpp.git
Fix bench_matmul perf regression: A input should be padded
PiperOrigin-RevId: 781976414
commit 349c86f2d9 (parent 4bc44d5678)
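For context: the regression appears to have come from generating the benchmark's A input in packed layout (stride == cols), while MatMul prefers padded rows. Below is a small, self-contained illustration of the packed vs. padded distinction; the helper names and the lane count are made up for this sketch and are not gemma.cpp APIs, and the exact rounding used by MatPadding::kOdd may differ.

#include <cstddef>
#include <cstdio>

// Hypothetical helpers, not gemma.cpp's MatPadding implementation: a "packed"
// matrix stores rows back to back (stride == cols), while a padded one rounds
// the row stride up so every row starts at a vector-aligned offset.
static size_t PackedStride(size_t cols) { return cols; }
static size_t PaddedStride(size_t cols, size_t lanes) {
  return (cols + lanes - 1) / lanes * lanes;
}

int main() {
  const size_t cols = 2050;  // example column count
  const size_t lanes = 16;   // assumed SIMD lane count; the real value is per-target
  std::printf("packed stride: %zu\n", PackedStride(cols));         // 2050
  std::printf("padded stride: %zu\n", PaddedStride(cols, lanes));  // 2064
  return 0;
}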
@@ -69,45 +69,51 @@ void ForeachPackedAndRawType() {
 // Generates inputs: deterministic, within max SfpStream range.
 template <typename MatT>
-MatStorageT<MatT> GenerateMat(const Extents2D& extents, hwy::ThreadPool& pool) {
+MatStorageT<MatT> GenerateMat(const Extents2D& extents, MatPadding padding,
+                              hwy::ThreadPool& pool) {
   gcpp::CompressWorkingSet ws;
+  ws.tls.resize(pool.NumWorkers());
   MatStorageT<float> raw("raw", extents, MatPadding::kPacked);
-  MatStorageT<MatT> compressed("mat", extents, MatPadding::kPacked);
+  MatStorageT<MatT> compressed("mat", extents, padding);
   const float scale = SfpStream::kMax / extents.Area();
-  pool.Run(0, extents.rows, [&](const size_t r, size_t /*thread*/) {
+  pool.Run(0, extents.rows, [&](const size_t r, size_t thread) {
     float* HWY_RESTRICT row = raw.Row(r);
     for (size_t c = 0; c < extents.cols; c++) {
       float f = static_cast<float>(r * extents.cols + c) * scale;
       if ((r + c) & 1) f = -f;  // Also generate some negative values.
       row[c] = f;
     }
+    Compress(raw.Row(r), raw.Cols(), ws.tls[thread],
+             MakeSpan(compressed.Row(r), compressed.Cols()),
+             /*packed_ofs=*/0);
   });
 
-  Compress(raw.PackedScale1(), raw.Extents().Area(), ws, compressed.Span(),
-           /*packed_ofs=*/0, pool);
   compressed.SetScale(0.6f);  // Arbitrary value, different from 1.
   return compressed;
 }
 
-// `extents` describes the transposed matrix.
+// Same, but `extents` describes the transposed matrix.
 template <typename MatT>
 MatStorageT<MatT> GenerateTransposedMat(const Extents2D extents,
+                                        MatPadding padding,
                                         hwy::ThreadPool& pool) {
   gcpp::CompressWorkingSet ws;
+  ws.tls.resize(pool.NumWorkers());
   MatStorageT<float> raw("raw", extents, MatPadding::kPacked);
-  MatStorageT<MatT> compressed("trans", extents, MatPadding::kPacked);
+  MatStorageT<MatT> compressed("trans", extents, padding);
   const float scale = SfpStream::kMax / extents.Area();
-  pool.Run(0, extents.rows, [&](const size_t r, size_t /*thread*/) {
+  pool.Run(0, extents.rows, [&](const size_t r, size_t thread) {
     float* HWY_RESTRICT row = raw.Row(r);
     for (size_t c = 0; c < extents.cols; c++) {
       float f = static_cast<float>(c * extents.rows + r) * scale;
       if ((r + c) & 1) f = -f;  // Also generate some negative values.
       row[c] = f;
     }
+    Compress(raw.Row(r), raw.Cols(), ws.tls[thread],
+             MakeSpan(compressed.Row(r), compressed.Cols()),
+             /*packed_ofs=*/0);
   });
 
-  Compress(raw.PackedScale1(), raw.Extents().Area(), ws, compressed.Span(),
-           /*packed_ofs=*/0, pool);
   // Arbitrary value, different from 1, must match `GenerateMat`.
   compressed.SetScale(0.6f);
   return compressed;
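The generation functions now compress each row inside the pool.Run lambda, using one CompressWorkingSet slot per worker (ws.tls.resize(pool.NumWorkers())). The following is a self-contained sketch of that per-worker-scratch pattern using hwy::ThreadPool; the scratch type and the "compress" step are stand-ins, not gemma.cpp's CompressWorkingSet and Compress.

#include <cstdio>
#include <vector>

#include "hwy/contrib/thread_pool/thread_pool.h"

int main() {
  const size_t rows = 8, cols = 5;
  hwy::ThreadPool pool(4);  // 4 worker threads

  // One scratch buffer per worker, indexed by the thread id that Run passes
  // to the lambda -- mirrors ws.tls.resize(pool.NumWorkers()) in the diff.
  std::vector<std::vector<float>> scratch(pool.NumWorkers());

  std::vector<float> mat(rows * cols);
  pool.Run(0, rows, [&](uint64_t r, size_t thread) {
    std::vector<float>& tls = scratch[thread];
    tls.assign(cols, 0.0f);
    for (size_t c = 0; c < cols; ++c) {
      float f = static_cast<float>(r * cols + c);
      if ((r + c) & 1) f = -f;  // some negative values, as in GenerateMat
      tls[c] = f;
    }
    // "Compress" stand-in: copy the staged row into the output matrix.
    for (size_t c = 0; c < cols; ++c) mat[r * cols + c] = tls[c];
  });

  std::printf("mat[1][2] = %g\n", mat[1 * cols + 2]);  // expect -7
  return 0;
}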
@@ -89,12 +89,14 @@ void BenchMatMul(size_t M, size_t K, size_t N, bool add, MatMulEnv& env) {
 
   MatStorageT<float> add_storage("add", Extents2D(), MatPadding::kPacked);
   if (add) {
-    add_storage = GenerateMat<float>(Extents2D(1, N), pool);
+    add_storage =
+        GenerateMat<float>(Extents2D(1, N), MatPadding::kPacked, pool);
     add_storage.SetScale(1.0f);
   }
 
-  MatStorageT<TA> a = GenerateMat<TA>(A_extents, pool);
-  MatStorageT<TB> b_trans = GenerateTransposedMat<TB>(B_extents, pool);
+  MatStorageT<TA> a = GenerateMat<TA>(A_extents, MatPadding::kOdd, pool);
+  MatStorageT<TB> b_trans =
+      GenerateTransposedMat<TB>(B_extents, MatPadding::kOdd, pool);
 
   const float* add_row = add ? add_storage.PackedScale1() : nullptr;
 
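A possible sanity check at these call sites, as a sketch only: it assumes MatStorageT exposes the IsPacked() accessor referenced in the MMImpl comment below, and that kOdd padding actually yields a non-packed stride for the benchmarked extents.

  // After generating inputs with MatPadding::kOdd, A (and B) should take the
  // padded fast path rather than the packed one.
  HWY_ASSERT(!a.IsPacked());
  HWY_ASSERT(!b_trans.IsPacked());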
@@ -1298,7 +1298,7 @@ struct MMImpl {
 // `K = B.Cols()`, which must match `A.Cols()`, is the number
 // of rows in the original B. `N = C.Cols()` must be a multiple of 4. There
 // are no other restrictions on shape, though performance is better when `M % 4
-// == 0` or `M <= 4`.
+// == 0` or `M <= 4`, and when A is padded (`!A.IsPacked()`).
 //
 // NOTE: if A and/or B are BF16 and padded, the interval `[Cols(),
 // hwy::RoundUpTo(Cols(), hn::Lanes(dbf))` must be zero-initialized to match
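To make the NOTE about BF16 padding concrete, here is a minimal static-dispatch Highway sketch that zeroes the interval [Cols(), hwy::RoundUpTo(Cols(), hn::Lanes(dbf))) for one row. The function name and the stride parameter are illustrative only; how gemma.cpp itself satisfies this requirement is not shown in this diff.

#include <cstddef>
#include <cstring>
#include <vector>

#include "hwy/base.h"     // hwy::bfloat16_t, hwy::RoundUpTo, HWY_ASSERT
#include "hwy/highway.h"  // hn::ScalableTag, hn::Lanes (static dispatch)

namespace hn = hwy::HWY_NAMESPACE;

// Zero the tail of one BF16 row, from `cols` up to the vector-rounded column
// count, as required for padded BF16 inputs to MatMul.
void ZeroBf16RowPadding(hwy::bfloat16_t* row, size_t cols, size_t stride) {
  const hn::ScalableTag<hwy::bfloat16_t> dbf;
  const size_t padded_cols = hwy::RoundUpTo(cols, hn::Lanes(dbf));
  HWY_ASSERT(padded_cols <= stride);  // padding must fit within the row stride
  // BF16 zero is all-zero bits, so memset suffices.
  std::memset(row + cols, 0, (padded_cols - cols) * sizeof(hwy::bfloat16_t));
}

int main() {
  const size_t cols = 10, stride = 64;  // assumed padded row stride
  std::vector<hwy::bfloat16_t> row(stride);
  ZeroBf16RowPadding(row.data(), cols, stride);
  return 0;
}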
@@ -219,14 +219,16 @@ void TestMatMul(size_t rows_ac, size_t cols_a_rows_b, size_t cols_bc, bool add,
   const Extents2D B_extents(cols_bc, cols_a_rows_b);  // already transposed
   const Extents2D C_extents(rows_ac, cols_bc);
 
-  MatStorageT<TA> A(GenerateMat<TA>(A_extents, pool));
-  MatStorageT<TB> BT(GenerateTransposedMat<TB>(B_extents, pool));
+  MatStorageT<TA> A(GenerateMat<TA>(A_extents, MatPadding::kOdd, pool));
+  // Must be packed because we call Span() on it.
+  MatStorageT<TB> BT(
+      GenerateTransposedMat<TB>(B_extents, MatPadding::kPacked, pool));
   MatStorageT<TC> C_slow("C_slow", C_extents, MatPadding::kOdd);
   MatStorageT<TC> C("C", C_extents, MatPadding::kOdd);
   C.AllocateAndAttachRowPtrs(env.row_ptrs);
 
   MatStorageT<float> add_storage =
-      add ? GenerateMat<float>(Extents2D(1, cols_bc), pool)
+      add ? GenerateMat<float>(Extents2D(1, cols_bc), MatPadding::kPacked, pool)
           : MatStorageT<float>("add", Extents2D(), MatPadding::kPacked);
   add_storage.SetScale(1.0f);
   const float* add_row = add ? add_storage.PackedScale1() : nullptr;
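The new "// Must be packed because we call Span() on it." comment is the constraint that forces kPacked for BT here. Below is a small, self-contained illustration (plain C++, not gemma.cpp's MatStorageT or Span) of why a flat view of row-padded storage does not match the logical matrix contents:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  const size_t rows = 2, cols = 3, stride = 4;  // stride > cols => padded rows
  std::vector<int> padded(rows * stride, -1);   // -1 marks padding slots
  for (size_t r = 0; r < rows; ++r) {
    for (size_t c = 0; c < cols; ++c) {
      padded[r * stride + c] = static_cast<int>(r * cols + c);
    }
  }

  // A packed-style flat view of the first rows*cols elements picks up padding:
  for (size_t i = 0; i < rows * cols; ++i) std::printf("%d ", padded[i]);
  std::printf("\n");  // prints: 0 1 2 -1 3 4  (the -1 is padding, not data)
  return 0;
}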