diff --git a/compression/test_util-inl.h b/compression/test_util-inl.h index bddac6b..3af8e78 100644 --- a/compression/test_util-inl.h +++ b/compression/test_util-inl.h @@ -69,45 +69,51 @@ void ForeachPackedAndRawType() { // Generates inputs: deterministic, within max SfpStream range. template -MatStorageT GenerateMat(const Extents2D& extents, hwy::ThreadPool& pool) { +MatStorageT GenerateMat(const Extents2D& extents, MatPadding padding, + hwy::ThreadPool& pool) { gcpp::CompressWorkingSet ws; + ws.tls.resize(pool.NumWorkers()); MatStorageT raw("raw", extents, MatPadding::kPacked); - MatStorageT compressed("mat", extents, MatPadding::kPacked); + MatStorageT compressed("mat", extents, padding); const float scale = SfpStream::kMax / extents.Area(); - pool.Run(0, extents.rows, [&](const size_t r, size_t /*thread*/) { + pool.Run(0, extents.rows, [&](const size_t r, size_t thread) { float* HWY_RESTRICT row = raw.Row(r); for (size_t c = 0; c < extents.cols; c++) { float f = static_cast(r * extents.cols + c) * scale; if ((r + c) & 1) f = -f; // Also generate some negative values. row[c] = f; } + Compress(raw.Row(r), raw.Cols(), ws.tls[thread], + MakeSpan(compressed.Row(r), compressed.Cols()), + /*packed_ofs=*/0); }); - Compress(raw.PackedScale1(), raw.Extents().Area(), ws, compressed.Span(), - /*packed_ofs=*/0, pool); compressed.SetScale(0.6f); // Arbitrary value, different from 1. return compressed; } -// `extents` describes the transposed matrix. +// Same, but `extents` describes the transposed matrix. template MatStorageT GenerateTransposedMat(const Extents2D extents, + MatPadding padding, hwy::ThreadPool& pool) { gcpp::CompressWorkingSet ws; + ws.tls.resize(pool.NumWorkers()); MatStorageT raw("raw", extents, MatPadding::kPacked); - MatStorageT compressed("trans", extents, MatPadding::kPacked); + MatStorageT compressed("trans", extents, padding); const float scale = SfpStream::kMax / extents.Area(); - pool.Run(0, extents.rows, [&](const size_t r, size_t /*thread*/) { + pool.Run(0, extents.rows, [&](const size_t r, size_t thread) { float* HWY_RESTRICT row = raw.Row(r); for (size_t c = 0; c < extents.cols; c++) { float f = static_cast(c * extents.rows + r) * scale; if ((r + c) & 1) f = -f; // Also generate some negative values. row[c] = f; } + Compress(raw.Row(r), raw.Cols(), ws.tls[thread], + MakeSpan(compressed.Row(r), compressed.Cols()), + /*packed_ofs=*/0); }); - Compress(raw.PackedScale1(), raw.Extents().Area(), ws, compressed.Span(), - /*packed_ofs=*/0, pool); // Arbitrary value, different from 1, must match `GenerateMat`. compressed.SetScale(0.6f); return compressed; diff --git a/ops/bench_matmul.cc b/ops/bench_matmul.cc index 3de3b76..7f0a587 100644 --- a/ops/bench_matmul.cc +++ b/ops/bench_matmul.cc @@ -89,12 +89,14 @@ void BenchMatMul(size_t M, size_t K, size_t N, bool add, MatMulEnv& env) { MatStorageT add_storage("add", Extents2D(), MatPadding::kPacked); if (add) { - add_storage = GenerateMat(Extents2D(1, N), pool); + add_storage = + GenerateMat(Extents2D(1, N), MatPadding::kPacked, pool); add_storage.SetScale(1.0f); } - MatStorageT a = GenerateMat(A_extents, pool); - MatStorageT b_trans = GenerateTransposedMat(B_extents, pool); + MatStorageT a = GenerateMat(A_extents, MatPadding::kOdd, pool); + MatStorageT b_trans = + GenerateTransposedMat(B_extents, MatPadding::kOdd, pool); const float* add_row = add ? add_storage.PackedScale1() : nullptr; diff --git a/ops/matmul-inl.h b/ops/matmul-inl.h index 1e59165..9d67a10 100644 --- a/ops/matmul-inl.h +++ b/ops/matmul-inl.h @@ -1298,7 +1298,7 @@ struct MMImpl { // `K = B.Cols()`, which must match `A.Cols()`, is the number // of rows in the original B. `N = C.Cols()` must be a multiple of 4. There // are no other restrictions on shape, though performance is better when `M % 4 -// == 0` or `M <= 4`. +// == 0` or `M <= 4`, and when A is padded (`!A.IsPacked()`). // // NOTE: if A and/or B are BF16 and padded, the interval `[Cols(), // hwy::RoundUpTo(Cols(), hn::Lanes(dbf))` must be zero-initialized to match diff --git a/ops/matmul_test.cc b/ops/matmul_test.cc index fb33fb4..3885534 100644 --- a/ops/matmul_test.cc +++ b/ops/matmul_test.cc @@ -219,14 +219,16 @@ void TestMatMul(size_t rows_ac, size_t cols_a_rows_b, size_t cols_bc, bool add, const Extents2D B_extents(cols_bc, cols_a_rows_b); // already transposed const Extents2D C_extents(rows_ac, cols_bc); - MatStorageT A(GenerateMat(A_extents, pool)); - MatStorageT BT(GenerateTransposedMat(B_extents, pool)); + MatStorageT A(GenerateMat(A_extents, MatPadding::kOdd, pool)); + // Must be packed because we call Span() on it. + MatStorageT BT( + GenerateTransposedMat(B_extents, MatPadding::kPacked, pool)); MatStorageT C_slow("C_slow", C_extents, MatPadding::kOdd); MatStorageT C("C", C_extents, MatPadding::kOdd); C.AllocateAndAttachRowPtrs(env.row_ptrs); MatStorageT add_storage = - add ? GenerateMat(Extents2D(1, cols_bc), pool) + add ? GenerateMat(Extents2D(1, cols_bc), MatPadding::kPacked, pool) : MatStorageT("add", Extents2D(), MatPadding::kPacked); add_storage.SetScale(1.0f); const float* add_row = add ? add_storage.PackedScale1() : nullptr;