diff --git a/gemma/ops_test.cc b/gemma/ops_test.cc
index 5f3c45f..76bb9bc 100644
--- a/gemma/ops_test.cc
+++ b/gemma/ops_test.cc
@@ -13,6 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <memory>
 #ifndef HWY_DISABLED_TARGETS
 #define HWY_DISABLED_TARGETS HWY_SCALAR
 #endif
@@ -363,25 +364,6 @@ CompressedArray<MatT, kOuter * kInner> GenerateMat(size_t offset,
   return mat;
 }
 
-template <typename MatT, size_t kOuter, size_t kInner>
-CompressedArray<MatT, kOuter * kInner> GenerateTransposeMat(
-    size_t offset, hwy::ThreadPool& pool) {
-  gcpp::CompressWorkingSet ws;
-  CompressedArray<MatT, kOuter * kInner> mat;
-  std::array<float, kOuter * kInner> content;
-  const float scale = 1.0f / kInner;
-  pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) {
-    for (size_t j = 0; j < kInner; j++) {
-      content[i * kInner + j] =
-          static_cast<float>((j * kInner + i + offset) * scale);
-    }
-  });
-
-  Compress(content, ws, mat, pool);
-  mat.set_scale(1.0f);
-  return mat;
-}
-
 template <typename MatT, size_t kOuter, size_t kInner>
 CompressedArray<MatT, kOuter * kInner> GenerateZeroMat(hwy::ThreadPool& pool) {
   gcpp::CompressWorkingSet ws;
@@ -397,6 +379,72 @@ CompressedArray<MatT, kOuter * kInner> GenerateZeroMat(hwy::ThreadPool& pool) {
   return mat;
 }
 
+template <typename MatT, size_t kOuter, size_t kInner>
+std::unique_ptr<CompressedArray<MatT, kOuter * kInner>> GenerateMatHeap(
+    size_t offset, hwy::ThreadPool& pool) {
+  gcpp::CompressWorkingSet ws;
+  std::unique_ptr<CompressedArray<MatT, kOuter * kInner>> mat =
+      std::unique_ptr<CompressedArray<MatT, kOuter * kInner>>(
+          new CompressedArray<MatT, kOuter * kInner>);
+  hwy::AlignedFreeUniquePtr<float[]> content =
+      hwy::AllocateAligned<float>(kOuter * kInner);
+  const float scale = 1.0f / kInner;
+  pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) {
+    for (size_t j = 0; j < kInner; j++) {
+      content[i * kInner + j] =
+          static_cast<float>((i * kInner + j + offset) * scale);
+    }
+  });
+
+  Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0,
+           pool);
+  mat->set_scale(1.0f);
+  return mat;
+}
+
+template <typename MatT, size_t kOuter, size_t kInner>
+std::unique_ptr<CompressedArray<MatT, kOuter * kInner>>
+GenerateTransposeMatHeap(size_t offset, hwy::ThreadPool& pool) {
+  gcpp::CompressWorkingSet ws;
+  std::unique_ptr<CompressedArray<MatT, kOuter * kInner>> mat =
+      std::unique_ptr<CompressedArray<MatT, kOuter * kInner>>(
+          new CompressedArray<MatT, kOuter * kInner>);
+  hwy::AlignedFreeUniquePtr<float[]> content =
+      hwy::AllocateAligned<float>(kOuter * kInner);
+  const float scale = 1.0f / kInner;
+  pool.Run(0, kOuter, [&](const size_t i, size_t /*thread*/) {
+    for (size_t j = 0; j < kInner; j++) {
+      content[i * kInner + j] =
+          static_cast<float>((j * kInner + i + offset) * scale);
+    }
+  });
+
+  Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0,
+           pool);
+  mat->set_scale(1.0f);
+  return mat;
+}
+
+template <typename MatT, size_t kOuter, size_t kInner>
+std::unique_ptr<CompressedArray<MatT, kOuter * kInner>> GenerateZeroMatHeap(
+    hwy::ThreadPool& pool) {
+  gcpp::CompressWorkingSet ws;
+  std::unique_ptr<CompressedArray<MatT, kOuter * kInner>> mat =
+      std::unique_ptr<CompressedArray<MatT, kOuter * kInner>>(
+          new CompressedArray<MatT, kOuter * kInner>);
+  hwy::AlignedFreeUniquePtr<float[]> content =
+      hwy::AllocateAligned<float>(kOuter * kInner);
+
+  pool.Run(0, kOuter, [&](const size_t i, size_t thread) {
+    hwy::ZeroBytes(&content[i * kInner], kInner * sizeof(content[0]));
+  });
+
+  Compress(content.get(), kOuter * kInner, ws, kOuter * kInner, mat->data(), 0,
+           pool);
+  mat->set_scale(1.0f);
+  return mat;
+}
+
 template <typename T, size_t length>
 hwy::AlignedFreeUniquePtr<T[]> GenerateVec(size_t offset) {
   hwy::AlignedFreeUniquePtr<T[]> vec = hwy::AllocateAligned<T>(length);
@@ -484,17 +532,21 @@ void TestTiledMatMul() {
   constexpr size_t kN = 512;  // * 5;  // 6;  // 768
   constexpr size_t kK = 512;  // * 5;  // 640
 
-  CompressedArray<float, kM * kN> a = GenerateMat<float, kM, kN>(0, pool);
-  CompressedArray<float, kN * kK> b = GenerateMat<float, kN, kK>(0, pool);
-  CompressedArray<float, kM * kK> c_slow = GenerateZeroMat<float, kM, kK>(pool);
-  MatMulSlow<kM, kN, kK>(a.data(), b.data(), c_slow.data());
+  std::unique_ptr<CompressedArray<float, kM * kN>> a =
+      GenerateMatHeap<float, kM, kN>(0, pool);
+  std::unique_ptr<CompressedArray<float, kN * kK>> b =
+      GenerateMatHeap<float, kN, kK>(0, pool);
+  std::unique_ptr<CompressedArray<float, kM * kK>> c_slow =
+      GenerateZeroMatHeap<float, kM, kK>(pool);
+
+  MatMulSlow<kM, kN, kK>(a->data(), b->data(), c_slow->data());
 
   hwy::AlignedFreeUniquePtr<float[]> c = hwy::AllocateAligned<float>(kM * kK);
-  CompressedArray<float, kN * kK> b_trans =
-      GenerateTransposeMat<float, kN, kK>(0, pool);
-  MatMul_4x4<kM, kN, kK>(a.data(), b_trans.data(), c.get(), pool);
+  std::unique_ptr<CompressedArray<float, kN * kK>> b_trans =
+      GenerateTransposeMatHeap<float, kN, kK>(0, pool);
+  MatMul_4x4<kM, kN, kK>(a->data(), b_trans->data(), c.get(), pool);
 
-  AssertClose(c_slow.data(), c.get(), kM * kK);
+  AssertClose(c_slow->data(), c.get(), kM * kK);
 }
 
 void TestAllTiledMatMul() {