mirror of https://github.com/google/gemma.cpp.git
parent
6eeef2e2d9
commit
28ca001d5e
15
gemma/ops.h
15
gemma/ops.h
|
|
@ -93,6 +93,21 @@ HWY_INLINE constexpr size_t RowsPerStrip() {
|
|||
return kRowsPerStrip;
|
||||
}
|
||||
|
||||
// Largely unoptimized; reordered innermost loops nets ~5-10X speedup on
|
||||
// ops_test across instruction sets.
|
||||
template <size_t kM, size_t kN, size_t kK>
|
||||
HWY_INLINE void MatMul(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
|
||||
float* HWY_RESTRICT out) {
|
||||
int i, j, k;
|
||||
for (i = 0; i < kM; ++i) {
|
||||
for (k = 0; k < kN; ++k) {
|
||||
for (j = 0; j < kK; ++j) {
|
||||
out[i * kK + j] += a[i * kN + k] * b[k * kK + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
HWY_INLINE void ToEvenOddF32(const hwy::bfloat16_t* HWY_RESTRICT vec_aligned,
|
||||
const size_t size, float* HWY_RESTRICT out) {
|
||||
const hn::ScalableTag<float> df;
|
||||
|
|
|
|||
|
|
@ -17,6 +17,8 @@
|
|||
#define HWY_DISABLED_TARGETS HWY_SCALAR
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
#include <stdio.h>

#include <algorithm>
#include <array>
#include <cmath>
#include <random>
|
||||
|
|
@ -376,6 +378,25 @@ CompressedArray<float, kOuter * kInner> GenerateMat(size_t offset) {
|
|||
return mat;
|
||||
}
|
||||
|
||||
template <size_t kOuter, size_t kInner>
|
||||
CompressedArray<float, kOuter * kInner> GenerateZeroMat(size_t offset) {
|
||||
hwy::ThreadPool pool(static_cast<size_t>(std::clamp(
|
||||
static_cast<int>(std::thread::hardware_concurrency()) - 2, 1, 4)));
|
||||
gcpp::CompressWorkingSet ws;
|
||||
CompressedArray<float, kOuter * kInner> mat;
|
||||
std::array<float, kOuter * kInner> content;
|
||||
|
||||
pool.Run(0, kOuter, [&](const size_t i, size_t thread) {
|
||||
for (size_t j = 0; j < kInner; j++) {
|
||||
content[i * kInner + j] = 0.0f;
|
||||
}
|
||||
});
|
||||
|
||||
Compress(content, ws, mat, pool);
|
||||
mat.set_scale(1.0f);
|
||||
return mat;
|
||||
}
|
||||
|
||||
template <size_t length>
|
||||
hwy::AlignedFreeUniquePtr<float[]> GenerateVec(size_t offset) {
|
||||
hwy::AlignedFreeUniquePtr<float[]> vec = hwy::AllocateAligned<float>(length);
|
||||
|
|
@ -386,6 +407,25 @@ hwy::AlignedFreeUniquePtr<float[]> GenerateVec(size_t offset) {
|
|||
return vec;
|
||||
}
|
||||
|
||||
// A simple matrix multiplication. No optimization / tiling.
|
||||
template <size_t kM, size_t kN, size_t kK>
|
||||
hwy::AlignedFreeUniquePtr<float[]> SimpleMatMul(
|
||||
const hwy::AlignedFreeUniquePtr<float[]>& a,
|
||||
const hwy::AlignedFreeUniquePtr<float[]>& b) {
|
||||
hwy::AlignedFreeUniquePtr<float[]> out = hwy::AllocateAligned<float>(kM * kK);
|
||||
hwy::ZeroBytes(out.get(), kM * kK * sizeof(float));
|
||||
|
||||
int i, j, k;
|
||||
for (i = 0; i < kM; ++i) {
|
||||
for (j = 0; j < kK; ++j) {
|
||||
for (k = 0; k < kN; ++k) {
|
||||
out[i * kK + j] += a[i * kN + k] * b[k * kK + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <size_t kOuter, size_t kInner>
|
||||
hwy::AlignedFreeUniquePtr<float[]> SimpleMatVecAdd(
|
||||
const CompressedArray<float, kOuter * kInner>& mat,
|
||||
|
|
@ -417,6 +457,52 @@ void AssertClose(const hwy::AlignedFreeUniquePtr<float[]>& a,
|
|||
}
|
||||
}
|
||||
|
||||
template <typename MatT>
|
||||
void AssertClose(const hwy::AlignedFreeUniquePtr<MatT[]>& expected,
|
||||
const hwy::AlignedFreeUniquePtr<MatT[]>& actual, size_t num) {
|
||||
for (size_t idx = 0; idx < num; idx++) {
|
||||
double expected_value = hwy::ConvertScalarTo<double>(expected[idx]);
|
||||
double actual_value = hwy::ConvertScalarTo<double>(actual[idx]);
|
||||
|
||||
const double tolerance =
|
||||
expected_value * 20 * 1.0 / (1ULL << hwy::MantissaBits<MatT>());
|
||||
|
||||
if (!(expected_value - tolerance <= actual_value &&
|
||||
actual_value <= expected_value + tolerance)) {
|
||||
fprintf(stderr, "expected[%lu]: %f, actual[%lu]: %f\n", idx,
|
||||
expected_value, idx, actual_value);
|
||||
HWY_ASSERT(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestMatMul() {
|
||||
hwy::ThreadPool pool(0);
|
||||
constexpr size_t kM = 128 * 3; // 384
|
||||
constexpr size_t kK = 128 * 5; // 640
|
||||
constexpr size_t kN = 128 * 6; // 768
|
||||
|
||||
CompressedArray<float, kM * kN> a1 = GenerateMat<kM, kN>(0);
|
||||
CompressedArray<float, kN * kK> b1 = GenerateMat<kN, kK>(0);
|
||||
|
||||
hwy::AlignedFreeUniquePtr<float[]> a = hwy::AllocateAligned<float>(kM * kN);
|
||||
Decompress(a1, 0, a.get(), kM * kN);
|
||||
|
||||
hwy::AlignedFreeUniquePtr<float[]> b = hwy::AllocateAligned<float>(kN * kK);
|
||||
Decompress(b1, 0, b.get(), kN * kK);
|
||||
|
||||
hwy::AlignedFreeUniquePtr<float[]> expected_out1 =
|
||||
SimpleMatMul<kM, kN, kK>(a, b);
|
||||
|
||||
CompressedArray<float, kM * kK> compressed_c = GenerateZeroMat<kM, kK>(0);
|
||||
hwy::AlignedFreeUniquePtr<float[]> c = hwy::AllocateAligned<float>(kM * kK);
|
||||
Decompress(compressed_c, 0, c.get(), kM * kK);
|
||||
|
||||
MatMul<kM, kN, kK>(a.get(), b.get(), c.get());
|
||||
|
||||
AssertClose(expected_out1, c, kM * kK);
|
||||
}
|
||||
|
||||
void TestMatVecAdd() {
|
||||
hwy::ThreadPool pool(0);
|
||||
constexpr size_t kOuter = 128 * 3;
|
||||
|
|
@ -518,6 +604,7 @@ HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConst);
|
|||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllMulByConstAndAdd);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllSoftmax);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestAllCreateDistribution);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestMatMul);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestMatVecAdd);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestTwoMatVecAdd);
|
||||
HWY_EXPORT_AND_TEST_P(OpsTest, TestTwoOfsMatVecAddLoop);
|
||||
|
|
|
|||
Loading…
Reference in New Issue