From 3cdd5e524a4c74de49e936cb5d9580d56aacfe07 Mon Sep 17 00:00:00 2001 From: Jan Wassenberg Date: Tue, 5 Mar 2024 23:00:09 -0800 Subject: [PATCH] Fix loop iteration in GeluMulToBF16 Also attempt to speed up builders (parallel) PiperOrigin-RevId: 613092863 --- .github/workflows/build.yml | 2 +- ops.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 06f4dfa..a0e9dc2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -44,7 +44,7 @@ jobs: -D CMAKE_CXX_COMPILER_LAUNCHER=ccache - name: Build - run: cmake --build ${{ github.workspace }}/build --preset ${{ matrix.preset }} --config ${{ matrix.build_type }} + run: cmake --build ${{ github.workspace }}/build --preset ${{ matrix.preset }} --config ${{ matrix.build_type }} -j 4 - name: Archive production artifacts uses: actions/upload-artifact@v4 diff --git a/ops.h b/ops.h index 8f92d82..3725776 100644 --- a/ops.h +++ b/ops.h @@ -241,7 +241,7 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void GeluMulToBF16( size_t i = 0; if (size >= 2 * NF) { - for (; i < size - 2 * NF; i += 2 * NF) { + for (; i <= size - 2 * NF; i += 2 * NF) { const VF mul0 = hn::LoadU(df, mul + i); const VF mul1 = hn::LoadU(df, mul + i + NF); const VF g0 = hn::Mul(mul0, Gelu(df, hn::LoadU(df, gelu_in + i)));