Fix loop iteration in GeluMulToBF16

Also attempt to speed up the CI builders by running the build in parallel (-j 4)

PiperOrigin-RevId: 613092863
This commit is contained in:
Jan Wassenberg 2024-03-05 23:00:09 -08:00 committed by Copybara-Service
parent c8b9675898
commit 3cdd5e524a
2 changed files with 2 additions and 2 deletions

View File

@@ -44,7 +44,7 @@ jobs:
-D CMAKE_CXX_COMPILER_LAUNCHER=ccache
- name: Build
run: cmake --build ${{ github.workspace }}/build --preset ${{ matrix.preset }} --config ${{ matrix.build_type }}
run: cmake --build ${{ github.workspace }}/build --preset ${{ matrix.preset }} --config ${{ matrix.build_type }} -j 4
- name: Archive production artifacts
uses: actions/upload-artifact@v4

2
ops.h
View File

@@ -241,7 +241,7 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void GeluMulToBF16(
size_t i = 0;
if (size >= 2 * NF) {
for (; i < size - 2 * NF; i += 2 * NF) {
for (; i <= size - 2 * NF; i += 2 * NF) {
const VF mul0 = hn::LoadU(df, mul + i);
const VF mul1 = hn::LoadU(df, mul + i + NF);
const VF g0 = hn::Mul(mul0, Gelu(df, hn::LoadU(df, gelu_in + i)));