diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 06f4dfa..a0e9dc2 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -44,7 +44,7 @@ jobs:
         -D CMAKE_CXX_COMPILER_LAUNCHER=ccache
 
     - name: Build
-      run: cmake --build ${{ github.workspace }}/build --preset ${{ matrix.preset }} --config ${{ matrix.build_type }}
+      run: cmake --build ${{ github.workspace }}/build --preset ${{ matrix.preset }} --config ${{ matrix.build_type }} -j 4
 
     - name: Archive production artifacts
       uses: actions/upload-artifact@v4
diff --git a/README.md b/README.md
index 1d8d282..b0b9aad 100644
--- a/README.md
+++ b/README.md
@@ -65,15 +65,26 @@ winget install --id Kitware.CMake
 winget install --id Microsoft.VisualStudio.2022.BuildTools --force --override "--passive --wait --add Microsoft.VisualStudio.Workload.VCTools;installRecommended --add Microsoft.VisualStudio.Component.VC.Llvm.Clang --add Microsoft.VisualStudio.Component.VC.Llvm.ClangToolset"
 ```
 
-### Step 1: Obtain model weights and tokenizer from Kaggle
+### Step 1: Obtain model weights and tokenizer from Kaggle or Hugging Face Hub
 
 Visit [the Gemma model page on
-Kaggle](https://www.kaggle.com/models/google/gemma) and select `Model Variations
+Kaggle](https://www.kaggle.com/models/google/gemma/frameworks/gemmaCpp) and select `Model Variations
 |> Gemma C++`. On this tab, the `Variation` dropdown includes the options below.
 Note bfloat16 weights are higher fidelity, while 8-bit switched floating point
 weights enable faster inference. In general, we recommend starting with the
 `-sfp` checkpoints.
 
+Alternatively, visit the [gemma.cpp](https://huggingface.co/models?other=gemma.cpp)
+models on the Hugging Face Hub. First go to the model repository of the model of interest
+(see recommendations below). Then, click the `Files and versions` tab and download the
+model and tokenizer files. For programmatic downloading, if you have `huggingface_hub`
+installed, you can also download by running:
+
+```
+huggingface-cli login # Just the first time
+huggingface-cli download google/gemma-2b-sfp-cpp --local-dir build/
+```
+
 2B instruction-tuned (`it`) and pre-trained (`pt`) models:
 
 | Model name  | Description |
@@ -98,6 +109,8 @@ weights enable faster inference. In general, we recommend starting with the
 
 ### Step 2: Extract Files
 
+If you downloaded the models from Hugging Face, skip to step 3.
+
 After filling out the consent form, the download should proceed to retrieve a
 tar archive file `archive.tar.gz`. Extract files from `archive.tar.gz` (this can
 take a few minutes):
diff --git a/ops.h b/ops.h
index 179001c..4ac64b8 100644
--- a/ops.h
+++ b/ops.h
@@ -241,7 +241,7 @@ static HWY_NOINLINE HWY_MAYBE_UNUSED void GeluMulToBF16(
 
   size_t i = 0;
   if (size >= 2 * NF) {
-    for (; i < size - 2 * NF; i += 2 * NF) {
+    for (; i <= size - 2 * NF; i += 2 * NF) {
       const VF mul0 = hn::LoadU(df, mul + i);
       const VF mul1 = hn::LoadU(df, mul + i + NF);
       const VF g0 = hn::Mul(mul0, Gelu(df, hn::LoadU(df, gelu_in + i)));